# Wellness news classifier using OpenAI text embeddings (text-embedding-3-small by default; text-embedding-ada-002 optional)

In [1]:
!pip install openai

Collecting openai
  Downloading openai-1.30.3-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.5 ht

In [2]:
# Attach Google Drive to this Colab runtime; the dataset loaded later in the
# notebook is read from a path underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
import getpass
import os

import numpy as np
import openai
import pandas as pd
from openai import OpenAI
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split




# --- Configuration -----------------------------------------------------------
# One seed for every random step in this section, so the balanced sample and
# the train/test split are the same on every Restart & Run All.
SEED = 42
DATA_PATH = "/content/drive/MyDrive/Supervised_Final_Project/news_category_trainingdata.json"

# Do not hard-code the API key in the notebook. A key that is already present
# in the environment is left untouched; only when none is set is the user
# prompted for one (the input is not echoed).
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key: ')

# Load the data
reviews = pd.read_json(DATA_PATH)

# Prepare the data
# Missing headline/description values are replaced by empty strings so the
# concatenation always yields text; a NaN here would otherwise be passed on
# as the embedding input further down.
reviews['combined_text'] = (
    reviews['headline'].fillna('') + ' ' + reviews['short_description'].fillna('')
)
# Binary target: 1 for the two wellness-related categories, 0 for everything else.
reviews['wellness'] = np.where(reviews['category'].isin(['HEALTHY LIVING', 'WELLNESS']), 1, 0)

total_wellness = int((reviews['wellness'] == 1).sum())
total_not_wellness = int((reviews['wellness'] == 0).sum())

print(f"Total number of 'wellness' rows: {total_wellness}")
print(f"Total number of 'not_wellness' rows: {total_not_wellness}")

# Balance the data: draw the same number of rows from each class.
# Every sampled row costs one embeddings request later on, so keep this small
# while iterating (a larger value such as 20000 can be used for a full run).
sample_amount = 1000
wellness = reviews[reviews['wellness'] == 1].sample(n=sample_amount, random_state=SEED)
not_wellness = reviews[reviews['wellness'] == 0].sample(n=sample_amount, random_state=SEED)
review_sample = pd.concat([wellness, not_wellness])

# Train-test split (stratified so both splits keep the 50/50 class balance).
train_df, test_df = train_test_split(
    review_sample,
    test_size=0.1,
    stratify=review_sample['wellness'],
    random_state=SEED,
)

# Set the OpenAI API key on the module as before, then create the client used
# for all embedding requests.
openai.api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI()

# Function to get embeddings from OpenAI
def get_embedding(client, text, model="text-embedding-3-small"):
    """Return the embedding vector for a single piece of text.

    Sends one embeddings request through ``client`` and returns the vector of
    the first item in the response.

    Args:
        client: Object exposing ``embeddings.create(input=..., model=...)``,
            such as an ``OpenAI()`` client instance.
        text: The text to embed.
        model: Name of the embedding model to use. Defaults to
            ``"text-embedding-3-small"``, the value that used to be
            hard-coded here; ``"text-embedding-3-large"`` and
            ``"text-embedding-ada-002"`` are the alternatives noted by the
            original author.

    Returns:
        The ``embedding`` attribute of the first entry in ``response.data``.
    """
    response = client.embeddings.create(input=text, model=model)
    return response.data[0].embedding

# Embed every document (one API request per row) and collect the vectors into
# NumPy arrays that serve as the classifier's feature matrices.
def _embed_texts(texts):
    """Return an array with one embedding per text, in iteration order."""
    return np.array([get_embedding(client, text) for text in texts])

train_embeddings = _embed_texts(train_df['combined_text'])
test_embeddings = _embed_texts(test_df['combined_text'])

# Fit a random forest on the training embeddings.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
print(clf)
train_labels = train_df['wellness']
clf.fit(train_embeddings, train_labels)

# Score the held-out rows and print per-class precision/recall/F1.
test_preds = clf.predict(test_embeddings)
report = classification_report(
    test_df['wellness'],
    test_preds,
    target_names=['NOT WELLNESS', 'WELLNESS'],
)
print(report)






Total number of 'wellness' rows: 24521
Total number of 'not_wellness' rows: 176332
RandomForestClassifier(random_state=42)
              precision    recall  f1-score   support

NOT WELLNESS       0.88      0.92      0.90       100
    WELLNESS       0.92      0.88      0.90       100

    accuracy                           0.90       200
   macro avg       0.90      0.90      0.90       200
weighted avg       0.90      0.90      0.90       200

