# Classifying wellness news with OpenAI text embeddings (ADA / text-embedding-3 models)

In [1]:
!pip install openai



In [2]:
# Attach Google Drive to this Colab runtime; the training data is read
# from a path under this mount point further down.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
from getpass import getpass

import numpy as np
import openai
import pandas as pd
from openai import OpenAI
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split




# --- Configuration ---------------------------------------------------------
SEED = 42  # same value the RandomForestClassifier further down is seeded with
DATA_PATH = "/content/drive/MyDrive/Supervised_Final_Project/news_category_trainingdata.json"

# Credentials: do not write the key into the notebook. Reuse OPENAI_API_KEY when
# the runtime already provides it; otherwise prompt for it without echoing.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

# Load the data
reviews = pd.read_json(DATA_PATH)

# Prepare the data
# fillna('') keeps a row usable when one of the two fields is missing; without it
# the string concatenation yields NaN instead of text for that row.
reviews['combined_text'] = (
    reviews['headline'].fillna('') + ' ' + reviews['short_description'].fillna('')
)
# Binary target: 1 for the two wellness-related categories, 0 for everything else.
reviews['wellness'] = np.where(reviews['category'].isin(['HEALTHY LIVING', 'WELLNESS']), 1, 0)

total_wellness = reviews[reviews['wellness'] == 1].shape[0]
total_not_wellness = reviews[reviews['wellness'] == 0].shape[0]

print(f"Total number of 'wellness' rows: {total_wellness}")
print(f"Total number of 'not_wellness' rows: {total_not_wellness}")

# Balance the data: draw the same number of rows from each class.
# Seeding the draws makes the sample (and therefore the reported metrics)
# reproducible across kernel restarts.
sample_amount = 20000
wellness = reviews[reviews['wellness'] == 1].sample(n=sample_amount, random_state=SEED)
not_wellness = reviews[reviews['wellness'] == 0].sample(n=sample_amount, random_state=SEED)
review_sample = pd.concat([wellness, not_wellness])

# Train-test split (90/10, stratified on the label, seeded for reproducibility)
train_df, test_df = train_test_split(
    review_sample,
    test_size=0.1,
    stratify=review_sample['wellness'],
    random_state=SEED,
)

# Set your OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")

# No key is passed explicitly; the client is built after OPENAI_API_KEY is set above.
client = OpenAI()

# Function to get embeddings from OpenAI
def get_embedding(client, text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text`` from the embeddings endpoint.

    Args:
        client: Object exposing ``embeddings.create(input=..., model=...)``;
            this notebook passes an ``openai.OpenAI`` instance.
        text: The text to embed; forwarded unchanged as ``input``.
        model: Name of the embedding model. Defaults to
            ``"text-embedding-3-small"``; other options this notebook has
            considered are ``"text-embedding-3-large"`` and
            ``"text-embedding-ada-002"``.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    response = client.embeddings.create(input=text, model=model)
    return response.data[0].embedding

def _embed_column(frame):
    """Embed every ``combined_text`` entry of ``frame`` and stack the results.

    Issues one ``get_embedding`` call per row, in row order.
    """
    vectors = []
    for entry in frame['combined_text']:
        vectors.append(get_embedding(client, entry))
    return np.array(vectors)


# Embed both splits of the data
train_embeddings = _embed_column(train_df)
test_embeddings = _embed_column(test_df)

# Fit a random forest on the training embeddings
clf = RandomForestClassifier(random_state=42, n_estimators=100)
print(clf)
clf.fit(train_embeddings, train_df['wellness'])

# Score the held-out split and print per-class metrics
label_names = ['NOT WELLNESS', 'WELLNESS']
test_preds = clf.predict(test_embeddings)
report = classification_report(
    test_df['wellness'],
    test_preds,
    target_names=label_names,
)
print(report)






Total number of 'wellness' rows: 24521
Total number of 'not_wellness' rows: 176332
RandomForestClassifier(random_state=42)
              precision    recall  f1-score   support

NOT WELLNESS       0.88      0.90      0.89      2000
    WELLNESS       0.90      0.88      0.89      2000

    accuracy                           0.89      4000
   macro avg       0.89      0.89      0.89      4000
weighted avg       0.89      0.89      0.89      4000

