Install requirements: pycaret, openai, tiktoken.
Get all necessary imports.

In [None]:
! pip install pycaret
! pip install openai
! pip install tiktoken
import pandas as pd
import numpy as np
from ast import literal_eval

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import tiktoken
from openai import OpenAI
import os

Set up client and API key.
The API key can be found in the submission comments. If there are any errors, email me: v.vasile.1@student.rug.nl
The API key is tied to my account - I topped it up with 10-20 euros in case the grader wants to test the models. Computing the embeddings is extremely quick, so it should not take longer than a minute.

In [2]:
# Security: the API key must never be hardcoded in the notebook. It is read from
# the OPENAI_API_KEY environment variable; when that is not set, the user is
# prompted for it (input is not echoed), so the secret is never saved in the file.
# NOTE(review): the key that used to be embedded in this cell was published with
# the notebook and should be revoked and replaced.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass  # local import: only needed for the fallback prompt

    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

# No key is passed explicitly; the client is expected to pick it up from the
# OPENAI_API_KEY environment variable set above or by the user's shell.
client = OpenAI()

Load the encoding necessary for the GPT embeddings model. The only option is ada (text-embedding-ada-002).

In [3]:
# Embedding model requested from the OpenAI API.
embedding_model = "text-embedding-ada-002"
# Name of the tiktoken encoding that belongs to text-embedding-ada-002.
embedding_encoding = "cl100k_base"

# text-embedding-ada-002 accepts at most 8191 tokens per input. None of the
# reviews is longer than 8000 tokens, so 8000 is used as the cut-off.
max_tokens = 8_000

Load the Train subset of the Movie Review Dataset

In [4]:
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper
# is needed. The file is expected in the notebook's working directory.
df_train = pd.read_csv("train_data.csv")

Get the encodings using tiktoken from the cl100k_base

In [5]:
# Look up the tokenizer by its name; it is used to count the tokens of each review.
encoding = tiktoken.get_encoding(encoding_name=embedding_encoding)

Get the number of tokens for each of the reviews in the training data

In [6]:
# Token count per review: encode the prompt text and take the length of the result.
df_train["n_tokens"] = df_train["prompt"].apply(
    lambda review: len(encoding.encode(review))
)

Show the number of tokens

In [7]:
# Preview the token counts of the first five reviews.
df_train["n_tokens"].head(n=5)

Unnamed: 0,n_tokens
0,390
1,206
2,206
3,175
4,291


In [8]:
# Maximum number of rows to keep (taken from the end of the frame).
top_n = 8000

# Keep only reviews within the token budget, then take the last `top_n` of them.
# .copy() makes the result an independent frame, so the columns assigned to it in
# later cells are not written to a slice of the original frame
# (avoids pandas' SettingWithCopyWarning).
within_budget = df_train["n_tokens"] <= max_tokens
df_train = df_train.loc[within_budget].tail(top_n).copy()

Extract the embeddings for the reviews. An embedding is a vector of floating-point numbers; each review is transformed into one such vector. The vectors are then stored in a new column called `embedding`.

In [9]:
def create_embeddings(text):
    """Request an embedding for ``text`` and return it.

    The text is sent to ``client.embeddings.create`` using the model named in
    ``embedding_model``; both names are module-level globals.

    Args:
        text: The input passed to the embeddings endpoint as ``input``.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    api_result = client.embeddings.create(model=embedding_model, input=text)
    first_item = api_result.data[0]
    return first_item.embedding

# One API request per review; the returned vector is stored next to its prompt.
df_train["embedding"] = df_train["prompt"].apply(create_embeddings)

Save a csv of the embeddings for later usage

In [None]:
# Persist the training frame, including the embedding column, for later reuse.
# NOTE(review): CSV stores each embedding as text, so it has to be parsed again
# (e.g. with literal_eval) when the file is read back.
df_train.to_csv(path_or_buf="sentiment_embedding_train.csv")

Convert the embeddings vectors to numpy arrays for the model training

In [11]:
# Turn every stored embedding into a NumPy array.
df_train["embedding"] = df_train["embedding"].apply(lambda vector: np.array(vector))

Split the training data into the inputs and ground truths.

In [12]:
# Inputs are the embedding vectors; ground truths come from the `completion` column.
df_train_X, df_train_y = df_train["embedding"], df_train["completion"]
print(df_train_X)

0       [0.024462953209877014, -0.030525043606758118, ...
1       [0.0038094427436590195, -0.005578480660915375,...
2       [0.0042413328774273396, -0.016910823062062263,...
3       [-0.011880503036081791, -0.015930674970149994,...
4       [0.0031811397057026625, -0.022765344008803368,...
                              ...                        
3995    [-0.014199494384229183, -0.021848689764738083,...
3996    [-0.011443705298006535, -0.00418713828548789, ...
3997    [0.005860462319105864, -0.01622398942708969, 0...
3998    [-0.0029704624321311712, -0.018338633701205254...
3999    [0.014715444296598434, -0.0031127561815083027,...
Name: embedding, Length: 4000, dtype: object


For the RandomForestClassifier to accept the embedding arrays, they must first be stacked into a single 2-D array

In [13]:
# Stack the per-review vectors along a new first axis: one row per review.
df_train_X = np.stack(df_train_X, axis=0)


Make sure shapes match

In [14]:
# (number of reviews, embedding length)
df_train_X.shape

(4000, 1536)

In [15]:
# np.asarray converts the label Series to a NumPy array and, unlike calling
# .to_numpy() on the variable itself, still works when this cell is run again
# (df_train_y is then already an ndarray and is returned unchanged).
df_train_y = np.asarray(df_train_y)
# Must have one label per row of df_train_X.
np.shape(df_train_y)

(4000,)

Load the Evaluation subset. Repeat the previous steps

In [16]:
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper
# is needed. The file is expected in the notebook's working directory.
df_test = pd.read_csv("test_data.csv")

In [17]:
# Token count per evaluation review, computed the same way as for the training data.
# NOTE(review): no max_tokens filter on df_test is visible in this notebook, so an
# over-long review would only surface as an error from the embeddings request.
df_test["n_tokens"] = df_test["prompt"].apply(
    lambda review: len(encoding.encode(review))
)

In [18]:
# Display the evaluation frame, which now also carries the n_tokens column.
df_test

Unnamed: 0,prompt,completion,n_tokens
0,A movie theater with a bad history of past gru...,2,358
1,The first time I saw this film I wanted to li...,1,459
2,I have watched some pretty poor films in the p...,2,228
3,The fact that a film is on DVD doesn t guarant...,2,877
4,I m not a huge Star Trek fan but I was lookin...,2,157
...,...,...,...
495,An interesting slasher film with multiple susp...,2,164
496,i watched this series when it first came out i...,1,378
497,Once again Jet Li brings his charismatic prese...,1,294
498,I rented this movie after hearing Chris Gore ...,2,734


In [19]:
# One API request per evaluation review; the vector is stored next to its prompt.
df_test["embedding"] = df_test["prompt"].apply(create_embeddings)

In [20]:
# Persist the evaluation frame, including the embedding column, for later reuse.
# NOTE(review): CSV stores each embedding as text, so it has to be parsed again
# (e.g. with literal_eval) when the file is read back.
df_test.to_csv(path_or_buf="sentiment_embedding_test.csv")

In [22]:
# Turn every stored embedding into a NumPy array.
df_test["embedding"] = df_test["embedding"].apply(lambda vector: np.array(vector))

In [23]:
# Inputs are the embedding vectors; ground truths come from the `completion` column.
df_test_X, df_test_y = df_test["embedding"], df_test["completion"]
df_test_X

Unnamed: 0,embedding
0,"[-0.014765665866434574, -0.028700243681669235,..."
1,"[0.0016441266052424908, -0.011905062012374401,..."
2,"[-0.01963004097342491, -0.02953287586569786, -..."
3,"[-0.016225965693593025, 0.003569712396711111, ..."
4,"[0.015576953999698162, -0.019757483154535294, ..."
...,...
495,"[-0.016666322946548462, -0.00815661158412695, ..."
496,"[-0.018328610807657242, -0.02107447013258934, ..."
497,"[-0.024762436747550964, -0.028390729799866676,..."
498,"[-0.0009974512504413724, -0.011303873732686043..."


In [24]:
# Stack the per-review vectors along a new first axis: one row per review.
df_test_X = np.stack(df_test_X, axis=0)
# (number of reviews, embedding length)
df_test_X.shape

(500, 1536)

Using sklearn, train a RandomForestClassifier model

In [25]:
# Random forest with 100 trees; random_state is fixed so runs are repeatable.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
# Fit on the stacked training embeddings and their labels.
rf.fit(X=df_train_X, y=df_train_y)


Generate the predictions using the trained RF

In [26]:
# Predict a label for every evaluation embedding.
preds = rf.predict(X=df_test_X)
# One prediction per evaluation row.
preds.shape

(500,)

Compare the ground truths with the predictions - get accuracy, precision, recall and F1-score

In [None]:
# Compare the predictions with the ground truths.
# The `completion` values shown for this dataset are 1 and 2 rather than 0/1, so
# the class treated as "positive" by the binary metrics is stated explicitly.
# pos_label=1 is also sklearn's default, so the scores themselves are unchanged.
accuracy = accuracy_score(df_test_y, preds)
print(f"Accuracy: {accuracy}")

precision = precision_score(df_test_y, preds, pos_label=1)
print(f"Precision: {precision}")  # label fixed: was misspelled "Precession"

f_1 = f1_score(df_test_y, preds, pos_label=1)
print(f"F1 Score: {f_1}")


recall = recall_score(df_test_y, preds, pos_label=1)
print(f"Recall: {recall}")


# Per-class precision / recall / F1 in a single table.
report = classification_report(df_test_y, preds)
print(f"Classification Report:\n{report}")

