## 1. Load the dataset



To run this notebook, you will need to install: pandas, openai, transformers, plotly, matplotlib, scikit-learn, torch (transformer dep), torchvision, and scipy.

In [24]:
import pandas as pd
import openai

# Setup API-Key for OpenAi
openai.api_key = "sk-"

# Define the embedding model which is used
embedding_model = "text-embedding-ada-002"

# Retrieve the data from the database
input_datapath = pd.read_sql_query("SELECT * FROM 'mortgage complaints'", "sqlite:///StaterData.db")

# Limit test size due to performance issues
data = input_datapath.loc[:30]

# Extract the relevant columns and put into a list
texts = data["Consumer complaint narrative"].tolist()

# Initialize a list to store the embeddings
embeddings = []

# Generate embeddings for the texts
for text in texts:
    response = openai.Completion.create(
        engine=embedding_model,
        prompt=text,
        max_tokens=10,
        temperature=0,
        top_p=0,
        n=1,
        stop=None,
        logprobs=10,
    )
    logprobs = response.choices[0].logprobs
    embedding = logprobs.token_logprobs
    embeddings.append(embedding)


# Assign the embeddings to the DataFrame
data["Embeddings"] = embeddings

# data = data["Embeddings"]

# Save the DataFrame with embeddings to a CSV file
data.to_csv("MortgageEmbeddings.csv", index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Embeddings"] = embeddings


In [16]:
from openai.embeddings_utils import get_embedding

get_embedding("the fox crossed the road", engine='text-embedding-ada-002')

[-0.0006075783167034388,
 0.00036217979504726827,
 -0.020174754783511162,
 0.007051064632833004,
 -0.01384962908923626,
 0.025376256555318832,
 -0.022232631221413612,
 -0.022321006283164024,
 0.013874879106879234,
 -0.033128008246421814,
 0.029239507392048836,
 0.008850127458572388,
 0.032976508140563965,
 -0.015831753611564636,
 0.0048732515424489975,
 -0.0010605002753436565,
 0.01954350620508194,
 -0.0010549768339842558,
 0.018306255340576172,
 -0.030678758397698402,
 -0.009860128164291382,
 0.029441507533192635,
 0.0023498288355767727,
 -0.02365925721824169,
 -0.003907438833266497,
 0.006457689218223095,
 0.015238379128277302,
 -0.013824379071593285,
 -0.002794860163703561,
 -0.011589753441512585,
 0.008092626929283142,
 0.0018953286344185472,
 -0.028810258954763412,
 0.00047462122165597975,
 -0.013862254098057747,
 -0.017409879714250565,
 -0.0018827036255970597,
 -0.008660752326250076,
 0.01124256569892168,
 -0.027421507984399796,
 0.027143757790327072,
 0.005277251359075308,
 -0.0

In [30]:
import pandas as pd
import json
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load the data with embeddings from the CSV file
data = pd.read_csv("MortgageEmbeddings.csv")

# Prepare the feature matrix X and target vector y
X = data["Embeddings"].apply(lambda x: json.loads(x)).tolist()
y = data["Issue"]

# Reshape the embeddings into a 2D array
X = np.array(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an instance of the random forest classifier
rf_classifier = RandomForestClassifier()

# Train the random forest classifier
rf_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

# Evaluate the performance of the model
accuracy = (y_pred == y_test).mean()
print("Accuracy:", accuracy)



Accuracy: 0.2857142857142857
