# Current Plan
1. Create a function to calculate the distance between descriptions, then use that function to naively classify new descriptions. It will probably look something like a search function.
2. Grab a text classification model off Hugging Face and fine-tune it on the dataset.

In [None]:
# Install the packages listed in requirements.txt from inside the notebook,
# because they aren't loading in the venv for some reason.
# The %pip magic (rather than !pip) installs into the environment the running kernel uses.
%pip install -r requirements.txt

In [35]:
# Set up data split
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the raw dataset from disk.
with open('data/Datas.csv') as file:
    data = pd.read_csv(file)

# Row counts per category and per (category, sub-category) pair, kept for inspection.
cat_count = data.value_counts('Category')
subcat_count = data.value_counts(['Category', 'Sub_Category'])

# The smallest class has only 2 samples, so drop every category with 2 or fewer
# rows before the stratified splits below.
data = data.groupby('Category').filter(lambda group: len(group) > 2)

# First split: hold out 20% of the rows as the test set, stratified by category.
train, test = train_test_split(
    data,
    test_size=0.2,
    stratify=data['Category'],
    random_state=42,  # fixed seed for reproducibility
)

# Second split: 30% of the remaining rows become the validation set,
# again stratified by category.
train, val = train_test_split(
    train,
    test_size=0.3,
    stratify=train['Category'],
    random_state=77,
)

## 1. Calculating distance 
Cosine similarity [shouldn't be affected by text length](https://nikoskalikis.medium.com/text-similarity-euclidian-distance-vs-cosine-similarity-3a1167f686a#:~:text=Cosine%20similarity%20is%20an%20important,value%20between%20%2D1%20and%201.); the descriptions here are quite short.



In [36]:
# Function to measure distance 
from sklearn.feature_extraction.text import TfidfVectorizer
# have a think about whether the data needs cleaning


# Learn the TF-IDF vocabulary and weights from the training descriptions only
# (the validation set is not included here) and vectorise them in the same step.
tfidf = TfidfVectorizer()  # pass token_pattern to remove punctuation if needed
tfidf_array = tfidf.fit_transform(train['Description']).toarray()

# Dense feature table: one row per training description, one column per term.
data2 = pd.DataFrame(
    data=tfidf_array,
    index=train['Description'],
    columns=tfidf.get_feature_names_out(),
)

In [37]:
# cosine distance
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all training descriptions:
#   sim(x, y) = x.y / (||x|| * ||y||)
#             = sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))
values = cosine_similarity(X=data2, Y=data2)

# Label both axes of the similarity matrix with the training frame's index.
output = pd.DataFrame(
    data=values,
    index=train.index,
    columns=train.index,
)

In [None]:
# plot
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the pairwise cosine-similarity matrix of the training descriptions.
# An explicit Axes is used so the figure carries a title and axis labels and can
# be read on its own; plt.show() keeps the Axes repr out of the cell output.
fig, ax = plt.subplots()
sns.heatmap(output, ax=ax)
ax.set(
    title="Pairwise cosine similarity of training descriptions (TF-IDF)",
    xlabel="Training row index",
    ylabel="Training row index",
)
plt.show()

## 2. Models
I could use a variety of basic models, but I'll stick with KNN and Naive Bayes since they are simple and known to work on text.


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

# Set up KNN: sqrt(n) is a common rule-of-thumb choice for k.
k = int(np.sqrt(train['Category'].count()))
knn = KNeighborsClassifier(n_neighbors=k, metric='cosine')
knn.fit(data2, train['Category'])

# Vectorise the validation descriptions with the vectoriser fitted on the training set.
# Wrapping the array in a DataFrame with the training columns keeps the feature names
# consistent with what `knn` was fitted on (a bare ndarray triggers scikit-learn's
# "X does not have valid feature names" warning).
val_features = pd.DataFrame(
    tfidf.transform(val['Description']).toarray(),
    columns=data2.columns,
    index=val.index,
)

# Evaluate.
# The first number is the mean 5-fold cross-validation accuracy on the training set,
# and the second is the accuracy on the held-out validation split (not the test split),
# so the labels say exactly that.
# NOTE: data2 was built from a vectoriser fitted on the whole training set, so the
# CV folds share IDF statistics and the CV figure may be slightly optimistic.
accuracies = cross_val_score(knn, data2, train['Category'], cv=5)
print("CV accuracy (train, 5-fold):", np.mean(accuracies))
print("Validation accuracy:", knn.score(val_features, val['Category']))

# TODO: evaluate per-class precision/recall (confusion matrix, classification report).
