In [21]:
import numpy as np
import pandas as pd
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [22]:
# Build a lookup from each message's text to its TF-IDF vector.

readPath = 'TrainingData1.csv'
data = pd.read_csv(readPath)

# Stack both message columns (as strings) into one corpus so a single
# vectorizer is fitted on the text of Message1 and Message2 together.
wholeText = pd.concat(
    [data['Message1'], data['Message2']],
    ignore_index=True,
).astype(str)

v = TfidfVectorizer()
matrix = v.fit_transform(wholeText)

# Pair every text with the matching item obtained by iterating `matrix`;
# a text that occurs more than once keeps the last vector paired with it.
messageToVectorDictionary = {
    text: vector for text, vector in zip(wholeText, matrix)
}


In [23]:
#computes the cosine similarity between each pair of message vectors and adds it to the data

def FeatureDistinction(msg1, msg2):
    """Return the cosine similarity between the TF-IDF vectors of two messages.

    Both messages are looked up in the module-level
    ``messageToVectorDictionary``. Its keys are built from the message
    columns after ``astype(str)``, so each argument is coerced with ``str()``
    before the lookup; otherwise a non-string cell value (e.g. a missing
    value read from the CSV as NaN) would not match its string key.

    Parameters:
        msg1: first message of the pair (any value; converted with ``str``).
        msg2: second message of the pair (any value; converted with ``str``).

    Returns:
        The ``[0][0]`` entry of ``cosine_similarity(vector1, vector2)``.

    Raises:
        KeyError: if the string form of either message is not a key of
            ``messageToVectorDictionary``.
    """
    vector1 = messageToVectorDictionary[str(msg1)]
    vector2 = messageToVectorDictionary[str(msg2)]

    # cosine_similarity returns a 2D array; take its first entry as the score.
    cosineSimilarity = cosine_similarity(vector1, vector2)[0][0]
    return cosineSimilarity

# Score every (Message1, Message2) pair, producing one value per row of `data`.
featureList = [
    FeatureDistinction(first, second)
    for first, second in zip(data['Message1'], data['Message2'])
]

# Attach the scores as a new column, then encode the boolean label
# as 1 (True) / 0 (False).
data = data.assign(MessageFeature=featureList)
data['FromSameUserNum'] = data.FromSameUser.map({True: 1, False: 0})

data.head()

Unnamed: 0,Message1,Message2,FromSameUser,MessageFeature,FromSameUserNum
0,"anyways , exploring this as a fill in for tiny...",ha ! gave em,True,0.0,1
1,makes sense,oh hai ¤ a red one ¤ i get its qik ¤ bet ¤ or tak,True,0.0,1
2,/fite dvs,Hello,False,0.0,0
3,So you guys did like my idea to have a discord...,thought about it before but to lazy to do anyt...,False,0.049688,0
4,no idea,tc was fun . i think tak set one of these up f...,True,0.0,1


In [24]:
# Trains the model and reports its effectiveness on a held-out split.

# The estimator needs a 2D feature matrix, so the single feature column is
# reshaped to one column with one row per sample.
X = data["MessageFeature"].to_numpy().reshape(-1, 1)

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    data["FromSameUserNum"],
    test_size=0.2,
    random_state=40,
    stratify=data["FromSameUserNum"],
)

classifier = RandomForestClassifier(n_estimators=100, random_state=40).fit(X_train, y_train)

y_prediction = classifier.predict(X_test)
print(classification_report(y_test, y_prediction))


              precision    recall  f1-score   support

           0       0.56      0.18      0.27       183
           1       0.53      0.87      0.66       195

    accuracy                           0.53       378
   macro avg       0.54      0.52      0.47       378
weighted avg       0.54      0.53      0.47       378

