In [1]:
import numpy as np
import pandas as pd
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [2]:
# Build a lookup from each message's text to its TF-IDF row vector.

readPath = 'TrainingData.csv'
data = pd.read_csv(readPath)

# Pool both message columns (coerced to str) so one vocabulary covers them all.
wholeText = pd.Series(
    [*data['Message1'].astype(str), *data['Message2'].astype(str)]
)

v = TfidfVectorizer()
matrix = v.fit_transform(wholeText)

# zip pairs the i-th text with the i-th row of `matrix`; a text that occurs
# more than once ends up as a single key (the last pairing wins).
messageToVectorDictionary = {
    message: vector for message, vector in zip(wholeText, matrix)
}


In [3]:
# Computes the cosine similarity and the absolute difference between the two message vectors of each row and adds them to the training data as features

def FeatureDistinction(msg1, msg2):
    """Build the feature vector describing how two messages differ.

    Parameters
    ----------
    msg1, msg2
        The two messages to compare. Each is converted with ``str()`` before
        being looked up in ``messageToVectorDictionary``, matching the
        ``astype(str)`` conversion applied when that dictionary's keys were
        built. Without this, a non-string cell (e.g. NaN read from an empty
        CSV field) would raise ``KeyError`` even though its string form is a
        valid key. String inputs are unaffected.

    Returns
    -------
    numpy.ndarray
        1-D array whose first element is the cosine similarity of the two
        message vectors, followed by their element-wise absolute difference.

    Raises
    ------
    KeyError
        If the string form of either message is not a key of
        ``messageToVectorDictionary``.
    """
    vector1 = messageToVectorDictionary[str(msg1)]
    vector2 = messageToVectorDictionary[str(msg2)]

    # The pairwise result is indexed [0][0] to extract the single score.
    cosineSimilarity = cosine_similarity(vector1, vector2)[0][0]
    # Subtract first, then convert to a dense array and take its only row.
    absoluteDifference = np.abs(vector1 - vector2).toarray()[0]

    return np.hstack([cosineSimilarity, absoluteDifference])

# One feature vector per row, built from that row's two messages.
featureList = [
    FeatureDistinction(first, second)
    for first, second in zip(data['Message1'], data['Message2'])
]

# Attach the features and encode the boolean label as 1 (True) / 0 (False).
data = data.assign(
    MessageFeature=featureList,
    FromSameUserNum=data.FromSameUser.map({True: 1, False: 0}),
)

data.head()

Unnamed: 0,Message1,Message2,FromSameUser,MessageFeature,FromSameUserNum
0,@ Delta :smiley: ¤ Do you want moderator permi...,I made this how do i give you permissions anyw...,True,"[0.18767156108866093, 0.0, 0.0, 0.0, 0.0, 0.0,...",1
1,i dunno i grew up in a town of 300 . woo ! oh ...,peace m80's can we just take a second to remem...,False,"[0.11800560877602487, 0.0, 0.0, 0.0, 0.0, 0.0,...",0
2,Muffin you can edit in setting Huh Hi ¤ Check ...,i made this no idea tc was fun . i think tak s...,True,"[0.082686184302245, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",1
3,rest well dear :heart: Peace @ carawayseeds ok...,tfw you're waiting for your mac and cheese to ...,False,"[0.09748589080485451, 0.0, 0.0, 0.0, 0.0, 0.0,...",0
4,xD; i use that one a lot ¤ D: bbye ooo hi all ...,:open_mouth: ¤ gasps ¤ /nick D ¤ :frowning: oh...,True,"[0.045815412122113014, 0.0, 0.0, 0.0, 0.0, 0.0...",1


In [4]:
# Train the model and report its effectiveness on a held-out test split.

# Stack the per-row feature arrays into one 2-D matrix (one row per message pair).
X = np.vstack(list(data["MessageFeature"]))

# 80/20 split, stratified on the label so both classes keep the same ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, data.FromSameUserNum,
    test_size=0.2, random_state=40, stratify=data.FromSameUserNum,
)

classifier = RandomForestClassifier(n_estimators=100, random_state=40)
classifier.fit(X_train, y_train)

y_prediction = classifier.predict(X_test)
report = classification_report(y_test, y_prediction)
print(report)


              precision    recall  f1-score   support

           0       0.58      0.59      0.58       104
           1       0.58      0.57      0.57       104

    accuracy                           0.58       208
   macro avg       0.58      0.58      0.58       208
weighted avg       0.58      0.58      0.58       208



In [5]:
# Count how many message pairs fall into each FromSameUser class.
data['FromSameUser'].value_counts()

FromSameUser
True     519
False    519
Name: count, dtype: int64