In [15]:
import numpy as np
import pandas as pd
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [22]:
#stores the text into a dictionary where each TfidfVector can be found with the original message

readPath = 'TrainingData.csv'

def MessageToVectorDictionaryBuilder(readPath):
    df = pd.read_csv(readPath)
    wholeText = df['Message1'].astype(str).tolist() + df['Message2'].astype(str).tolist()
    wholeText = pd.Series(wholeText)

    vectorizer = TfidfVectorizer()
    vectorMatrix = vectorizer.fit_transform(wholeText)
    messageToVectorDictionary = dict(zip(wholeText, vectorMatrix))

    return df, messageToVectorDictionary

df, messageToVectorDictionary = MessageToVectorDictionaryBuilder(readPath)

In [17]:
#calculates all the features for the training model

#calculates average word length and punctuation frequency
def WordLengthAndPunctuation(message1, message2):
    punctuations = ["!", "?", ".", ",", ";"]
    
    message1AvgLen = 0
    message2AvgLen = 0
    words1 = 0
    words2 = 0

    punctuations1 = 0
    punctuations2 = 0
    
    for word in message1.split():
        message1AvgLen += len(word)
        if word in punctuations:
            punctuations1 += 1
        words1 += 1

    for word in message2.split(): 
        message2AvgLen += len(word)
        if word in punctuations:
            punctuations2 += 1
        words2 += 1

    message1AvgLen = message1AvgLen / words1
    message2AvgLen = message2AvgLen / words2
    avgLenDiff = message1AvgLen - message2AvgLen

    punctuations1Avg = punctuations1 / words1
    punctuations2Avg = punctuations2 / words2
    punctuationDiff = punctuations1Avg - punctuations2Avg 
    
    return avgLenDiff, punctuationDiff

#does cosine comparison to the messages vectors and adds them into the training data
def CosineComparison(message1, message2, messageToVectorDictionary):
    vector1 = messageToVectorDictionary[message1]
    vector2 = messageToVectorDictionary[message2]
    
    cosineSimilarity = cosine_similarity(vector1, vector2)[0][0]  
    
    return cosineSimilarity


def FeatureBuilder(df, messageToVectorDictionary):
    cosineComparisonList = []
    wordLengthComparisonList = []
    punctuationComparisonList = []

    for blank, row in df.iterrows():
        avgWordLenDiff, avgPunctuationDiff = WordLengthAndPunctuation(row['Message1'], row['Message2'])
    
        wordLengthComparisonList.append(avgWordLenDiff)
        punctuationComparisonList.append(avgPunctuationDiff)
        cosineComparisonList.append(CosineComparison(row['Message1'], row['Message2'], messageToVectorDictionary))

    #adds features and maps True, False to 0 and 1 
    df = df.assign(CosineFeature = cosineComparisonList)
    df = df.assign(WordLengthFeature = wordLengthComparisonList)
    df = df.assign(PunctuationFeature = punctuationComparisonList)

    df['FromSameUserNum'] = df.FromSameUser.map({
        True: 1,
        False: 0
    })

    return df

df = FeatureBuilder(df, messageToVectorDictionary)


In [18]:
#trains the model and gives results on the effectivenes 

#X = np.vstack(data["CosineFeature"])  
X = df.drop(columns=[
    'Message1',
    'Message2', 
    'FromSameUserNum',
    'FromSameUser'
])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    df.FromSameUserNum,
    test_size = 0.2, 
    random_state = 40,
    stratify = df.FromSameUserNum
)


classifier = RandomForestClassifier(n_estimators=100, random_state=40)
classifier.fit(X_train, y_train)

y_prediction = classifier.predict(X_test)
print(classification_report(y_test, y_prediction))


              precision    recall  f1-score   support

           0       0.80      0.79      0.79       104
           1       0.79      0.80      0.79       104

    accuracy                           0.79       208
   macro avg       0.79      0.79      0.79       208
weighted avg       0.79      0.79      0.79       208



In [5]:
df.FromSameUser.value_counts()

FromSameUser
True     519
False    519
Name: count, dtype: int64

In [7]:
import joblib
#saves model for testing implementation
joblib.dump(classifier, "RandomForestModel.pkl")

['RandomForestModel.pkl']