In [18]:
import numpy as np
import pandas as pd
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import joblib


In [6]:
#stores the text into a dictionary where each TfidfVector can be found with the original message

readPath = 'TrainingData.csv'

def MessageToVectorDictionaryBuilder(readPath):
    df = pd.read_csv(readPath)
    wholeText = df['Message1'].astype(str).tolist() + df['Message2'].astype(str).tolist()
    wholeText = pd.Series(wholeText)

    vectorizer = TfidfVectorizer()
    vectorMatrix = vectorizer.fit_transform(wholeText)
    messageToVectorDictionary = dict(zip(wholeText, vectorMatrix))

    return df, messageToVectorDictionary

df, messageToVectorDictionary = MessageToVectorDictionaryBuilder(readPath)

In [7]:
#calculates all the features for the training model

#calculates average word length and punctuation frequency
def WordLengthAndPunctuation(message1, message2):
    punctuations = ["!", "?", ".", ",", ";"]
    
    message1AvgLen = 0
    message2AvgLen = 0
    words1 = 0
    words2 = 0

    punctuations1 = 0
    punctuations2 = 0
    
    for word in message1.split():
        message1AvgLen += len(word)
        if word in punctuations:
            punctuations1 += 1
        words1 += 1

    for word in message2.split(): 
        message2AvgLen += len(word)
        if word in punctuations:
            punctuations2 += 1
        words2 += 1

    message1AvgLen = message1AvgLen / words1
    message2AvgLen = message2AvgLen / words2
    avgLenDiff = message1AvgLen - message2AvgLen

    punctuations1Avg = punctuations1 / words1
    punctuations2Avg = punctuations2 / words2
    punctuationDiff = punctuations1Avg - punctuations2Avg 
    
    return avgLenDiff, punctuationDiff

#does cosine comparison to the messages vectors and adds them into the training data
def CosineComparison(message1, message2, messageToVectorDictionary):
    vector1 = messageToVectorDictionary[message1]
    vector2 = messageToVectorDictionary[message2]
    
    cosineSimilarity = cosine_similarity(vector1, vector2)[0][0]  
    
    return cosineSimilarity


def FeatureBuilder(df, messageToVectorDictionary):
    cosineComparisonList = []
    wordLengthComparisonList = []
    punctuationComparisonList = []

    for blank, row in df.iterrows():
        avgWordLenDiff, avgPunctuationDiff = WordLengthAndPunctuation(row['Message1'], row['Message2'])
    
        wordLengthComparisonList.append(avgWordLenDiff)
        punctuationComparisonList.append(avgPunctuationDiff)
        cosineComparisonList.append(CosineComparison(row['Message1'], row['Message2'], messageToVectorDictionary))

    #adds features and maps True, False to 0 and 1 
    df = df.assign(CosineFeature = cosineComparisonList)
    df = df.assign(WordLengthFeature = wordLengthComparisonList)
    df = df.assign(PunctuationFeature = punctuationComparisonList)

    df['FromSameUserNum'] = df.FromSameUser.map({
        True: 1,
        False: 0
    })

    return df

df = FeatureBuilder(df, messageToVectorDictionary)


In [13]:
#trains the model and gives results on the effectivenes 

def TrainTest(df):
    
    X = df.drop(columns=[
        'Message1',
        'Message2', 
        'FromSameUserNum',
        'FromSameUser'
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        df.FromSameUserNum,
        test_size = 0.2, 
        random_state = 40,
        stratify = df.FromSameUserNum
    )


    classifier = RandomForestClassifier(n_estimators=100, random_state=40)
    classifier.fit(X_train, y_train)

    y_prediction = classifier.predict(X_test)
    #print(classification_report(y_test, y_prediction))
    return classification_report(y_test, y_prediction, output_dict=True), classifier

result, model = TrainTest(df)
print(result)


{'0': {'precision': 0.7532467532467533, 'recall': 0.8405797101449275, 'f1-score': 0.7945205479452054, 'support': 69.0}, '1': {'precision': 0.819672131147541, 'recall': 0.7246376811594203, 'f1-score': 0.7692307692307693, 'support': 69.0}, 'accuracy': 0.782608695652174, 'macro avg': {'precision': 0.7864594421971471, 'recall': 0.7826086956521738, 'f1-score': 0.7818756585879874, 'support': 138.0}, 'weighted avg': {'precision': 0.7864594421971471, 'recall': 0.782608695652174, 'f1-score': 0.7818756585879874, 'support': 138.0}}


In [19]:
#implementation test
def ImplementationTest(df, model):
    model = joblib.load(model)

    X = df.drop(columns=[
        'Message1',
        'Message2', 
        'FromSameUserNum',
        'FromSameUser'
    ])
    true = df["FromSameUserNum"]

    #Make predictions
    prediction = model.predict(X)

    return classification_report(true, prediction, output_dict=True), confusion_matrix(true, prediction)

In [41]:
df.FromSameUser.value_counts()

FromSameUser
True     345
False    345
Name: count, dtype: int64

In [39]:
#builds the graphs needed

def F1GraphBuilder(f1ResultMatrix, length, nameList):
    for i, test in enumerate(f1ResultMatrix):
        plt.figure(figsize = (10, 10))
        subPlot = plt.subplot2grid((3, 3), (0, 0), colspan=2)
        

        #x = total cases
        x = range(length)
        subPlot.plot(x, test[0], color = 'r', label = 'precision0')
        subPlot.plot(x, test[1], color = 'g', label = 'recall0')
        subPlot.plot(x, test[2], color = 'c', label = 'f1-score0')
        subPlot.plot(x, test[3], color = 'b', label = 'precision1')
        subPlot.plot(x, test[4], color = 'y', label = 'recall1')
        subPlot.plot(x, test[5], color = 'm', label = 'f1-score1')
        subPlot.plot(x, test[6], color = 'k', label = 'accuracy')

        subPlot.legend()
        plt.tight_layout()
        
        plt.title(nameList[i])
        plt.xlabel('Model size')
        plt.ylabel("Score")
        
        plt.grid()
        plt.yticks(np.linspace(0, 1, 11))
        plt.show()
        
    
    return plt

def ConfusionMatrixGraphBuilder(confusionMatrix, length, nameList):
    for i, test in enumerate(confusionMatrix):
        plt.figure(figsize = (10, 10))
        subPlot = plt.subplot2grid((3, 3), (0, 0), colspan=2)

        x = range(length)
        subPlot.plot(x, test[0], color = 'b', label = 'TP')
        subPlot.plot(x, test[1], color = 'c', label = 'FP')
        subPlot.plot(x, test[2], color = 'r', label = 'TN')
        subPlot.plot(x, test[3], color = 'y', label = 'FN')

        subPlot.legend()
        plt.tight_layout()
        
        plt.title(nameList[i])
        plt.xlabel('Model size')
        plt.ylabel("Amount")
        
        plt.grid()
        plt.show()
    
    return plt