---
# Data Mining - Project 3 - Spring Semester 2020
---
##  Iglezou Myrto  | sdi1700038
---

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import sys # only needed to determine Python version number
import matplotlib # only needed to determine Matplotlib version number
import nltk
from pandas import DataFrame, read_csv
from nltk import word_tokenize, BigramCollocationFinder
from nltk.corpus import stopwords as stopwords2

In [2]:
trainPath = r"C:\Users\myrto\Documents\GitHub\-Project-3-DataMining\train.csv" 
testPath = r"C:\Users\myrto\Documents\GitHub\-Project-3-DataMining\impermium_verification_set.csv" 
y_testPath = r"C:\Users\myrto\Documents\GitHub\-Project-3-DataMining\impermium_verification_labels.csv"
tempfile = pd.read_csv(trainPath,index_col=False)
traindf = pd.DataFrame(data=tempfile)
tempfile = pd.read_csv(testPath,index_col=False)
testdf = pd.DataFrame(data=tempfile)
tempfile = pd.read_csv(y_testPath,index_col=False)
y_testdf = pd.DataFrame(data=tempfile)
y_test = y_testdf["Insult"]

In [3]:
from string import punctuation
from wordcloud import STOPWORDS,WordCloud

punctuation = list(punctuation)
punctuation.append('â€™')
stopwordsPath = r"C:\Users\myrto\Documents\GitHub\-Project-3-DataMining\english_stopwords.txt"
file = open(stopwordsPath,"r",errors='ignore')
stopwords_fromHub = [x.replace('\n',"") for x in file.readlines()]
stopwords = set(STOPWORDS)
stopwords_nltk =  set(stopwords2.words('english'))
stopwords.update(stopwords_nltk)
stopwords.update(set(stopwords_fromHub))

punctuation.append("``")
punctuation.append("...")

In [4]:
def vectorizeData(X_train,X_test,vectorizer):
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    return X_train,X_test

In [5]:
from sklearn import metrics

def classifier(NB,x_train, y_train,x_test,y_test):
    clf = NB
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    train_accuracy = metrics.accuracy_score(y_test, y_pred)
    train_f1 = metrics.f1_score(y_test, y_pred)
    print("Accuracy score = ",train_accuracy)
    print("F1 score = ",train_f1)

# Cleaning Data

In [6]:
import re
from nltk.stem import WordNetLemmatizer

testdf['Comment'] = testdf['Comment'].str.replace('http\S+|www.\S+', '', case=False)
traindf['Comment'] = traindf['Comment'].str.replace('http\S+|www.\S+', '', case=False)

x_train = traindf['Comment']
y_train = traindf['Insult']
x_test = testdf['Comment']

def cleanText(data):
    processed_data = []   
    for text in data.values:
        textWords = []
        # text = re.sub("[^a-zA-Z]"," ",text) # matches everything from numbers and punctuations etc , leaving only the words
        for word in word_tokenize(text):
            word = word.lower()
            if (word not in punctuation) and word.isalpha():
                textWords.append(word)
        processed_data.append(' '.join(textWords))
    data.update(pd.Series(processed_data,index=data.index))

def lemmatize(data):
    processed_data = []   
    for text in data.values:
        textWords = []
        for word in word_tokenize(text):
            word = WordNetLemmatizer().lemmatize(word)
            textWords.append(word)
        processed_data.append(' '.join(textWords))
    data.update(pd.Series(processed_data,index=data.index))

def removeStopWords(data):
    processed_data = []   
    for text in data.values:
        textWords = []
        for word in word_tokenize(text):
            if word not in stopwords:
                textWords.append(word)
        processed_data.append(' '.join(textWords))
    data.update(pd.Series(processed_data,index=data.index))

In [7]:
cleanText(x_train)
cleanText(x_test)

---
# Naive Bayes
---

In [8]:
y_test = y_test.tolist()
y_train = y_train.tolist()

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.naive_bayes import GaussianNB

count_train,count_test = vectorizeData(x_train,x_test,CountVectorizer(max_df=1.0, min_df=1)) 

count_train = count_train.toarray()
count_test = count_test.toarray()

classifier(GaussianNB(),count_train, y_train,count_test,y_test)

Accuracy score =  0.5221476510067115
F1 score =  0.5291005291005291


## 1. Lemmatization

In [10]:
lemmatize(x_train)
lemmatize(x_test)

count_train,count_test = vectorizeData(x_train,x_test,CountVectorizer(max_df=1.0, min_df=1)) 

count_train = count_train.toarray()
count_test = count_test.toarray()

classifier(GaussianNB(),count_train, y_train,count_test,y_test)

Accuracy score =  0.5257270693512305
F1 score =  0.5411255411255411


## 2. Stopwords

In [11]:
count_train,count_test = vectorizeData(x_train,x_test,CountVectorizer(max_df=1.0, min_df=1,stop_words=stopwords)) 

count_train = count_train.toarray()
count_test = count_test.toarray()

classifier(GaussianNB(),count_train, y_train,count_test,y_test)

Accuracy score =  0.5266219239373602
F1 score =  0.5543386689132266


## 3. Bigrams

In [12]:
count_train,count_test = vectorizeData(x_train,x_test,CountVectorizer(max_df=1.0, min_df=1,ngram_range=(2,2))) 

count_train = count_train.toarray()
count_test = count_test.toarray()

classifier(GaussianNB(),count_train, y_train,count_test,y_test)

Accuracy score =  0.5570469798657718
F1 score =  0.5240384615384615


## 4. Laplace Smoothing

In [13]:
from sklearn.naive_bayes import MultinomialNB

count_train,count_test = vectorizeData(x_train,x_test,CountVectorizer()) 

count_train = count_train.toarray()
count_test = count_test.toarray()
classifier(GaussianNB(var_smoothing=1),count_train, y_train,count_test,y_test)

Accuracy score =  0.48322147651006714
F1 score =  0.6479731789088692


## 5. Try all together

In [14]:
lemmatize(x_train)
lemmatize(x_test)

count_train,count_test = vectorizeData(x_train,x_test,CountVectorizer(max_df=1.0, min_df=1,ngram_range=(2,2),stop_words=stopwords)) 

count_train = count_train.toarray()
count_test = count_test.toarray()

classifier(GaussianNB(var_smoothing=1),count_train, y_train,count_test,y_test)

Accuracy score =  0.4818791946308725
F1 score =  0.6503623188405797


---
# Support Vector Machine (SVM)
---

In [15]:
from sklearn import svm

tfidf_train,tfidf_test = vectorizeData(x_train,x_test,TfidfVectorizer(max_df=1.0,min_df=1))
classifier(svm.SVC(),tfidf_train,y_train,tfidf_test,y_test)

Accuracy score =  0.6662192393736018
F1 score =  0.5174644243208278


In [16]:
tfidf_train,tfidf_test = vectorizeData(x_train,x_test,TfidfVectorizer(max_df=1.0,min_df=1,stop_words=stopwords))
classifier(svm.SVC(),tfidf_train,y_train,tfidf_test,y_test)

Accuracy score =  0.6469798657718121
F1 score =  0.46868686868686865


### Part-of-Speech for train data

In [17]:
listOfpos = []  
for text in x_train:
    textWords =  word_tokenize(text)
    listOfpos.append(nltk.pos_tag(textWords))

In [18]:
noun=0
verb=0
adverb=0
adjective=0
count=0
for list1 in listOfpos:
    for tuple1 in list1:
        count+=1
        if(tuple1[1]=='VBD'):
            verb+=1
        if(tuple1[1]=='NN'):
            noun+=1
        if(tuple1[1]=='VBG'):
            adverb+=1
        if(tuple1[1]=='JJ' or tuple1[1]=='JJS'):
            adjective+=1
train_fractionAdverbs  = adverb/count
train_fractionVerbs = verb/count
train_fractionAdjectives = adjective/count
train_fractionNouns = noun/count

### Part-of-Speech for test data

In [19]:
listOfpos = []  
for text in x_test:
    textWords =  word_tokenize(text)
    listOfpos.append(nltk.pos_tag(textWords))

In [20]:
noun=0
verb=0
adverb=0
adjective=0
count=0

for list1 in listOfpos:    
    for tuple1 in list1:
        count+=1
        verb += tuple1.count('VBD')
        noun += tuple1.count('NN')
        adverb += tuple1.count('VBG')
        adjective  += tuple1.count('JJ') + tuple1.count('JJS')

test_fractionAdverbs  = adverb/count
test_fractionVerbs = verb/count
test_fractionAdjectives = adjective/count
test_fractionNouns = noun/count

--- 
# Random Forest Classifier
---

In [21]:
from sklearn.ensemble import RandomForestClassifier
tfidf_train,tfidf_test = vectorizeData(x_train,x_test,TfidfVectorizer(max_df=1.0,min_df=1))
classifier(RandomForestClassifier(),tfidf_train,y_train,tfidf_test,y_test)

Accuracy score =  0.6335570469798658
F1 score =  0.4347826086956523


In [22]:
tfidf_train,tfidf_test = vectorizeData(x_train,x_test,TfidfVectorizer(max_df=1.0,min_df=1,stop_words=stopwords))
classifier(RandomForestClassifier(),tfidf_train,y_train,tfidf_test,y_test)

Accuracy score =  0.6796420581655481
F1 score =  0.5563816604708798


In [23]:
tfidf_train,tfidf_test = vectorizeData(x_train,x_test,TfidfVectorizer(max_df=1.0,min_df=1,max_features=20000))
classifier(RandomForestClassifier(),tfidf_train,y_train,tfidf_test,y_test)


Accuracy score =  0.6389261744966444
F1 score =  0.43368421052631584


In [32]:
import spacy
import en_core_web_sm
# Generate POS tags 

def pos_tag(df,data):
    nlp = spacy.load('en_core_web_sm')
    pos_tags_column = []

    for text in data:
        pos_tags = []
        doc = nlp(text)
        for token in doc:
            pos_tags.append(token.pos_)
        all_pos_tags = ' '.join(pos_tags)
        pos_tags_column.append(all_pos_tags)
        
    df['Text_POS'] = pos_tags_column

In [None]:
train_df = DataFrame(data=x_train)
test_df  = DataFrame(data=x_test)

pos_tag(train_df,x_train)
pos_tag(test_df,x_test)
pos_train = train_df['Text_POS']
pos_test = test_df['Text_POS']

In [58]:
pos_tfidf_vectorizer = TfidfVectorizer(max_df=1.0,min_df=1,stop_words=stopwords,smooth_idf=1)
pos_tfidf_train = pos_tfidf_vectorizer.fit_transform(pos_train.astype('str'))
pos_tfidf_test= pos_tfidf_vectorizer.transform(pos_test.astype('str'))

classifier(RandomForestClassifier(),pos_tfidf_train,y_train,pos_tfidf_test,y_test)

Accuracy score =  0.5324384787472036
F1 score =  0.2572850035536603


In [59]:
classifier(svm.SVC(),pos_tfidf_train,y_train,pos_tfidf_test,y_test)

Accuracy score =  0.5288590604026846
F1 score =  0.08673026886383348
