Τεχνικές Εξόρυξης Δεδομένων - 3η άσκηση

1115201600046 ΕΛΛΗΝΑ ΚΩΝΣΤΑΝΤΙΝΑ

In [1]:
#import the needed libraries

import os
import re
import nltk
import sklearn
import pandas as pd
from glob import glob
from pandas import DataFrame, read_csv
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_predict, cross_val_score
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.csr import csr_matrix

In [2]:
#Read the data needed.
df = read_csv('data/train.csv')
test_set = read_csv('data/impermium_verification_set.csv')
labels = read_csv('data/impermium_verification_labels.csv')

### PREPROCESSING

In [3]:
#Clean text from punctuation and symbols in train.csv.
df['Comment'] = df['Comment'].str.lower()
df = df.replace(r'\\n',  ' ', regex=True)
df = df.replace(r'\\t',  ' ', regex=True)
df = df.replace(r'\\xa0',  ' ', regex=True)
df['Comment'] = df['Comment'].str.replace(r"http\S+", ' ', regex=True)
df['Comment'] = df['Comment'].str.replace('[^\w\s]', ' ')

In [4]:
#Clean text from punctuation and symbols in impermium_verification_set.csv.
test_set['Comment'] = test_set['Comment'].str.lower()
test_set = test_set.replace(r'\\n',  ' ', regex=True)
test_set = test_set.replace(r'\\t',  ' ', regex=True)
test_set = test_set.replace(r'\\xa0',  ' ', regex=True)
test_set['Comment'] = test_set['Comment'].str.replace(r"http\S+", ' ', regex=True)
test_set['Comment'] = test_set['Comment'].str.replace('[^\w\s]', ' ')

### METHOD

In [5]:
#We put the columns we will use in X_train_text, y_train_text.
X_train_text = df[['Comment']]
y_train_text = df[['Insult']]

#Label encoding in y_train_text.
label_encoder = preprocessing.LabelEncoder()
y_train = label_encoder.fit_transform(y_train_text.values[:, 0])

#Count Vectorizer in X_train_text.
count_vect = CountVectorizer()
X_train = count_vect.fit_transform(X_train_text.values[:, 0])

#Gaussian Naive Bayes for classifier.
X_train = X_train.toarray()
clf = GaussianNB()
clf.fit(X_train, y_train)

#Test data from test_set and labels files. Same as train data with label encoding and count vectorizer.
X_test_set = test_set[['Comment']]
X_test_labels = labels[['Insult']]

X_test = count_vect.transform(X_test_set.values[:, 0])
y_test = label_encoder.transform(X_test_labels.values[:, 0])

#Metrics for NB with CountVectorizer.
f1 = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1_weighted')
print('F-Measure score: ', np.mean(f1), f1)
accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
print('Accuracy score: ', np.mean(accuracy),accuracy)

print("\nTest scores: \n")

# #Predictions for test
X_test = X_test.toarray()
pred = clf.predict(X_test)
f1 = f1_score(y_test, pred, average='weighted')
print('F-Measure score: ', f1)
accuracy = accuracy_score(y_test, pred)
print('Accuracy score: ', accuracy)

F-Measure score:  0.6755539805087347 [0.71400931 0.65530093 0.65465699 0.68989209 0.65424518 0.68619554
 0.68627959 0.65234854 0.6847501  0.67786154]
Accuracy score:  0.6642986570712587 [0.70886076 0.64303797 0.64303797 0.6835443  0.63797468 0.67594937
 0.67088608 0.63705584 0.6751269  0.66751269]

Test scores: 

F-Measure score:  0.5209425683665567
Accuracy score:  0.5208053691275167


#### Lemmatization

In [6]:
#Word tokenizing
tokens = df.apply(lambda row: nltk.word_tokenize(row['Comment']), axis=1)
df['tokenized'] = tokens

#Lemmatization
lemmatizer = WordNetLemmatizer()

def lemm(words):
    lem = [lemmatizer.lemmatize(w) for w in words]
    return lem

#Put the results in a new column.
df['tokenized'] = df['tokenized'].apply(lambda x : lemm(x))

#Detokenize so we can use the column to more predictions.
d = TreebankWordDetokenizer()
df['tokenized'] = df['tokenized'].apply(d.detokenize)

X_train_text = df[['tokenized']]

count_vect = CountVectorizer()
X_train = count_vect.fit_transform(X_train_text.values[:, 0])

#Gaussian Naive Bayes for classifier.
X_train = X_train.toarray()
clf = GaussianNB()
clf.fit(X_train, y_train)

#Test data from test_set and labels files. Same as train data with label encoding and count vectorizer.
X_test_set = test_set[['Comment']]
X_test_labels = labels[['Insult']]

X_test = count_vect.transform(X_test_set.values[:, 0])
y_test = label_encoder.transform(X_test_labels.values[:, 0])

#Metrics for NB with CountVectorizer after lemmatization.
f1 = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1_weighted')
print('F-Measure score: ', np.mean(f1), f1)
accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
print('Accuracy score: ', np.mean(accuracy),accuracy)

print("\nTest scores: \n")

#Predictions for test
X_test = X_test.toarray()
pred = clf.predict(X_test)
f1 = f1_score(y_test, pred, average='weighted')
print('F-Measure score: ', f1)
accuracy = accuracy_score(y_test, pred)
print('Accuracy score: ', accuracy)

F-Measure score:  0.6668381200101035 [0.70184099 0.65275574 0.65591795 0.68984878 0.64141397 0.6689645
 0.65943489 0.65988715 0.67241818 0.66589904]
Accuracy score:  0.6536606052817581 [0.69367089 0.63797468 0.64303797 0.68101266 0.62531646 0.6556962
 0.64050633 0.64467005 0.65989848 0.65482234]

Test scores: 

F-Measure score:  0.52232786233773
Accuracy score:  0.5230425055928412


#### Stopwords

In [7]:
#Word tokenizing
tokens = df.apply(lambda row: nltk.word_tokenize(row['Comment']), axis=1)
df['tokenized'] = tokens

#Remove stopwords
def stops(words):
    stop = [s for s in words if s not in stopwords.words('english')]
    return stop
df['tokenized'] = df['tokenized'].apply(lambda x : stops(x))

#Detokenize so we can use the column to more predictions.
d = TreebankWordDetokenizer()
df['tokenized'] = df['tokenized'].apply(d.detokenize)

X_train_text = df[['tokenized']]

count_vect = CountVectorizer()
X_train = count_vect.fit_transform(X_train_text.values[:, 0])

#Gaussian Naive Bayes for classifier.
X_train = X_train.toarray()
clf = GaussianNB()
clf.fit(X_train, y_train)

#Test data from test_set and labels files. Same as train data with label encoding and count vectorizer.
X_test_set = test_set[['Comment']]
X_test_labels = labels[['Insult']]

X_test = count_vect.transform(X_test_set.values[:, 0])
y_test = label_encoder.transform(X_test_labels.values[:, 0])

#Metrics for NB with CountVectorizer after removing the stopwords.
f1 = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1_weighted')
print('F-Measure score: ', np.mean(f1), f1)
accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
print('Accuracy score: ', np.mean(accuracy),accuracy)

print("\nTest scores: \n")

#Predictions for test
X_test = X_test.toarray()
pred = clf.predict(X_test)
f1 = f1_score(y_test, pred, average='weighted')
print('F-Measure score: ', f1)
accuracy = accuracy_score(y_test, pred)
print('Accuracy score: ', accuracy)

F-Measure score:  0.6756627394972731 [0.71400931 0.65530093 0.65465699 0.68989209 0.65424518 0.68404789
 0.68405773 0.6550284  0.6847501  0.68063878]
Accuracy score:  0.6642999421705327 [0.70886076 0.64303797 0.64303797 0.6835443  0.63797468 0.67341772
 0.66835443 0.63959391 0.6751269  0.67005076]

Test scores: 

F-Measure score:  0.5191331417278628
Accuracy score:  0.5190156599552572


#### Bigrams

In [8]:
#Bigrams
X_train_text = df[['Comment']]

count_vect = CountVectorizer(ngram_range=(2,2))
X_train = count_vect.fit_transform(X_train_text.values[:, 0])

#Gaussian Naive Bayes for classifier.
X_train = X_train.toarray()
clf = GaussianNB()
clf.fit(X_train, y_train)

#Test data from test_set and labels files. Same as train data with label encoding and count vectorizer.
X_test_set = test_set[['Comment']]
X_test_labels = labels[['Insult']]

X_test = count_vect.transform(X_test_set.values[:, 0])
y_test = label_encoder.transform(X_test_labels.values[:, 0])

#Metrics for NB with CountVectorizer with bigrams.
f1 = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1_weighted')
print('F-Measure score: ', np.mean(f1), f1)
accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
print('Accuracy score: ', np.mean(accuracy),accuracy)

print("\nTest scores: \n")

# #Predictions for test
X_test = X_test.toarray()
pred = clf.predict(X_test)
f1 = f1_score(y_test, pred, average='weighted')
print('F-Measure score: ', f1)
accuracy = accuracy_score(y_test, pred)
print('Accuracy score: ', accuracy)

F-Measure score:  0.6919657402175319 [0.69634766 0.66342989 0.71172134 0.65019827 0.73197311 0.69384485
 0.66294016 0.70349204 0.69543147 0.71027859]
Accuracy score:  0.6883814174644991 [0.69873418 0.65822785 0.71898734 0.64303797 0.72405063 0.6835443
 0.65063291 0.70304569 0.69543147 0.70812183]

Test scores: 

F-Measure score:  0.5607032151674453
Accuracy score:  0.5615212527964206


#### Laplace smoothing

In [9]:
#Laplace Smoothing
count_vect = CountVectorizer()
X_train = count_vect.fit_transform(X_train_text.values[:, 0])

#We use multinomial naive bayes so we can use laplace smoothing.
clf =  MultinomialNB(alpha=1)
clf.fit(X_train, y_train)

#Test data from test_set and labels files. Same as train data with label encoding and count vectorizer.
X_test_set = test_set[['Comment']]
X_test_labels = labels[['Insult']]

X_test = count_vect.transform(X_test_set.values[:, 0])
y_test = label_encoder.transform(X_test_labels.values[:, 0])

#Metrics for NB with CountVectorizer with Laplace smoothing.
f1 = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1_weighted')
print('F-Measure score: ', np.mean(f1), f1)
accuracy = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
print('Accuracy score: ', np.mean(accuracy),accuracy)

print("\nTest scores: \n")

#Predictions for test
X_test = X_test.toarray()
pred = clf.predict(X_test)
f1 = f1_score(y_test, pred, average='weighted')
print('F-Measure score: ', f1)
accuracy = accuracy_score(y_test, pred)
print('Accuracy score: ', accuracy)


#We see that the results are better when we use laplace smoothing. 

F-Measure score:  0.7898025956138476 [0.8136786  0.78069296 0.78637673 0.80851058 0.80578269 0.75922159
 0.75698291 0.79423642 0.78907345 0.80347002]
Accuracy score:  0.7843982522649875 [0.80759494 0.77468354 0.78481013 0.80506329 0.80253165 0.74936709
 0.74683544 0.79187817 0.78426396 0.79695431]

Test scores: 

F-Measure score:  0.677133943500741
Accuracy score:  0.6805369127516778


### Pos Tags

In [10]:
#Pos Tags for train data
tokens = df.apply(lambda row: nltk.word_tokenize(row['tokenized']), axis=1)
df['tokenized'] = tokens
tags = [nltk.pos_tag(w) for w in df['tokenized']]

#Pos Tags for test data
tokensT = test_set.apply(lambda row: nltk.word_tokenize(row['Comment']), axis=1)
test_set['tokenized'] = tokensT
tagsT = [nltk.pos_tag(w) for w in test_set['tokenized']]
tags

[[('fuck', 'NN'), ('dad', 'NN')],
 [('really', 'RB'),
  ('understand', 'JJ'),
  ('point', 'NN'),
  ('seems', 'VBZ'),
  ('mixing', 'VBG'),
  ('apples', 'NNS'),
  ('oranges', 'NNS')],
 [('xc2', 'JJ'),
  ('majority', 'NN'),
  ('canadians', 'NNS'),
  ('wrong', 'JJ'),
  ('unless', 'IN'),
  ('supportive', 'JJ'),
  ('idea', 'NN'),
  ('nothing', 'NN'),
  ('full', 'JJ'),
  ('proof', 'NN'),
  ('perfect', 'JJ'),
  ('take', 'VB'),
  ('chances', 'NNS'),
  ('inadvertently', 'RB'),
  ('kill', 'VBP'),
  ('son', 'JJ'),
  ('daughter', 'NN'),
  ('breaks', 'NNS'),
  ('always', 'RB'),
  ('regard', 'VBP'),
  ('collateral', 'JJ'),
  ('damage', 'NN'),
  ('like', 'IN'),
  ('wartime', 'NN'),
  ('sorry', 'NN'),
  ('xc2', 'NNP'),
  ('cheques', 'NNS'),
  ('mail', 'NN')],
 [('listen', 'JJ'),
  ('dont', 'JJ'),
  ('wannaget', 'NN'),
  ('married', 'VBD'),
  ('man', 'NN'),
  ('women', 'NNS'),
  ('dont', 'RB'),
  ('would', 'MD'),
  ('bother', 'VB'),
  ('gay', 'JJ'),
  ('people', 'NNS'),
  ('got', 'VBD'),
  ('married', '

In [11]:
#Train data
#For each sentence i have a 4-point array. We add the counts of pos tags and in the end we divide with the lenght
#of the sentence.
array = []
for sentence in tags:
    feats = np.array([0,0,0,0])
    sent_len = len(sentence)
    for word, pos in sentence:
        if pos in ['NN', 'NNP', 'NNS', 'NNPS']:
            feats[0] += 1
        if pos in ['JJ', 'JJR', 'JJS']:
            feats[1] += 1
        if pos in ['RB', 'RBR', 'RBS']:
            feats[2] += 1
        if pos in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
            feats[3] += 1
    if sent_len > 0:
        array.append(feats / sent_len)
    else:
        array.append(feats)
array = np.array(array)   

array

array([[1.        , 0.        , 0.        , 0.        ],
       [0.42857143, 0.14285714, 0.14285714, 0.28571429],
       [0.5       , 0.25      , 0.07142857, 0.10714286],
       ...,
       [0.69230769, 0.07692308, 0.        , 0.15384615],
       [0.33333333, 0.16666667, 0.05555556, 0.33333333],
       [0.44444444, 0.11111111, 0.07407407, 0.2962963 ]])

In [12]:
#Same as before for test data
arrayT = []
for sentence in tagsT:
    feats = np.array([0,0,0,0])
    sent_len = len(sentence)
    for word, pos in sentence:
        if pos in ['NN', 'NNP', 'NNS', 'NNPS']:
            feats[0] += 1
        if pos in ['JJ', 'JJR', 'JJS']:
            feats[1] += 1
        if pos in ['RB', 'RBR', 'RBS']:
            feats[2] += 1
        if pos in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
            feats[3] += 1
    if sent_len > 0:
        arrayT.append(feats / sent_len)
    else:
        arrayT.append(feats)
arrayT = np.array(arrayT)   

arrayT

array([[0.125     , 0.125     , 0.        , 0.125     ],
       [0.        , 0.        , 0.        , 0.66666667],
       [0.25714286, 0.14285714, 0.        , 0.25714286],
       ...,
       [0.26086957, 0.04347826, 0.08695652, 0.17391304],
       [0.44444444, 0.        , 0.        , 0.11111111],
       [0.3125    , 0.        , 0.        , 0.1875    ]])

### TFIDF - Part-of-speech

In [13]:
#TFIDF
tf_vect = TfidfVectorizer()
X_trainT = tf_vect.fit_transform(X_train_text.values[:, 0])

#We concatenate the part-of-speech array and the tfidf array.
X_trainT = np.concatenate([X_trainT.toarray(), array], 1)
X_trainT = csr_matrix(X_trainT)

#Gaussian Naive Bayes for classifier.
X_train_ar = X_trainT.toarray()
clf = GaussianNB()
clf.fit(X_train_ar, y_train)

#Test data from test_set and labels files. Same as train data with label encoding and count vectorizer.
X_test_set = test_set[['Comment']]
X_test_labels = labels[['Insult']]
X_testT = tf_vect.transform(X_test_set.values[:, 0])

#Concatenate test data.
X_testT = np.concatenate([X_testT.toarray(), arrayT], 1)
X_testT = csr_matrix(X_testT)

#Metrics for NB with the array of tfidf and part-of-speech.
f1 = cross_val_score(clf, X_train_ar, y_train, cv=10, scoring='f1_weighted')
print('F-Measure score: ', np.mean(f1), f1)
accuracy = cross_val_score(clf, X_train_ar, y_train, cv=10, scoring='accuracy')
print('Accuracy score: ', np.mean(accuracy),accuracy)

print("\nTest scores: \n")

#Predictions for test
X_test_ar = X_testT.toarray()
pred = clf.predict(X_test_ar)
f1 = f1_score(y_test, pred, average='weighted')
print('F-Measure score: ', f1)
accuracy = accuracy_score(y_test, pred)
print('Accuracy score: ', accuracy)

F-Measure score:  0.6766505294683794 [0.71329213 0.65889364 0.64627013 0.68914055 0.65861577 0.69048636
 0.69514376 0.65616048 0.68342972 0.67507275]
Accuracy score:  0.6663246160765919 [0.70886076 0.64810127 0.63544304 0.6835443  0.64303797 0.68101266
 0.68101266 0.64213198 0.6751269  0.66497462]

Test scores: 

F-Measure score:  0.5209608169326444
Accuracy score:  0.5208053691275167


### Random Forest - SVM

In [14]:
#Random Forest
clf = RandomForestClassifier()
clf.fit(X_trainT, y_train)

#Metrics for Random Forest with the array of tfidf and part-of-speech.
f1 = cross_val_score(clf, X_trainT, y_train, cv=10, scoring='f1_weighted')
print('F-Measure score: ', np.mean(f1), f1)
accuracy = cross_val_score(clf,X_trainT,y_train, cv=10, scoring='accuracy')
print('Accuracy score: ', np.mean(accuracy),accuracy)

print("\nTest scores: \n")

#Predictions for test
X_testT1 = X_testT.toarray()
pred = clf.predict(X_testT1)
f1 = f1_score(y_test, pred, average='weighted')
print('F-Measure score: ', f1)
accuracy = accuracy_score(y_test, pred)
print('Accuracy score: ', accuracy)

F-Measure score:  0.75904857946428 [0.80372968 0.76561148 0.74971495 0.74136066 0.76985394 0.7578945
 0.7476717  0.74911576 0.75315638 0.75237676]
Accuracy score:  0.8031420677247316 [0.82025316 0.80253165 0.79493671 0.78987342 0.81265823 0.8
 0.8        0.79187817 0.8071066  0.81218274]

Test scores: 

F-Measure score:  0.5830324066498046
Accuracy score:  0.6331096196868009


In [15]:
#SVM
clf_svm = SVC()
clf_svm.fit(X_trainT, y_train)

#Metrics for SVM with the array of tfidf and part-of-speech.
f1 = cross_val_score(clf_svm, X_trainT, y_train, cv=10, scoring='f1_weighted')
print('F-Measure score: ', np.mean(f1), f1)
accuracy = cross_val_score(clf_svm, X_trainT, y_train, cv=10, scoring='accuracy')
print('Accuracy score: ', np.mean(accuracy),accuracy)

print("\nTest scores: \n")

#Predictions for test
X_testT2 = X_testT.toarray()
pred = clf.predict(X_testT2)
f1 = f1_score(y_test, pred, average='weighted')
print('F-Measure score: ', f1)
accuracy = accuracy_score(y_test, pred)
print('Accuracy score: ', accuracy)

F-Measure score:  0.8138965716149471 [0.83655774 0.81665806 0.80660151 0.82909318 0.81455654 0.8100312
 0.80885058 0.80590532 0.80620951 0.80450208]
Accuracy score:  0.8335398059500096 [0.85063291 0.83291139 0.8278481  0.84556962 0.83544304 0.83037975
 0.83037975 0.82994924 0.8248731  0.82741117]

Test scores: 

F-Measure score:  0.5830324066498046
Accuracy score:  0.6331096196868009
