In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string

import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\btben\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\btben\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
train = pd.read_csv("option1-data/train.csv")
test = pd.read_csv("option1-data/test.csv")

In [3]:
def repartition(dataset):
    unrelated = sum(dataset['label']=='unrelated')
    agreed = sum(dataset['label']=='agreed')
    disagreed = sum(dataset['label']=='disagreed')
    total = unrelated+agreed+disagreed
    print("Unrelated: ", unrelated, "-->", unrelated/total*100, "%")
    print("Agreed   : ", agreed, "-->", agreed/total*100, "%")
    print("Disagreed: ", disagreed, "-->", disagreed/total*100, "%")

In [4]:
repartition(train)

Unrelated:  175598 --> 68.47474282683804 %
Agreed   :  74238 --> 28.949236084572732 %
Disagreed:  6606 --> 2.5760210885892327 %


In [5]:
train

Unnamed: 0,id,tid1,tid2,title1_en,title2_en,label
0,195611,0,1,There are two new old-age insurance benefits f...,"Police disprove ""bird's nest congress each per...",unrelated
1,191474,2,3,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outstrips Hong Kong? Shenzhen S...,unrelated
2,25300,2,4,"""If you do not come to Shenzhen, sooner or lat...",The GDP overtopped Hong Kong? Shenzhen clarifi...,unrelated
3,123757,2,8,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP overtakes Hong Kong? Bureau of ...,unrelated
4,141761,2,11,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outpaces Hong Kong? Defending R...,unrelated
...,...,...,...,...,...,...
256437,113364,167562,48447,egypt 's presidential election failed to win m...,Salah is retiring? Football Association offici...,unrelated
256438,49407,167562,49795,egypt 's presidential election failed to win m...,Liverpool's bid for Little Germany? The Echo's...,unrelated
256439,130134,167562,114783,egypt 's presidential election failed to win m...,West Media Exposing Tallahlach has been recomm...,unrelated
256440,101494,167562,137705,egypt 's presidential election failed to win m...,Rumor has it that Egypt is very united and the...,unrelated


In [6]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = remove_punctuation(text)
    return text

def remove_punctuation(text: str) -> str:
    for p in string.punctuation:
        text = text.replace(p, '')
    return text

In [7]:
porter=PorterStemmer()

def stemSentence(sentence):
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

In [8]:
train['title1_en'] = train['title1_en'].apply(lambda x: preprocess_text(x))
train['title2_en'] = train['title2_en'].apply(lambda x: preprocess_text(x))

In [9]:
train

Unnamed: 0,id,tid1,tid2,title1_en,title2_en,label
0,195611,0,1,there are two new oldage insurance benefits fo...,police disprove birds nest congress each perso...,unrelated
1,191474,2,3,if you do not come to shenzhen sooner or later...,shenzhens gdp outstrips hong kong shenzhen sta...,unrelated
2,25300,2,4,if you do not come to shenzhen sooner or later...,the gdp overtopped hong kong shenzhen clarifie...,unrelated
3,123757,2,8,if you do not come to shenzhen sooner or later...,shenzhens gdp overtakes hong kong bureau of st...,unrelated
4,141761,2,11,if you do not come to shenzhen sooner or later...,shenzhens gdp outpaces hong kong defending rum...,unrelated
...,...,...,...,...,...,...
256437,113364,167562,48447,egypt s presidential election failed to win mi...,salah is retiring football association officia...,unrelated
256438,49407,167562,49795,egypt s presidential election failed to win mi...,liverpools bid for little germany the echos di...,unrelated
256439,130134,167562,114783,egypt s presidential election failed to win mi...,west media exposing tallahlach has been recomm...,unrelated
256440,101494,167562,137705,egypt s presidential election failed to win mi...,rumor has it that egypt is very united and the...,unrelated


In [10]:
train['title1_en'] = train['title1_en'].apply(lambda x: stemSentence(x))
train['title2_en'] = train['title2_en'].apply(lambda x: stemSentence(x))

In [11]:
train

Unnamed: 0,id,tid1,tid2,title1_en,title2_en,label
0,195611,0,1,there are two new oldag insur benefit for old ...,polic disprov bird nest congress each person g...,unrelated
1,191474,2,3,if you do not come to shenzhen sooner or later...,shenzhen gdp outstrip hong kong shenzhen stati...,unrelated
2,25300,2,4,if you do not come to shenzhen sooner or later...,the gdp overtop hong kong shenzhen clarifi a l...,unrelated
3,123757,2,8,if you do not come to shenzhen sooner or later...,shenzhen gdp overtak hong kong bureau of stati...,unrelated
4,141761,2,11,if you do not come to shenzhen sooner or later...,shenzhen gdp outpac hong kong defend rumor the...,unrelated
...,...,...,...,...,...,...
256437,113364,167562,48447,egypt s presidenti elect fail to win million o...,salah is retir footbal associ offici are lie,unrelated
256438,49407,167562,49795,egypt s presidenti elect fail to win million o...,liverpool bid for littl germani the echo discl...,unrelated
256439,130134,167562,114783,egypt s presidenti elect fail to win million o...,west media expos tallahlach ha been recommend ...,unrelated
256440,101494,167562,137705,egypt s presidenti elect fail to win million o...,rumor ha it that egypt is veri unit and there ...,unrelated


In [12]:
def removeStopWords(sentence):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(sentence)
    filtered_sentence=[]
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    return " ".join(filtered_sentence)

In [13]:
train['title1_en'] = train['title1_en'].apply(lambda x: removeStopWords(x))
train['title2_en'] = train['title2_en'].apply(lambda x: removeStopWords(x))

In [14]:
vectorizer =  TfidfVectorizer()
train_tf_idf_features1 =  vectorizer.fit_transform(train.loc[0:10000, 'title1_en']).toarray()
train_tf_idf_features2 =  vectorizer.fit_transform(train.loc[0:10000, 'title2_en']).toarray()

In [15]:
a =  np.hstack((train_tf_idf_features1, train_tf_idf_features2)) 
a

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
train_tf_idf = pd.DataFrame( np.hstack((train_tf_idf_features1, train_tf_idf_features2)) )
train_tf_idf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10473,10474,10475,10476,10477,10478,10479,10480,10481,10482
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
x_train_full = train.loc[:, train.columns != 'label']
y_train_full = train.loc[0:10000, 'label']

In [18]:
x_train, x_test, y_train, y_test = train_test_split(train_tf_idf, y_train_full, test_size=0.2, random_state = 42)

In [19]:
repartition(train.loc[0:10000, :])

Unrelated:  6592 --> 65.91340865913409 %
Agreed   :  3114 --> 31.136886311368862 %
Disagreed:  295 --> 2.9497050294970504 %


In [None]:
clf_model = RandomForestClassifier(n_estimators = 1000, min_samples_split = 15, random_state = 42, verbose=2)
clf_model.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 1000


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s


building tree 2 of 1000
building tree 3 of 1000
building tree 4 of 1000
building tree 5 of 1000
building tree 6 of 1000
building tree 7 of 1000
building tree 8 of 1000
building tree 9 of 1000
building tree 10 of 1000
building tree 11 of 1000
building tree 12 of 1000
building tree 13 of 1000
building tree 14 of 1000
building tree 15 of 1000
building tree 16 of 1000
building tree 17 of 1000
building tree 18 of 1000
building tree 19 of 1000
building tree 20 of 1000
building tree 21 of 1000
building tree 22 of 1000
building tree 23 of 1000
building tree 24 of 1000
building tree 25 of 1000
building tree 26 of 1000
building tree 27 of 1000
building tree 28 of 1000
building tree 29 of 1000
building tree 30 of 1000
building tree 31 of 1000
building tree 32 of 1000
building tree 33 of 1000
building tree 34 of 1000
building tree 35 of 1000
building tree 36 of 1000
building tree 37 of 1000
building tree 38 of 1000
building tree 39 of 1000
building tree 40 of 1000
building tree 41 of 1000
building

building tree 322 of 1000
building tree 323 of 1000
building tree 324 of 1000
building tree 325 of 1000
building tree 326 of 1000
building tree 327 of 1000
building tree 328 of 1000
building tree 329 of 1000
building tree 330 of 1000
building tree 331 of 1000
building tree 332 of 1000
building tree 333 of 1000
building tree 334 of 1000
building tree 335 of 1000
building tree 336 of 1000
building tree 337 of 1000
building tree 338 of 1000
building tree 339 of 1000
building tree 340 of 1000
building tree 341 of 1000
building tree 342 of 1000
building tree 343 of 1000
building tree 344 of 1000
building tree 345 of 1000
building tree 346 of 1000
building tree 347 of 1000
building tree 348 of 1000
building tree 349 of 1000
building tree 350 of 1000
building tree 351 of 1000
building tree 352 of 1000
building tree 353 of 1000
building tree 354 of 1000
building tree 355 of 1000
building tree 356 of 1000
building tree 357 of 1000
building tree 358 of 1000
building tree 359 of 1000
building tre

building tree 638 of 1000
building tree 639 of 1000
building tree 640 of 1000
building tree 641 of 1000
building tree 642 of 1000
building tree 643 of 1000
building tree 644 of 1000
building tree 645 of 1000
building tree 646 of 1000
building tree 647 of 1000
building tree 648 of 1000
building tree 649 of 1000
building tree 650 of 1000
building tree 651 of 1000


In [None]:
pred = clf_model.predict(x_test)
print("Accuracy => ", round(accuracy_score(pred, y_test)*100, 2))
print(classification_report(y_test, pred))

In [None]:
#define labels for visualizing confusion matrix
labels = ['agreed', 'disagreed', 'unrelated']

In [None]:
cm = confusion_matrix(y_test, pred)
sns.heatmap(cm, annot=True,fmt='d', xticklabels=labels, yticklabels=labels)

Save the CSV so we don't have to continously pre-process, stem, and remove stop words on entire dataset

In [None]:
csv_file = "pre-processed_data.csv"
train.to_csv(csv_file)

# Balanced Dataset

In [None]:
repartition(train)

In [None]:
train_unrelated = train[train['label'] == 'unrelated']
train_agreed = train[train['label'] == 'agreed']
train_disagreed = train[train['label'] == 'disagreed']

train_balanced = pd.concat([train_unrelated[0:6000], train_agreed[0:6000], train_disagreed[0:6000]])
train_balanced.sample(frac=1)

In [None]:
vectorizer =  TfidfVectorizer()
train_tf_idf_features1 =  vectorizer.fit_transform(train_balanced['title1_en']).toarray()
train_tf_idf_features2 =  vectorizer.fit_transform(train_balanced['title2_en']).toarray()

In [None]:
repartition(train_balanced)

In [None]:
train_tf_idf = pd.DataFrame( np.hstack((train_tf_idf_features1, train_tf_idf_features2)) )
y_train_balanced = train_balanced['label']
x_train, x_test, y_train, y_test = train_test_split(train_tf_idf, y_train_balanced, test_size=0.2, random_state = 42)

In [None]:
clf_model = RandomForestClassifier(n_estimators = 1000, min_samples_split = 15, random_state = 42, verbose=2)
clf_model.fit(x_train, y_train)

In [None]:
pred = clf_model.predict(x_test)
print("Accuracy => ", round(accuracy_score(pred, y_test)*100, 2))
print(classification_report(y_test, pred))

In [None]:
cm = confusion_matrix(y_test, pred)
sns.heatmap(cm, annot=True,fmt='d', xticklabels=labels, yticklabels=labels)