In [2]:
import re
import os 
import sys
from pathlib import Path 
import pickle 

import numpy as np

import string
from string import punctuation
from collections import Counter

from preprocess import * 

import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression


In [3]:
data, labels = open_text('train-v2.tsv')


In [4]:
X_data, X_data_test, y_train, y_test = train_test_split(
                                        data, np.array(labels),
                                        test_size = 0.20, random_state = 11)

In [5]:
X_data_test, test_labels = open_text('train-v2.tsv')
y_test = np.array(test_labels)

In [6]:
def remove_punc(text):
    return text.translate(str.maketrans('', '', string.punctuation.replace("@","")))

In [7]:
def tokenize(text, delimiter = " "):
    return text.split(" ")

In [8]:
def remove_stopwords(word_list, stopwords_list  = ["@user","{url}"]):
    temp_word_list = []
    for w in word_list: 
        if not (bool(re.match("(^@+\d*\w*\d*)", w)) or 
                bool(re.match("(^\d+[\s]*\w*$)", w)) or 
                bool(re.match("(^\d*[\S]*\w*\d+$)", w))):
            temp_word_list.append(w)
    word_list = [w.lower() for w in temp_word_list if not w.lower() in stopwords_list] 
    return word_list

In [9]:
def process_corpora(corpora_list): 
    processed_text = []
    global_word_list = []
    for text in corpora_list: 
        text = remove_punc(text)
        word_list = tokenize(text)
        word_list = remove_stopwords(word_list)
        if "" in word_list:
            word_list.remove("")
        processed_text.append(word_list)
        global_word_list += word_list
    return processed_text, global_word_list

In [10]:
processed_text, global_word_list = process_corpora(X_data)
processed_text_test, _ = process_corpora(X_data_test)

In [11]:
len(global_word_list)

861244

In [12]:
len(set(global_word_list))

62689

In [13]:
global_word_list[:2]

['moor', 'browd']

In [14]:
def count_words(word_list): 
    return Counter(word_list)

In [15]:
count_dict = Counter(global_word_list)

In [16]:
unique_word_list_train = []
word_frequency_train = [] 
for k, v in count_dict.items():
    unique_word_list_train.append(k)
    word_frequency_train.append(v)
    
    

In [17]:
sorted_word_frequency, sorted_unique_word_list = zip(*sorted(zip(word_frequency_train, unique_word_list_train),reverse= False))

In [18]:
# list(np.array(sorted_word_frequency)<2)

cleaned_unique_word_list = [d for (d, remove) in zip(sorted_unique_word_list, list(np.array(sorted_word_frequency)<5)) if not remove]
cleaned_word_frequency = [d for (d, remove) in zip(sorted_word_frequency, list(np.array(sorted_word_frequency)<5)) if not remove]


In [19]:
print(len(cleaned_word_frequency))
print(len(cleaned_unique_word_list))

10519
10519


In [20]:
encoded_text = []
encoded_test_text = []
for i in range(len(processed_text)):
    temp_list = []
    for w in cleaned_unique_word_list: 
        temp_list.append(processed_text[i].count(w))
    encoded_text.append(temp_list)
    
for i in range(len(processed_text_test)):
    temp_list = []
    for w in cleaned_unique_word_list: 
        temp_list.append(processed_text_test[i].count(w))
    encoded_test_text.append(temp_list)   

In [21]:
# with open('bow_vector_20.pkl', 'wb') as f:
#         pickle.dump(encoded_text, f)

In [22]:
encoded_array = np.array(encoded_text)
doc_appearance = np.array(cleaned_word_frequency)

encoded_test_array = np.array(encoded_test_text)

In [23]:
# with open('bow_vector_array_20.pkl', 'wb') as f:
#         pickle.dump(encoded_array, f)

In [24]:
print(encoded_array.shape)
print(encoded_test_array.shape)

(64000, 10519)
(80000, 10519)


In [25]:
tfidf = np.multiply(encoded_array, np.log(encoded_array.shape[0]/doc_appearance))
tfidf_test = np.multiply(encoded_test_array, np.log(encoded_array.shape[0]/doc_appearance))

In [26]:
tfidf.shape

(64000, 10519)

In [27]:
# with open('tfidf_vector_20.pkl', 'wb') as f:
#         pickle.dump(tfidf, f)

## Training models, using BoW

In [28]:
gnb = BernoulliNB() # GaussianNB()
y_pred = gnb.fit(encoded_array, y_train).predict(encoded_test_array)
print("Number of mislabeled points out of a total %d points : %d"
          % (encoded_test_array.shape[0], (y_test != y_pred).sum()))
print("The accuracy is : %.2f"
          % ((((y_test == y_pred).sum())/encoded_test_array.shape[0])*100))

Number of mislabeled points out of a total 80000 points : 18618
The accuracy is : 76.73


In [29]:
gnb = GaussianNB()
y_pred = gnb.fit(encoded_array, y_train).predict(encoded_test_array)
print("Number of mislabeled points out of a total %d points : %d"
          % (encoded_test_array.shape[0], (y_test != y_pred).sum()))
print("The accuracy is : %.2f"
          % ((((y_test == y_pred).sum())/encoded_test_array.shape[0])*100))

Number of mislabeled points out of a total 80000 points : 23068
The accuracy is : 71.17


In [30]:
lr = LogisticRegression(max_iter=1000)
y_pred = lr.fit(encoded_array, y_train).predict(encoded_test_array)
print("Number of mislabeled points out of a total %d points : %d"
          % (encoded_test_array.shape[0], (y_test != y_pred).sum()))
print("The accuracy is : %.2f"
          % ((((y_test == y_pred).sum())/encoded_test_array.shape[0])*100))

Number of mislabeled points out of a total 80000 points : 16037
The accuracy is : 79.95


## Training models, using TF-IDF

In [31]:
gnb = BernoulliNB() # GaussianNB()
y_pred = gnb.fit(tfidf, y_train).predict(tfidf_test)
print("Number of mislabeled points out of a total %d points : %d"
          % (tfidf_test.shape[0], (y_test != y_pred).sum()))
print("The accuracy is : %.2f"
          % ((((y_test == y_pred).sum())/tfidf_test.shape[0])*100))

Number of mislabeled points out of a total 80000 points : 18618
The accuracy is : 76.73


In [32]:
gnb = GaussianNB()
y_pred = gnb.fit(tfidf, y_train).predict(tfidf_test)
print("Number of mislabeled points out of a total %d points : %d"
          % (tfidf_test.shape[0], (y_test != y_pred).sum()))
print("The accuracy is : %.2f"
          % ((((y_test == y_pred).sum())/tfidf_test.shape[0])*100))

Number of mislabeled points out of a total 80000 points : 23501
The accuracy is : 70.62


In [33]:
lr = LogisticRegression(max_iter=1000)
y_pred = lr.fit(tfidf, y_train).predict(tfidf_test)
print("Number of mislabeled points out of a total %d points : %d"
          % (tfidf_test.shape[0], (y_test != y_pred).sum()))
print("The accuracy is : %.2f"
          % ((((y_test == y_pred).sum())/tfidf_test.shape[0])*100))

Number of mislabeled points out of a total 80000 points : 15468
The accuracy is : 80.66


### Best Result: Logistic Regression with TF-IDF 
Accuracy is **% 80.66**

## Grid Search
Unfortunately the time limit didn't let us finish the grid search.

In [None]:
lr = LogisticRegression()
cv = RepeatedStratifiedKFold(n_splits=5, random_state=1) # n_repeats=1
params = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [1e-3, 1e-1, 1, 10, 100]
    }
grid = GridSearchCV(lr, params, scoring='accuracy', n_jobs=-1, cv=cv)
# execute search
result = grid.fit(tfidf, y_train)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

## 