In [1]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter 
from matplotlib.colors import ListedColormap
import string
import re
from scipy.stats import hmean
from scipy.stats import norm
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline
# NOTE(review): machine-specific absolute Windows path; it has to be edited on every other machine.
pth = r"C:\EPFL\2018-2019\nltk_data" #change location according to your nltk data path
# Register the folder as an additional location where nltk looks for its data files.
nltk.data.path.append(pth)
# Switch matplotlib to seaborn's default theme for all later figures.
sns.set()

In [2]:
import sklearn
#from sklearn.model_selection.cross_validation import train_test_split
from sklearn import metrics, model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings("ignore")

In [3]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import accuracy_score

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

**Description of a pipeline: it is used to apply a sequence of transforms. I had an issue with cvec: I needed to apply it twice (once on x_train and once on x_test), and it gave bad results...**

https://medium.com/@chrisfotache/text-classification-in-python-pipelines-nlp-nltk-tf-idf-xgboost-and-more-b83451a327e0
https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

https://medium.com/@chrisfotache/text-classification-in-python-pipelines-nlp-nltk-tf-idf-xgboost-and-more-b83451a327e0
https://stackoverflow.com/questions/43366561/use-sklearns-gridsearchcv-with-a-pipeline-preprocessing-just-once

https://www.kaggle.com/cesartrevisan/scikit-learn-and-gridsearchcv

# Multi Classifier Comparison

**The best results we got so far were with tf-idf (even though the difference with CountVectorizer was not that significant), using 100 000 features and both unigrams and bigrams. We will now compare other models using the same pipeline. Because of the expensive computation time, we compare these models with a maximum of 10 000 features only, using a grid search to get the best estimator each time.**

In [5]:
from time import time
import multiprocessing
import cython

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC #support vector machine SVM
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.feature_selection import SelectFromModel

In [7]:
# Load the tweet DataFrame from the local pickle file 'clean_tweets'.
# NOTE(review): unpickling can execute arbitrary code — only load files you produced yourself.
data = pd.read_pickle('clean_tweets')
# Raw numpy arrays of the tweet texts and of their sentiment labels.
X = data.text.values
y = data.sentiment.values
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Display names; classifier_comparator uses them to pick each model's parameter grid.
names = ["Logistic Regression", "Linear SVC", "Random Forest", "Ridge Classifier"]

classifiers = [
    # The grid searched for this model includes penalty='l1', which the current default
    # solver ('lbfgs') does not support; 'liblinear' handles both 'l1' and 'l2'.
    LogisticRegression(solver='liblinear'),
    LinearSVC(),
    # Fixed seed so the forest (and therefore its accuracy) is reproducible.
    RandomForestClassifier(random_state=42),
    RidgeClassifier(),
]
# Materialise the pairs: a bare zip() is a one-shot iterator, so a second call to
# classifier_comparator would otherwise silently see no classifiers at all.
zipped_clf = list(zip(names, classifiers))

In [11]:
# Default TF-IDF vectorizer; classifier_comparator configures it (stop words, max features,
# n-gram range) through set_params before fitting.
tvec = TfidfVectorizer()

In [17]:
def classifier_comparator(vectorizer, ngram_range, classifier_list, n_features=10000, stop_words=None):
    """Grid-search several classifiers on top of one vectorizer and score them.

    For every (name, classifier) pair a pipeline ``vectorizer -> GridSearchCV(classifier)``
    is fitted on the notebook-level ``X_train``/``y_train`` and evaluated on
    ``X_test``/``y_test``.

    Parameters
    ----------
    vectorizer : text vectorizer exposing ``set_params`` (e.g. TfidfVectorizer).
        It is reconfigured in place and refitted for every classifier.
    ngram_range : pair (min_n, max_n)
        Any two-element iterable is accepted and converted to a tuple.
    classifier_list : iterable of (name, estimator)
        ``name`` must be one of "Logistic Regression", "Random Forest",
        "Ridge Classifier" or "Linear SVC"; it selects the parameter grid.
    n_features : int, default 10000
        ``max_features`` of the vectorizer.
    stop_words : default None
        ``stop_words`` of the vectorizer.

    Returns
    -------
    list of [name, accuracy, seconds] — one entry per classifier, in input order.

    Raises
    ------
    ValueError
        If a classifier name has no parameter grid.
    """
    # One parameter grid per supported classifier name.
    param_grids = {
        "Logistic Regression": dict(C=np.logspace(0, 4, 10), penalty=['l1', 'l2']),
        "Random Forest": {"max_depth": [3, None]},
        "Ridge Classifier": dict(alpha=np.array([0.01, 0.001, 0.0001])),
        "Linear SVC": {'C': [0.01, 0.1, 1]},
    }

    # Configure the shared vectorizer once; nothing changes between iterations.
    vectorizer.set_params(stop_words=stop_words, max_features=n_features,
                          ngram_range=tuple(ngram_range))

    result = []
    for n, c in classifier_list:
        #----------- SEARCH GRID -----------
        print("-----------------------------------------------------------------")
        print("Choosen classifier : {}".format(n))
        if n not in param_grids:
            # Fail loudly instead of reusing the previous classifier's grid.
            raise ValueError("No parameter grid defined for classifier {!r}".format(n))
        t0 = time()

        # The grid search is the last pipeline step: the vectorizer is fitted once on the
        # training data and the 5-fold search runs on the vectorized features.
        clf = GridSearchCV(c, param_grids[n], cv=5, verbose=0)
        best_estimator = Pipeline([('vectorizer', vectorizer), ('classifier', clf)])

        #---------- predict with best model ------------
        best_estimator.fit(X_train, y_train)
        y_pred = best_estimator.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        tt_time = time() - t0
        result.append([n, accuracy, tt_time])
        print("Accuracy score is {} :".format(accuracy))
        print("Train and test time took {}".format(tt_time))
    return result

The cythonmagic extension is already loaded. To reload it, use:
  %reload_ext cythonmagic


In [18]:
%%time
# Compare all classifiers on unigrams + bigrams with at most 10 000 TF-IDF features.
# ngram_range is passed as the (min_n, max_n) tuple the vectorizer expects.
all_results = classifier_comparator(tvec, (1, 2), zipped_clf, n_features=10000)

-----------------------------------------------------------------
Choosen classifier : Logistic Regression
Accuracy score is 0.817795856107792 :
Train and test time took 605.3928697109222
-----------------------------------------------------------------
Choosen classifier : Linear SVC
Accuracy score is 0.8174145163340536 :
Train and test time took 40.70728349685669
-----------------------------------------------------------------
Choosen classifier : Random Forest
Accuracy score is 0.7755688318291598 :
Train and test time took 321.2097415924072
-----------------------------------------------------------------
Choosen classifier : Ridge Classifier
Accuracy score is 0.8126859031396975 :
Train and test time took 461.7082438468933
Wall time: 23min 49s


In [25]:
%%time
# Candidate regularisation strengths for the linear SVM.
Cs = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
hyperparameters = dict(C=Cs)

# 5-fold grid search over C, fed by TF-IDF features (up to 100k, 1- to 3-grams).
c = GridSearchCV(estimator=LinearSVC(), param_grid=hyperparameters, cv=5, verbose=0)
vec = TfidfVectorizer(max_features=100000, ngram_range=(1, 3))
best_estimator = Pipeline(steps=[('vectorizer', vec), ('classifier', c)])

# Fit on the training split, then score on the held-out split.
classifier_fit = best_estimator.fit(X_train, y_train)
y_pred = classifier_fit.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.8289055548493708
Wall time: 3min 49s


https://stats.stackexchange.com/questions/17711/why-does-ridge-regression-classifier-work-quite-well-for-text-classification

In [7]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

In [12]:
%%time

def _make_svc_pipeline(ngram_range, Cs, max_features=100000, cv=5):
    """Build a TF-IDF -> GridSearchCV(LinearSVC) pipeline for one n-gram range.

    ngram_range  : (min_n, max_n) tuple given to the TfidfVectorizer.
    Cs           : candidate values of LinearSVC's C explored by the grid search.
    max_features : vocabulary size cap of the vectorizer.
    cv           : number of cross-validation folds of the grid search.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    search = GridSearchCV(LinearSVC(), {'C': list(Cs)}, cv=cv, verbose=0)
    return Pipeline([('vectorizer', vectorizer), ('classifier', search)])


# Same search space for the three members; they differ only in the n-gram range.
Cs = [0.0001, 0.001, 0.01, 0.1, 1, 10]
hyperparameters = {'C': Cs}

#------------ SVC: unigrams ------------
best_estimator = _make_svc_pipeline((1, 1), Cs)
#------------ SVC2: 1- to 3-grams ------------
best_estimator2 = _make_svc_pipeline((1, 3), Cs)
#------------ SVC3: 1- to 4-grams ------------
best_estimator3 = _make_svc_pipeline((1, 4), Cs)

#----------- Voting classifier ------------
# Hard voting: the predicted class is the majority vote of the three pipelines.
eclf = VotingClassifier(
    estimators=[('svc1', best_estimator), ('svc2', best_estimator2), ('svc3', best_estimator3)],
    voting='hard',
)

classifier_fit = eclf.fit(X_train, y_train)
y_pred = classifier_fit.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.8289564001525359
Wall time: 5min 10s


# Time to test on Kaggle

**Load train and test data**

In [49]:
data_folder = 'twitter-datasets'


def _read_tweets(path, sentiment):
    """Read one tweet per line from `path` and label every row with `sentiment`.

    The file is read line by line rather than through a CSV parser: recent pandas
    versions reject sep='\\n', and CSV quote handling could merge several tweets when a
    line contains a double quote. Blank lines are skipped.
    """
    with open(path, encoding='utf-8') as f:
        texts = [line.rstrip('\n') for line in f if line.strip()]
    tweets = pd.DataFrame(texts, columns=['text'])
    tweets['sentiment'] = sentiment
    return tweets


train_pos = _read_tweets(data_folder + '/train_pos_full.txt', 'positive')
train_neg = _read_tweets(data_folder + '/train_neg_full.txt', 'negative')
train_data = pd.concat([train_neg, train_pos]) #merge positive and negative tweets
# Shuffle with a fixed seed so that re-running the notebook gives the same row order.
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [50]:
# Every test line looks like "<id>,<tweet>"; split on the first comma only so that
# commas inside the tweet stay part of the text.
with open(data_folder + '/test_data.txt') as f:
    lines = [line.rstrip('\n').split(',', 1) for line in f]
test_data = pd.DataFrame(data=lines, columns=['id', 'text'])

**Data cleaning**

In [51]:
# Patterns stripped from every tweet, applied one after the other in this order.
# 'rt' is anchored on word boundaries so that only the retweet marker is removed,
# not the letters "rt" inside words such as "party" or "heart".
pat = [r'<user>', r'<url>', r'#', r'[0-9]', r'\.', r'\,', r'\-', r'\(', r'\)', r'\/',
       r'\brt\b', r' \: ', r' \~ ', r' \* ', r'<', r'>', r' \" ', r' \' ']
# Compile once. Patterns that match a standalone symbol together with its surrounding
# spaces are replaced by a single space so the neighbouring words are not glued together.
_compiled_pat = [(re.compile(p), ' ' if p.startswith(' ') else '') for p in pat]

# Contracted negations and their expanded form.
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def tweet_cleaner(text) :
    """Return a cleaned, lower-cased version of one tweet.

    Steps: strip the patterns listed in `pat` (user/url placeholders, digits, some
    punctuation, the retweet marker), lower-case, expand contracted negations
    ("can't" -> "can not"), then collapse all whitespace runs to single spaces.

    text : str, the raw tweet.
    Returns the cleaned tweet as a str (possibly empty).
    """
    cleaned = text
    for regex, replacement in _compiled_pat:
        cleaned = regex.sub(replacement, cleaned)

    cleaned = cleaned.lower()
    neg_handled = neg_pattern.sub(lambda m: negations_dic[m.group()], cleaned)
    # split()/join normalises whitespace and trims both ends.
    return " ".join(neg_handled.split())

In [52]:
#train_data = pd.DataFrame(train_data.apply(lambda row : pd.Series([tweet_cleaner(row[0]), row[1]]), axis=1)) #clean each tweet

In [53]:
train_data.head()

Unnamed: 0,text,sentiment
0,kind of sad to think that .. after this final ...,negative
1,<user> thank you ! x luv lots,positive
2,<user> you can have mine lol,positive
3,"rt <user> i just want you to know , that "" i l...",positive
4,pong 360 official portable beer pong table - 8...,negative


In [54]:
# Clean every training tweet. Iterating over the column itself (instead of looking rows
# up by integer label) does not depend on the DataFrame having a 0..n-1 index.
clean_tweet_texts = [tweet_cleaner(text) for text in train_data['text']]
clean_df = pd.DataFrame(clean_tweet_texts, columns=['text'])
# .values copies the labels positionally, so no index alignment can introduce NaNs.
clean_df['sentiment'] = train_data['sentiment'].values
train_data = clean_df

In [55]:
# Clean every test tweet. Iterating over the column itself (instead of looking rows up
# by integer label) does not depend on the DataFrame having a 0..n-1 index.
clean_tweet_texts = [tweet_cleaner(text) for text in test_data['text']]
clean_ = pd.DataFrame(clean_tweet_texts, columns=['text'])
# .values copies the ids positionally, so no index alignment can introduce NaNs.
clean_['id'] = test_data['id'].values
test_data = clean_

**Train our model: we use tf-idf with 100k features, without removing stopwords. Note that the cell below fits a logistic regression tuned by randomized search, not an SVC.**

In [56]:
#mapping -1 to negative sentiment and +1 to positive sentiment
train_data.columns=['text', 'sentiment']
test_data.columns=['text', 'id']
# Only map while the labels are still text: re-running the cell on already numeric
# labels would otherwise turn every -1 into +1 (nothing equals 'negative' any more).
if not pd.api.types.is_numeric_dtype(train_data['sentiment']):
    train_data['sentiment'] = np.where(train_data['sentiment'] == 'negative', -1, 1)

In [57]:
# NOTE: these assignments overwrite the X_train / y_train / X_test produced by the earlier
# train_test_split; from here on they hold the full training set and the unlabeled test set.
X_train = train_data.text.values
y_train = train_data.sentiment.values
X_test = test_data.text.values

In [58]:
# Search space: C on a log scale from 1 to 1e4, with both L1 and L2 penalties.
# (A LinearSVC with a grid / randomized search over C was tried as an alternative.)
hyperparameters = dict(C=np.logspace(0, 4, 10), penalty=['l1', 'l2'])
c = RandomizedSearchCV(
    # 'liblinear' supports both penalties; the current default solver ('lbfgs') rejects 'l1'.
    LogisticRegression(solver='liblinear'),
    hyperparameters,
    cv=3,
    verbose=0,
    n_jobs=2,
    # Fixed seed so the same candidates are sampled on every run.
    random_state=42,
)
vec = TfidfVectorizer()
vec.set_params(max_features=100000, ngram_range=(1, 3))
best_estimator = Pipeline([('vectorizer', vec),('classifier', c)])
# Fit on the full training set and predict the labels of the test tweets.
classifier_fit = best_estimator.fit(X_train, y_train)
y_pred = classifier_fit.predict(X_test)

In [59]:
ids = test_data.id.values

In [61]:
import csv

In [62]:
# Write the submission file: a header row, then one "Id,Prediction" row per test tweet.
fieldnames = ['Id', 'Prediction']
with open('submit2.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=",")
    writer.writerow(fieldnames)
    # Both columns are cast to plain ints before being written.
    writer.writerows((int(tweet_id), int(label)) for tweet_id, label in zip(ids, y_pred))