In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pre_processing import get_pre_process_data
from pre_processing import get_pre_process_data_test
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import RidgeClassifier, LinearRegression
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score 
from time import time
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import NearestCentroid
from sklearn.feature_selection import SelectFromModel

In [64]:
def _read_lines(path):
    """Return the lines of the text file at ``path`` without their trailing newline."""
    # The context manager closes the handle as soon as the file has been read
    # instead of leaving an open descriptor behind for the garbage collector.
    with open(path) as handle:
        return [line.rstrip('\n') for line in handle]


def to_dataset(full=False, run_processing=False, resources_dir='../Resources', random_state=None):
    """Build the labelled training frame and the unlabelled test frame.

    Parameters
    ----------
    full : bool, default False
        When reading from disk, selects ``preprocessing_pos_full.txt`` /
        ``preprocessing_neg_full.txt`` instead of ``preprocessing_pos1.txt`` /
        ``preprocessing_neg1.txt``. The test file is always
        ``preprocessing_test1.txt``.
    run_processing : bool, default False
        When True the tweets come from ``desired_preprocessing`` instead of
        the files on disk.
    resources_dir : str, default '../Resources'
        Directory holding the pre-processed text files (one tweet per line).
        Only used when ``run_processing`` is False.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` for the shuffle of the training
        rows. ``None`` keeps the shuffle unseeded.

    Returns
    -------
    (data_train, data_test) : tuple of pandas.DataFrame
        ``data_train`` has the columns ``tweet`` and ``sentiment`` (1.0 for
        tweets from the positive source, 0.0 for the negative one), shuffled
        and re-indexed from 0. ``data_test`` has the single column ``tweet``
        in source order.
    """
    if run_processing:
        # NOTE(review): `desired_preprocessing` is neither defined nor imported
        # in the visible cells -- confirm it exists before using this branch.
        tweet_pos, tweet_neg, tweet_test = desired_preprocessing(full,'preprocessing_pos.txt','preprocessing_neg.txt','preprocessing_test.txt')
    else:
        suffix = '_full' if full else '1'
        path_pos = '{}/preprocessing_pos{}.txt'.format(resources_dir, suffix)
        path_neg = '{}/preprocessing_neg{}.txt'.format(resources_dir, suffix)
        path_test = '{}/preprocessing_test1.txt'.format(resources_dir)

        tweet_pos = _read_lines(path_pos)
        tweet_neg = _read_lines(path_neg)
        tweet_test = _read_lines(path_test)

    data_test = pd.DataFrame({"tweet": tweet_test})
    data_pos = pd.DataFrame({"tweet": tweet_pos, "sentiment": np.ones(len(tweet_pos))})
    data_neg = pd.DataFrame({"tweet": tweet_neg, "sentiment": np.zeros(len(tweet_neg))})

    # Stack positives on top of negatives with a fresh 0..n-1 index, then
    # shuffle so the two classes are interleaved.
    data_train = pd.concat([data_pos, data_neg], axis=0, ignore_index=True)
    data_train = data_train.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return data_train, data_test

In [65]:
def split_train(X, y, test_size=0.2, random_state=42):
    """Split features and labels into a training part and a validation part.

    Parameters
    ----------
    X, y
        Features and labels, forwarded unchanged to ``train_test_split``.
    test_size : default 0.2
        Forwarded to ``train_test_split`` as ``test_size``; the default keeps
        the previously hard-coded value.
    random_state : default 42
        Forwarded to ``train_test_split`` as ``random_state``; the default
        keeps the previously hard-coded seed.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val)``.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_val, y_train, y_val

## Visualize cross-validation results

In [140]:
def list_ML_algo():
    """Pair every benchmark classifier with its display name.

    Returns
    -------
    zip
        Iterator of ``(name, estimator)`` tuples. New estimator objects are
        created on each call; the returned ``zip`` can be consumed only once.
    """
    # LinearSVC with an L1 penalty drives SelectFromModel; the features it
    # keeps are then classified by an L2-penalised LinearSVC.
    l1_selected_svc = Pipeline([
        ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
        ('classification', LinearSVC(penalty="l2")),
    ])

    labelled = [
        ("Logistic Regression", LogisticRegression(max_iter=10000)),
        ("Linear SVC", LinearSVC()),
        ("LinearSVC with L1-based feature selection", l1_selected_svc),
        ("Multinomial NB", MultinomialNB()),
        ("Bernoulli NB", BernoulliNB()),
        ("Ridge Classifier", RidgeClassifier()),
        ("AdaBoost", AdaBoostClassifier()),
        ("Perceptron", Perceptron()),
        ("Passive-Aggresive", PassiveAggressiveClassifier()),
        ("Nearest Centroid", NearestCentroid()),
    ]

    names = [label for label, _ in labelled]
    classifiers = [estimator for _, estimator in labelled]
    return zip(names, classifiers)

In [141]:
def acc_summary(pipeline, x_train, y_train, x_test, y_test):
    """Fit ``pipeline``, score it on the held-out data and print a short report.

    The pipeline is fitted on ``(x_train, y_train)`` and the object returned
    by ``fit`` predicts ``x_test``. The wall-clock time covers fitting and
    prediction together. Accuracy (as a percentage), the elapsed time and an
    80-character separator line are printed.

    Returns
    -------
    tuple
        ``(accuracy, train_test_time)``: the ``accuracy_score`` of the
        predictions against ``y_test`` and the elapsed seconds.
    """
    started = time()
    predictions = pipeline.fit(x_train, y_train).predict(x_test)
    elapsed = time() - started

    score = accuracy_score(y_test, predictions)

    report = (
        "accuracy score: {0:.2f}%".format(score*100),
        "train and test time: {0:.2f}s".format(elapsed),
        "-"*80,
    )
    for line in report:
        print(line)

    return score, elapsed

## On small dataset

In [142]:
def classifier_comparator(X_train, y_train, X_val, y_val, n_features=100000, stop_words=None, ngram_range=(1, 1)):
    """Benchmark every classifier from ``list_ML_algo`` on TF-IDF features.

    One ``TfidfVectorizer`` is configured with ``stop_words``,
    ``max_features=n_features`` and ``ngram_range``; that same instance is the
    first step of each per-classifier pipeline. For every classifier the name
    and the estimator are printed, then ``acc_summary`` fits the pipeline on
    the training data and evaluates it on the validation data.

    Returns
    -------
    list of tuple
        One ``(name, accuracy, train_test_time)`` entry per classifier, in the
        order produced by ``list_ML_algo``.
    """
    tfidf = TfidfVectorizer(
        stop_words=stop_words,
        max_features=n_features,
        ngram_range=ngram_range,
    )

    summary = []
    for label, estimator in list_ML_algo():
        candidate = Pipeline([('vectorizer', tfidf), ('classifier', estimator)])
        print("Validation result for {}".format(label))
        print (estimator)
        accuracy, elapsed = acc_summary(candidate, X_train, y_train, X_val, y_val)
        summary.append((label, accuracy, elapsed))
    return summary

In [143]:
# Load the small pre-processed files from disk (no re-processing), then hold
# out 20% of the labelled tweets for validation.
data_train, data_test = to_dataset(full=False, run_processing=False)
X_train, X_val, y_train, y_val = split_train(
    data_train["tweet"],
    data_train["sentiment"],
)

In [144]:
# Unigrams only: TF-IDF vocabulary capped at 100,000 features.
unigram_result = classifier_comparator(
    X_train, y_train, X_val, y_val,
    n_features=100000,
    ngram_range=(1, 1),
)

Validation result for Logistic Regression
LogisticRegression(max_iter=10000)
accuracy score: 78.36%
train and test time: 4.56s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 77.23%
train and test time: 3.34s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 77.18%
train and test time: 4.58s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 74.84%
train and test time: 1.60s
--------------------------------------------------------------------------------
Validation res

In [145]:
# Unigrams + bigrams: TF-IDF vocabulary capped at 100,000 features.
bigram_result = classifier_comparator(
    X_train, y_train, X_val, y_val,
    n_features=100000,
    ngram_range=(1, 2),
)

Validation result for Logistic Regression
LogisticRegression(max_iter=10000)
accuracy score: 79.56%
train and test time: 9.51s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 78.35%
train and test time: 6.79s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 78.46%
train and test time: 9.97s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 77.51%
train and test time: 5.29s
--------------------------------------------------------------------------------
Validation res

In [146]:
# 1- to 3-grams: TF-IDF vocabulary capped at 100,000 features.
trigram_result = classifier_comparator(
    X_train, y_train, X_val, y_val,
    n_features=100000,
    ngram_range=(1, 3),
)

Validation result for Logistic Regression
LogisticRegression(max_iter=10000)
accuracy score: 79.58%
train and test time: 13.29s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 78.18%
train and test time: 9.81s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 78.52%
train and test time: 13.78s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 77.49%
train and test time: 8.67s
--------------------------------------------------------------------------------
Validation r

In [93]:
# 1- to 4-grams: TF-IDF vocabulary capped at 100,000 features.
fourgram_result = classifier_comparator(
    X_train, y_train, X_val, y_val,
    n_features=100000,
    ngram_range=(1, 4),
)

Validation result for Logistic Regression
LogisticRegression(max_iter=10000)
accuracy score: 79.55%
train and test time: 21.45s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 78.63%
train and test time: 15.60s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 78.59%
train and test time: 17.39s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 77.59%
train and test time: 17.94s
--------------------------------------------------------------------------------
Validation

In [95]:
# 1- to 5-grams: TF-IDF vocabulary capped at 100,000 features.
fivegram_result = classifier_comparator(
    X_train, y_train, X_val, y_val,
    n_features=100000,
    ngram_range=(1, 5),
)

Validation result for Logistic Regression
LogisticRegression(max_iter=10000)
accuracy score: 79.50%
train and test time: 25.53s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 78.53%
train and test time: 16.16s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 78.61%
train and test time: 20.61s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 77.45%
train and test time: 16.71s
--------------------------------------------------------------------------------
Validation

## Run the best n-gram setting on the full dataset

In [147]:
# Load the full-size pre-processed files from disk (no re-processing), then
# hold out 20% of the labelled tweets for validation.
data_train_full, data_test = to_dataset(full=True, run_processing=False)
X_train_f, X_val_f, y_train_f, y_val_f = split_train(
    data_train_full["tweet"],
    data_train_full["sentiment"],
)

In [149]:
# 1- to 3-grams on the full dataset, TF-IDF vocabulary capped at 100,000 features.
# NOTE(review): this rebinds `trigram_result`, so the small-dataset trigram
# results assigned earlier are no longer reachable under that name.
trigram_result = classifier_comparator(
    X_train_f, y_train_f, X_val_f, y_val_f,
    n_features=100000,
    ngram_range=(1, 3),
)

Validation result for Logistic Regression
LogisticRegression(max_iter=10000)
accuracy score: 81.76%
train and test time: 260.63s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 81.50%
train and test time: 150.15s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 81.50%
train and test time: 258.60s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 78.46%
train and test time: 134.19s
--------------------------------------------------------------------------------
Valida