In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pre_processing import get_pre_process_data
from pre_processing import get_pre_process_data_test
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import RidgeClassifier, LinearRegression
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score 
from time import time
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import NearestCentroid
from sklearn.feature_selection import SelectFromModel

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\julie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\julie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def to_dataset(full=False, run_processing=False, random_state=None):
    """Build the labelled training frame and the unlabelled test frame.

    Parameters
    ----------
    full : bool, default False
        When reading the cached files, select the ``*_full.txt``
        positive/negative files instead of the ``*1.txt`` ones. It is also
        forwarded as the first argument of ``desired_preprocessing`` when
        ``run_processing`` is true.
    run_processing : bool, default False
        If true, obtain the three tweet lists from ``desired_preprocessing``
        instead of reading the cached files under ``../Resources``.
    random_state : optional, default None
        Forwarded to ``DataFrame.sample`` for the shuffle. ``None`` keeps the
        previous unseeded behaviour; pass an int for a reproducible order.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``data_train`` has the columns ``tweet`` and ``sentiment`` (1.0 for
        tweets from the positive source, 0.0 for the negative one), shuffled
        and re-indexed from 0. ``data_test`` has a single ``tweet`` column in
        source order.
    """
    if run_processing:
        # NOTE(review): `desired_preprocessing` is neither defined nor imported
        # in the visible part of this notebook, so this branch raises
        # NameError unless it is provided elsewhere -- confirm.
        tweet_pos, tweet_neg, tweet_test = desired_preprocessing(full,'preprocessing_pos.txt','preprocessing_neg.txt','preprocessing_test.txt')
    else:
        if full:
            path_pos = '../Resources/preprocessing_pos_full.txt'
            path_neg = '../Resources/preprocessing_neg_full.txt'
        else:
            path_pos = '../Resources/preprocessing_pos1.txt'
            path_neg = '../Resources/preprocessing_neg1.txt'

        # The test file is the same whichever training size is selected.
        path_test = '../Resources/preprocessing_test1.txt'

        # Read each file inside a context manager so the handle is closed
        # deterministically instead of being left to the garbage collector.
        loaded = []
        for path in (path_pos, path_neg, path_test):
            with open(path) as handle:
                loaded.append([line.rstrip('\n') for line in handle])
        tweet_pos, tweet_neg, tweet_test = loaded

    data_test = pd.DataFrame({"tweet": tweet_test})
    data_pos = pd.DataFrame({"tweet": tweet_pos, "sentiment": np.ones(len(tweet_pos))})
    data_neg = pd.DataFrame({"tweet": tweet_neg, "sentiment": np.zeros(len(tweet_neg))})

    # ignore_index=True yields the same 0..n-1 index as the former
    # reset_index().drop(columns=['index']) round trip.
    data_train = pd.concat([data_pos, data_neg], axis=0, ignore_index=True)

    # Shuffle so the positive and negative rows are interleaved.
    data_train = data_train.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return data_train, data_test

In [3]:
def split_train(X,y):
    """Hold out 20% of ``(X, y)`` for validation, using a fixed seed of 42.

    Returns the tuple ``(X_train, X_val, y_train, y_val)``.
    """
    # train_test_split returns a list; callers receive a tuple.
    return tuple(train_test_split(X, y, test_size=0.2, random_state=42))

## Compare classifiers on a held-out validation split

In [9]:
def list_ML_algo() : 
    """Return the candidate classifiers paired with their display names.

    The result is a ``zip`` iterator of ``(name, estimator)`` pairs. It can be
    consumed only once; call the function again to get fresh, unfitted
    estimators.
    """
    # An L1-penalised LinearSVC selects features for a second, L2-penalised one.
    svc_with_l1_selection = Pipeline([
        ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
        ('classification', LinearSVC(penalty="l2")),
    ])

    catalogue = (
        ("Logistic Regression", LogisticRegression(max_iter=10000)),
        ("Linear SVC", LinearSVC()),
        ("LinearSVC with L1-based feature selection", svc_with_l1_selection),
        ("Multinomial NB", MultinomialNB()),
        ("Bernoulli NB", BernoulliNB()),
        ("Ridge Classifier", RidgeClassifier()),
        ("AdaBoost", AdaBoostClassifier()),
        ("Perceptron", Perceptron()),
        ("Passive-Aggresive", PassiveAggressiveClassifier()),
        ("Nearest Centroid", NearestCentroid()),
    )

    labels = [label for label, _ in catalogue]
    estimators = [estimator for _, estimator in catalogue]
    return zip(labels, estimators)

In [10]:
def acc_summary(pipeline, x_train, y_train, x_test, y_test):
    """Fit ``pipeline``, score it on the test split and print a short report.

    The measured time covers both fitting on ``(x_train, y_train)`` and
    predicting ``x_test``. Returns ``(accuracy, elapsed_seconds)``.
    """
    started = time()
    fitted = pipeline.fit(x_train, y_train)
    predictions = fitted.predict(x_test)
    elapsed = time() - started

    score = accuracy_score(y_test, predictions)

    report = (
        "accuracy score: {0:.2f}%".format(score * 100),
        "train and test time: {0:.2f}s".format(elapsed),
        "-" * 80,
    )
    for line in report:
        print(line)
    return score, elapsed

In [11]:
def classifier_comparator(X_train, y_train, X_val, y_val, n_features=100000, stop_words=None, ngram_range=(1, 1)):
    """Evaluate every classifier from ``list_ML_algo`` behind a TF-IDF vectorizer.

    One ``TfidfVectorizer`` configured with ``stop_words``, ``n_features`` (as
    ``max_features``) and ``ngram_range`` is shared by all pipelines. Each
    pipeline is handed to ``acc_summary``, which fits it on the training split
    and scores it on the validation split.

    Returns a list of ``(name, accuracy, train_and_test_time)`` tuples in the
    order the classifiers are listed.
    """
    tfidf = TfidfVectorizer(stop_words=stop_words, max_features=n_features, ngram_range=ngram_range)

    summary = []
    for label, estimator in list_ML_algo():
        candidate = Pipeline([('vectorizer', tfidf), ('classifier', estimator)])
        print("Validation result for {}".format(label))
        print(estimator)
        accuracy, elapsed = acc_summary(candidate, X_train, y_train, X_val, y_val)
        summary.append((label, accuracy, elapsed))
    return summary

## On small dataset

In [12]:
# Load the small cached corpus, then hold out a validation split.
data_train, data_test = to_dataset(full=False, run_processing=False)
X_train, X_val, y_train, y_val = split_train(
    data_train["tweet"],
    data_train["sentiment"],
)

In [13]:
# Unigram features only.
unigram_result = classifier_comparator(
    X_train, y_train, X_val, y_val,
    n_features=100000,
    ngram_range=(1, 1),
)

Validation result for Logistic Regression
LogisticRegression(max_iter=10000)
accuracy score: 78.23%
train and test time: 6.06s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 77.27%
train and test time: 3.16s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 77.24%
train and test time: 5.38s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 74.76%
train and test time: 1.74s
--------------------------------------------------------------------------------
Validation res

In [14]:
# Unigrams and bigrams.
bigram_result = classifier_comparator(
    X_train, y_train, X_val, y_val,
    n_features=100000,
    ngram_range=(1, 2),
)

Validation result for Logistic Regression
LogisticRegression(max_iter=10000)
accuracy score: 79.43%
train and test time: 10.41s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 78.42%
train and test time: 6.22s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 78.50%
train and test time: 9.48s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 77.41%
train and test time: 4.69s
--------------------------------------------------------------------------------
Validation re

In [15]:
# n-grams of length 1 to 3.
trigram_result = classifier_comparator(
    X_train, y_train, X_val, y_val,
    n_features=100000,
    ngram_range=(1, 3),
)

Validation result for Logistic Regression
LogisticRegression(max_iter=10000)
accuracy score: 79.38%
train and test time: 12.99s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 78.37%
train and test time: 9.55s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 78.53%
train and test time: 13.69s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 77.59%
train and test time: 8.25s
--------------------------------------------------------------------------------
Validation r

In [16]:
# n-grams of length 1 to 4.
fourgram_result = classifier_comparator(
    X_train, y_train, X_val, y_val,
    n_features=100000,
    ngram_range=(1, 4),
)

Validation result for Logistic Regression
LogisticRegression(max_iter=10000)
accuracy score: 79.39%
train and test time: 14.30s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 78.49%
train and test time: 13.62s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 78.48%
train and test time: 16.71s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 77.55%
train and test time: 10.83s
--------------------------------------------------------------------------------
Validation

In [18]:
# n-grams of length 1 to 5.
fivegram_result = classifier_comparator(
    X_train, y_train, X_val, y_val,
    n_features=100000,
    ngram_range=(1, 5),
)

Validation result for Logistic Regression
LogisticRegression(max_iter=10000)
accuracy score: 79.37%
train and test time: 23.92s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 78.34%
train and test time: 15.23s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 78.53%
train and test time: 18.39s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 77.59%
train and test time: 13.95s
--------------------------------------------------------------------------------
Validation

## Run the best n-gram setting on the full dataset

In [19]:
# Load the full cached corpus; note that this rebinds `data_test`.
data_train_full, data_test = to_dataset(full=True, run_processing=False)
X_train_f, X_val_f, y_train_f, y_val_f = split_train(
    data_train_full["tweet"],
    data_train_full["sentiment"],
)

In [20]:
# 1- to 3-grams on the full corpus; this rebinds `trigram_result`.
trigram_result = classifier_comparator(
    X_train_f, y_train_f, X_val_f, y_val_f,
    n_features=100000,
    ngram_range=(1, 3),
)

Validation result for Logistic Regression
LogisticRegression(max_iter=10000)
accuracy score: 81.78%
train and test time: 262.88s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 81.55%
train and test time: 190.73s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 81.55%
train and test time: 290.65s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 78.55%
train and test time: 145.34s
--------------------------------------------------------------------------------
Valida

## Try another preprocessing

In [21]:
def to_dataset2(full=False, run_processing=False, random_state=None):
    """Build the train/test frames from the ``*_fp*`` pre-processed files.

    Parameters
    ----------
    full : bool, default False
        When reading the cached files, select the ``*_fp_full.txt``
        positive/negative files instead of the ``*_fp.txt`` ones. It is also
        forwarded as the first argument of ``desired_preprocessing`` when
        ``run_processing`` is true.
    run_processing : bool, default False
        If true, obtain the three tweet lists from ``desired_preprocessing``
        instead of reading the cached files under ``../Resources``.
    random_state : optional, default None
        Forwarded to ``DataFrame.sample`` for the shuffle. ``None`` keeps the
        previous unseeded behaviour; pass an int for a reproducible order.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``data_train`` has the columns ``tweet`` and ``sentiment`` (1.0 for
        tweets from the positive source, 0.0 for the negative one), shuffled
        and re-indexed from 0. ``data_test`` has a single ``tweet`` column in
        source order.
    """
    if run_processing:
        # NOTE(review): `desired_preprocessing` is neither defined nor imported
        # in the visible part of this notebook, and the file names passed here
        # are not the `_fp` ones used by the cached branch -- confirm both.
        tweet_pos, tweet_neg, tweet_test = desired_preprocessing(full,'preprocessing_pos.txt','preprocessing_neg.txt','preprocessing_test.txt')
    else:
        if full:
            path_pos = '../Resources/preprocessing_pos_fp_full.txt'
            path_neg = '../Resources/preprocessing_neg_fp_full.txt'
        else:
            path_pos = '../Resources/preprocessing_pos_fp.txt'
            path_neg = '../Resources/preprocessing_neg_fp.txt'

        # The test file is the same whichever training size is selected.
        path_test = '../Resources/preprocessing_test_fp.txt'

        # Read each file inside a context manager so the handle is closed
        # deterministically instead of being left to the garbage collector.
        loaded = []
        for path in (path_pos, path_neg, path_test):
            with open(path) as handle:
                loaded.append([line.rstrip('\n') for line in handle])
        tweet_pos, tweet_neg, tweet_test = loaded

    data_test = pd.DataFrame({"tweet": tweet_test})
    data_pos = pd.DataFrame({"tweet": tweet_pos, "sentiment": np.ones(len(tweet_pos))})
    data_neg = pd.DataFrame({"tweet": tweet_neg, "sentiment": np.zeros(len(tweet_neg))})

    # ignore_index=True yields the same 0..n-1 index as the former
    # reset_index().drop(columns=['index']) round trip.
    data_train = pd.concat([data_pos, data_neg], axis=0, ignore_index=True)

    # Shuffle so the positive and negative rows are interleaved.
    data_train = data_train.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return data_train, data_test

In [22]:
# Small corpus from the second pre-processing variant, plus a validation split.
data_train_fp, data_test_fp = to_dataset2(full=False, run_processing=False)
X_train_fp, X_val_fp, y_train_fp, y_val_fp = split_train(
    data_train_fp["tweet"],
    data_train_fp["sentiment"],
)

In [23]:
# 1- to 3-grams on the second pre-processing variant; this rebinds `trigram_result`.
trigram_result = classifier_comparator(
    X_train_fp, y_train_fp, X_val_fp, y_val_fp,
    n_features=100000,
    ngram_range=(1, 3),
)

Validation result for Logistic Regression
LogisticRegression(max_iter=10000)
accuracy score: 81.85%
train and test time: 24.44s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 81.08%
train and test time: 14.94s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 81.18%
train and test time: 23.73s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 79.40%
train and test time: 13.69s
--------------------------------------------------------------------------------
Validation

In [16]:
# 1- to 4-grams on the second pre-processing variant; this rebinds `fourgram_result`.
fourgram_result = classifier_comparator(
    X_train_fp, y_train_fp, X_val_fp, y_val_fp,
    n_features=100000,
    ngram_range=(1, 4),
)

Validation result for Logistic Regression
LogisticRegression(max_iter=10000)
accuracy score: 81.93%
train and test time: 31.49s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 80.96%
train and test time: 23.32s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 81.13%
train and test time: 35.35s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 79.04%
train and test time: 25.30s
--------------------------------------------------------------------------------
Validation

In [17]:
# 1- to 5-grams on the second pre-processing variant; this rebinds `fivegram_result`.
fivegram_result = classifier_comparator(
    X_train_fp, y_train_fp, X_val_fp, y_val_fp,
    n_features=100000,
    ngram_range=(1, 5),
)

Validation result for Logistic Regression
LogisticRegression(max_iter=10000)
accuracy score: 81.94%
train and test time: 36.11s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 80.92%
train and test time: 27.97s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 81.15%
train and test time: 40.61s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 79.02%
train and test time: 30.51s
--------------------------------------------------------------------------------
Validation

## Run the best n-gram setting on the full dataset

In [24]:
# Full corpus from the second pre-processing variant, plus a validation split.
data_train_fp_full, data_test_fp_full = to_dataset2(full=True, run_processing=False)
X_train_fp_full, X_val_fp_full, y_train_fp_full, y_val_fp_full = split_train(
    data_train_fp_full["tweet"],
    data_train_fp_full["sentiment"],
)

In [25]:
# 1- to 3-grams on the full second-variant corpus; this rebinds `trigram_result`.
trigram_result = classifier_comparator(
    X_train_fp_full, y_train_fp_full, X_val_fp_full, y_val_fp_full,
    n_features=100000,
    ngram_range=(1, 3),
)

Validation result for Logistic Regression
LogisticRegression(max_iter=10000)
accuracy score: 84.38%
train and test time: 426.56s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 84.23%
train and test time: 268.48s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 84.24%
train and test time: 568.57s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 80.23%
train and test time: 235.66s
--------------------------------------------------------------------------------
Valida

## prepare submission file

In [48]:
def run_logistic(X, y):
    """Fit a TF-IDF + logistic-regression pipeline on ``(X, y)``.

    The vectorizer keeps at most 100000 features over 1- to 3-grams with no
    stop-word list; the classifier allows up to 10000 iterations. Returns the
    fitted ``Pipeline``.
    """
    vectorizer = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3))
    classifier = LogisticRegression(max_iter=10000)

    steps = [('vectorizer', vectorizer), ('classifier', classifier)]
    fitted_pipeline = Pipeline(steps)
    fitted_pipeline.fit(X, y)
    return fitted_pipeline

In [49]:
def predict_label(model, X_test, name_submission):
    """Predict labels for ``X_test`` and write them to a submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X_test : iterable
        Inputs passed unchanged to ``model.predict``.
    name_submission : str or path-like
        Destination of the CSV file.

    The file has the columns ``Id`` (1-based row number) and ``Prediction``.
    A prediction equal to 1 is written as ``1``; every other value is written
    as ``-1``.
    """
    y_pred = np.asarray(model.predict(X_test))

    # Map to integer labels explicitly. Mixing the classifier's float 1.0 with
    # the int -1 made pandas infer a float column, so the file contained
    # "1.0" / "-1.0" instead of "1" / "-1".
    labels = np.where(y_pred == 1, 1, -1)

    # Building the frame in one step also keeps the `Prediction` column when
    # there are no predictions at all.
    df_to_submit = pd.DataFrame({
        'Id': range(1, 1 + len(labels)),
        'Prediction': labels,
    })
    df_to_submit.to_csv(name_submission, index=False)

### logistic model 

In [47]:
# Fit on every labelled tweet of the second pre-processing variant (no
# validation hold-out), then write the submission file.
model = run_logistic(data_train_fp_full.tweet, data_train_fp_full.sentiment)
# `predict_label` takes exactly (model, X_test, name_submission); the former
# `index=False` keyword raised TypeError. `data_test_df` was never defined in
# this notebook; `data_test_fp_full` is the test frame returned together with
# `data_train_fp_full`.
predict_label(model, data_test_fp_full.tweet, 'logistic_model.csv')