In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pre_processing import get_pre_process_data
from pre_processing import get_pre_process_data_test
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import RidgeClassifier, LinearRegression
from sklearn.ensemble import VotingClassifier

In [64]:
def _read_tweets(path):
    """Return the lines of the text file at ``path`` without trailing newlines."""
    # The context manager closes the file handle as soon as the lines are read.
    with open(path) as handle:
        return [line.rstrip('\n') for line in handle]


def to_dataset(full=False, run_processing=False, random_state=None):
    """
    Build the shuffled training frame and the test frame of tweets.
    INPUT:
        full : bool            - When reading from disk, use the *_full.txt
                                 positive/negative files instead of the *1.txt ones
        run_processing : bool  - If True, obtain the tweets from
                                 desired_preprocessing instead of reading files
        random_state           - Optional seed forwarded to DataFrame.sample so
                                 the shuffle is reproducible (None keeps the
                                 original non-deterministic shuffle)
    OUTPUT:
        data_train : DataFrame with columns "tweet" and "sentiment"
                     (1.0 for positive tweets, 0.0 for negative), shuffled,
                     with a fresh 0..n-1 index
        data_test  : DataFrame with a single "tweet" column
    """
    if run_processing:
        # NOTE(review): desired_preprocessing is neither defined nor imported in
        # the visible part of this notebook (only get_pre_process_data and
        # get_pre_process_data_test are imported) - confirm it exists before
        # calling with run_processing=True.
        tweet_pos, tweet_neg, tweet_test = desired_preprocessing(full,'preprocessing_pos.txt','preprocessing_neg.txt','preprocessing_test.txt')
    else:
        if full:
            path_pos = '../Resources/preprocessing_pos_full.txt'
            path_neg = '../Resources/preprocessing_neg_full.txt'
        else:
            path_pos = '../Resources/preprocessing_pos1.txt'
            path_neg = '../Resources/preprocessing_neg1.txt'

        # The test file is the same regardless of `full`.
        path_test = '../Resources/preprocessing_test1.txt'

        tweet_pos = _read_tweets(path_pos)
        tweet_neg = _read_tweets(path_neg)
        tweet_test = _read_tweets(path_test)

    data_test = pd.DataFrame({"tweet": tweet_test})
    data_pos = pd.DataFrame({"tweet": tweet_pos, "sentiment": np.ones(len(tweet_pos))})
    data_neg = pd.DataFrame({"tweet": tweet_neg, "sentiment": np.zeros(len(tweet_neg))})

    # Stack positives on top of negatives with a clean index, then shuffle the
    # rows so the two classes are interleaved.
    data_train = pd.concat([data_pos, data_neg], axis=0, ignore_index=True)
    data_train = data_train.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return data_train, data_test

In [65]:
def split_train(X,y):
    """
    Split features and labels into a training part and a validation part.
    INPUT:
        X : the features to split
        y : the labels matching X
    OUTPUT:
        (X_train, X_val, y_train, y_val) with 20% of the samples held out
        for validation; the split is seeded with random_state=42
    """
    parts = train_test_split(X, y, test_size=0.2, random_state=42)
    return tuple(parts)

In [66]:
# Load the tweets with to_dataset's defaults (full=False, run_processing=False).
data_train,data_test = to_dataset()
# Split tweets and sentiment labels into training and validation parts; the
# resulting X_train, X_val, y_train and y_val are module-level names.
X_train, X_val, y_train, y_val = split_train(data_train.tweet,data_train.sentiment)

In [68]:
# Display the training tweets produced by split_train.
X_train

23821     ahh long story going hear soon right laught go...
134809                       laught alright spam spam later
141838                             not want weekend already
115299                               oh slide flamingo love
130763                          ok thankyou consecutivestop
                                ...                        
119879    light audio cd sure alien laser gun princess c...
103694    laught making collage laught big mistake leavi...
131932                        always speak booty love booty
146867              very true tamara see using stuff learnt
121958    assimilation intensive workshop cd set turn fi...
Name: tweet, Length: 137836, dtype: object

In [4]:
def train_pipeline(clf, X, y):
    """
    Fit a TF-IDF + classifier pipeline and return it.
    INPUT:
        clf : - The classifier placed after the vectorizer
        X :   - The training inputs passed to the pipeline's fit
        y :   - The training targets passed to the pipeline's fit
    OUTPUT:
        The fitted Pipeline with steps 'vectorizer' and 'classifier'
    """
    # No stop-word removal, at most 100000 features, unigrams to trigrams.
    vectorizer = TfidfVectorizer(
        stop_words=None, max_features=100000, ngram_range=(1, 3))

    steps = [('vectorizer', vectorizer), ('classifier', clf)]
    fitted = Pipeline(steps)
    fitted.fit(X, y)
    return fitted

In [5]:
def get_prediction(model, X_test):
    """Return whatever ``model.predict`` yields for ``X_test``."""
    predictions = model.predict(X_test)
    return predictions

In [21]:
def compute_accuracy(y,y_val):
    """
    Return the fraction of positions where y and y_val hold equal values.
    INPUT:
        y :     array-like - The first sequence of labels (e.g. predictions)
        y_val : array-like - The second sequence of labels (e.g. ground truth)
    OUTPUT:
        The mean of the element-wise equality, a value between 0 and 1
    RAISES:
        ValueError if the two inputs do not have the same shape
    """
    # Convert both sides to arrays so plain lists work and pandas objects are
    # compared by position rather than by index label.
    left = np.asarray(y)
    right = np.asarray(y_val)
    if left.shape != right.shape:
        raise ValueError(
            "y and y_val must have the same shape, got {} and {}".format(
                left.shape, right.shape))
    return (left == right).mean()

In [69]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
import numpy as np
from time import time


def acc_summary(pipeline, x_train, y_train, x_test, y_test):
    """
    Fit ``pipeline``, score it on the test data and print a short report.

    The reported time covers both fitting on (x_train, y_train) and
    predicting on x_test.

    Returns:
        (accuracy, train_test_time): the accuracy_score of the predictions
        against y_test, and the elapsed seconds.
    """
    started = time()
    fitted = pipeline.fit(x_train, y_train)
    predictions = fitted.predict(x_test)
    train_test_time = time() - started

    accuracy = accuracy_score(y_test, predictions)

    print("accuracy score: {0:.2f}%".format(accuracy*100))
    print("train and test time: {0:.2f}s".format(train_test_time))
    print("-"*80)
    return accuracy, train_test_time


from sklearn.feature_extraction.text import TfidfVectorizer
tvec = TfidfVectorizer()

from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import NearestCentroid
from sklearn.feature_selection import SelectFromModel

# Display names, listed in the same order as `classifiers` below.
names = ["Logistic Regression", "Linear SVC", "LinearSVC with L1-based feature selection","Multinomial NB", 
         "Bernoulli NB", "Ridge Classifier", "AdaBoost", "Perceptron","Passive-Aggresive", "Nearest Centroid"]
classifiers = [
    LogisticRegression(),
    LinearSVC(),
    # L1-penalised LinearSVC selects the features, an L2-penalised one classifies.
    Pipeline([
        ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
        ('classification', LinearSVC(penalty="l2"))]),
    MultinomialNB(),
    BernoulliNB(),
    RidgeClassifier(),
    AdaBoostClassifier(),
    Perceptron(),
    PassiveAggressiveClassifier(),
    NearestCentroid()
    ]
# zip() returns a one-shot iterator; materialise it so the (name, classifier)
# pairs can be iterated more than once.
zipped_clf = list(zip(names, classifiers))

tvec = TfidfVectorizer()
def classifier_comparator(vectorizer=tvec, n_features=100000, stop_words=None, ngram_range=(1, 1), classifier=None):
    """
    Fit and score each classifier behind the same vectorizer.

    Reads the module-level X_train, y_train, X_val and y_val for training and
    validation, and prints a report per classifier through acc_summary.

    INPUT:
        vectorizer :  - Vectorizer used as the first pipeline step; its
                        stop_words, max_features and ngram_range are
                        overwritten in place with the values below
        n_features :  - Value assigned to the vectorizer's max_features
        stop_words :  - Value assigned to the vectorizer's stop_words
        ngram_range : - Value assigned to the vectorizer's ngram_range
        classifier :  - Iterable of (name, estimator) pairs; when None the
                        pairs are rebuilt from the module-level `names` and
                        `classifiers` lists
    OUTPUT:
        List of (name, accuracy, train_and_test_time) tuples, one per pair
    """
    if classifier is None:
        # Build the pairs on every call: a zip object bound as a default
        # argument is exhausted after the first call and would make every
        # later call return an empty list.
        classifier = zip(names, classifiers)

    result = []
    vectorizer.set_params(stop_words=stop_words, max_features=n_features, ngram_range=ngram_range)
    for n,c in classifier:
        # The estimator is used as configured. The previous
        # c.set_params(max_iter=n) passed the display name as max_iter and
        # raised for estimators that have no max_iter parameter.
        checker_pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', c)
        ])
        print("Validation result for {}".format(n))
        print (c)
        clf_acc,tt_time = acc_summary(checker_pipeline, X_train, y_train, X_val, y_val)
        result.append((n,clf_acc,tt_time))
    return result

# Compare the classifiers with max_features=100000 and n-grams of length 1 to 3.
trigram_result = classifier_comparator(n_features=100000,ngram_range=(1,3))

Validation result for Logistic Regression
LogisticRegression()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy score: 79.60%
train and test time: 18.93s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC()
accuracy score: 78.46%
train and test time: 11.16s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False,
                                                     penalty='l1'))),
                ('classification', LinearSVC())])
accuracy score: 78.61%
train and test time: 14.01s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB()
accuracy score: 77.55%
train and test time: 8.96s
--------------------------------------------------------------------------------
Validation result for Bernoulli NB
BernoulliNB()
accuracy score: 75.37%
train and test t