In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter  import PorterStemmer
from nltk.stem import WordNetLemmatizer

from wordcloud import WordCloud

import pandas as pd
import random, time
from babel.dates import format_date, format_datetime, format_time
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tweetsdataset/train_neg.txt
/kaggle/input/tweetsdataset/train_pos.txt
/kaggle/input/tweetsdataset/test_data.txt


In [8]:
path_dataset_neg = "/kaggle/input/tweetsdataset/train_neg.txt"
path_dataset_pos = "/kaggle/input/tweetsdataset/train_pos.txt"

In [9]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas

# Preprocessing functions

In [10]:
def import_data(path_dataset_neg, path_dataset_pos):
    """
    This function imports the data set, adds labels and returns a Pandas Dataframe, without duplicates. 
    Input : path of negative data set, path of postive dataset 
    Output: Pandas data frame with two columns : text and label
    """

    #Kaggle version
    train_neg = [tweet[:-1] for tweet in open(path_dataset_neg).readlines()]
    train_pos = [tweet[:-1] for tweet in open(path_dataset_pos).readlines()]
        
    X, y = train_neg + train_pos, [-1 for i in range(len(train_neg))]+[1 for i in range(len(train_pos))]
    df = pd.DataFrame(list(zip(y, X)), columns = ['label','text'], dtype = str)
    df.drop_duplicates(inplace = True)# Delete duplicate Tweets
    df['label'] = df['label'].astype(int)
    
    return df

In [11]:
def cleaning_data(df):
    """
    This function removes special characters, numbers, url links, single characters  
    Input : Pandas data frame with two columns : text and label 
    Output: Pandas data frame with two columns : text and label
    """
    
    # remove special characters from text column
    df.text = df.text.str.replace('[#,@,&]', '')
    
    #Replace special characters
    df.text = df.text.str.replace('(','')
    df.text = df.text.str.replace(')','')
    df.text = df.text.str.replace('=','')
    df.text = df.text.str.replace('!','')
    df.text = df.text.str.replace('?','')
    df.text = df.text.str.replace('"','')
    df.text = df.text.str.replace('_','')
    df.text = df.text.str.replace('-','')
    df.text = df.text.str.replace(',','')
    df.text = df.text.str.replace('.','')
    df.text = df.text.str.replace(';','')
    df.text = df.text.str.replace('+','')
    df.text = df.text.str.replace('<user>','')
    df.text = df.text.str.replace('<rt>','')
    df.text = df.text.str.replace(':','')
    df.text = df.text.str.replace('/','')
    df.text = df.text.str.replace('<','')
    df.text = df.text.str.replace('>','')
    df.text = df.text.str.replace('\'s','')
    
    # Remove digits
    df.text = df.text.str.replace('\d*','')
    
    #Remove www
    df.text = df.text.str.replace('w{3}','')
    # remove urls
    df.text = df.text.str.replace("http\S+", "")
    # remove multiple spaces with single space
    df.text = df.text.str.replace('\s+', ' ')
    #remove all single characters (except "i")
    df.text = df.text.str.replace(r'\s+[a-hA-H]\s+', '')
    df.text = df.text.str.replace(r'\s+[j-zJ-Z]\s+', '')
    df.text = df.text.str.replace(r'\s+[i-iI-I]\s+',' ')
    return df

In [12]:
def remove_stopwords(df):
    
    """
    This function stopwords, defined in the list in the function.
    We delete Twitter specific words, english stopwords, but we keep negative forms of verbs and negative adverbs
    Input : Pandas data frame with two columns : text and label 
    Output: Pandas data frame with two columns : text and label
    """
    
    stop_words = ['i', 'me', 'my', 'myself', 'we','url' 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain']
    stop_words.extend(['u', 'wa', 'ha','ho', 'would', 'com', 'user','<user>', '<rt>' 'url', 'rt', 'custom picture', 'i\'m', 'picture frame','<url>', 'positer frame', 'x','i\'ll'])
    stop_words.remove('not')
    stop_words.remove('no')
    stop_words.remove('nor')
    df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
    return df

In [13]:
def Porter_stemmer(df):
    """
    This function applies Porter Stemmer methodology to reduces words to their stem
    Input : Pandas data frame with two columns : text and label 
    Output: Pandas data frame with two columns : text and label
    """   
    stemmer = PorterStemmer()
    df['text'] = df['text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
    return df

def snow_ball_stemmer(df):
    """
    This function applies Snowball Stemmer methodology to reduces words to their stem
    Input : Pandas data frame with two columns : text and label 
    Output: Pandas data frame with two columns : text and label
    """   
    snow_stemmer = SnowballStemmer(language='english')
    df['text'] = df['text'].apply(lambda x: ' '.join([snow_stemmer.stem(word) for word in x.split()]))
    return df

def lemmatize_text(df):
    """
    This function applies World Net Lemmatizing methodology to reduces words to their stem
    Input : Pandas data frame with two columns : text and label 
    Output: Pandas data frame with two columns : text and label
    """   
    lemmatizer = nltk.stem.WordNetLemmatizer()
    df['text'] = df['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
    return df

# Vectorizer

In [14]:
def Basic_Vectorizer(df):
    """
    This function transforms text into a matrix mapping X using all words in text as vocabulary list 
    It also transform the labels to a numpy vector y
    Input : Pandas data frame with two columns : text and label 
    Output:  X vector of features, y vector of labels
    """   
    text = df['text']
    y = df['label'].to_numpy()
    
    basic_vectorizer = CountVectorizer(binary=True)
    basic_vectorizer.fit(text)
    X = basic_vectorizer.transform(text)
    
    return X, y



def N_Gram_Vectorizer(df, N):
    """
    This function transforms text into a matrix mapping X using all words in text as vocabulary list.
    It maps N-grams (series of N consecutive words)
    It also transform the labels to a numpy vector y
    Input : Pandas data frame with two columns : text and label, N the parameter for N-grams 
    Output:  X vector of features, y vector of labels
    """   
    text = df['text']
    y = df['label'].to_numpy()
    
    #adding two or three word sequences (bigrams or trigrams)
    ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, N))
    ngram_vectorizer.fit(text)
    X = ngram_vectorizer.transform(text)
    
    return X, y

# SVD

In [15]:
def SVD_preprocessing(X, y, N):
    
    """
    This function applies SVD transformation to the features matrix X, keeping the N most significant drivers
    Input : Matrix of features X, vector of labels y, parameter N for number of drivers to keep
    Output:  X vector of features after SVD, y vector of labels
    """  
    clf = TruncatedSVD(100)
    X_SVD = clf.fit_transform(X)
    
    return X_SVD, y

# Preprocessing methods for testing

In [16]:
def evaluate_method(X,y, message):
    """
    This function first splits the data X, y into training and testing sets
    It then trains and test with a Logistic regression model and display the score
    Input : Matrix of features X, vector of labels y, message to display
    Output:  Nonen, but prints accuracy of model
    """   
    random.seed(42)
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = 0.75)
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    print ("Accuracy "+message+" : %s" % (accuracy_score(y_val, lr.predict(X_val))))
    return None

In [17]:
def test_clean_data(path_dataset_neg, path_dataset_pos):
    
    """
    This function compares the accuracy of a logistic model, with and without a given processing method : cleaning data
    Input : path of negative data set, path of postive dataset
    Output:  Nonen, but prints accuracy of different processing method
    """   
    
    #Prediction accuracy with clean data
    df1 = import_data(path_dataset_neg, path_dataset_pos)
    df1 = cleaning_data(df1)
    X1, y1 = Basic_Vectorizer(df1)
    evaluate_method(X1, y1, "with clean data")
    
    #Prediction accuracy without clean data
    
    df2 = import_data(path_dataset_neg, path_dataset_pos)
    X2, y2 = Basic_Vectorizer(df2)
    evaluate_method(X2, y2, "without clean data")
    
    #Solution of test
    
    #Accuracy with clean data : 0.7924378460656063
    #Accuracy without clean data : 0.7940041031523681
    #As the Accuracy is equal, we keep this process
    
    return None

test_clean_data(path_dataset_neg, path_dataset_pos)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy with clean data : 0.7884008735743752
Accuracy without clean data : 0.7922172464759215


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [18]:
def test_stopwords(path_dataset_neg, path_dataset_pos):
    """
    This function compares the accuracy of a logistic model, with and without a given processing method : stopwords
    We keep the methods : cleaning data,  that are relevant according to the precedant test
    Input : path of negative data set, path of postive dataset
    Output:  None, but prints accuracy of different processing method
    """   
    
    #Prediction accuracy with  deleting stopwords
    df1 = import_data(path_dataset_neg, path_dataset_pos)
    df1 = cleaning_data(df1)
    df1 = remove_stopwords(df1)
    X1, y1 = Basic_Vectorizer(df1)
    evaluate_method(X1, y1, "with deleting stopwords")
    
    #Prediction accuracy without deleting stopwords
    
    df2 = import_data(path_dataset_neg, path_dataset_pos)
    df2 = cleaning_data(df2)
    X2, y2 = Basic_Vectorizer(df2)
    evaluate_method(X2, y2, "without deleting stopwords")
    
    #Solution of test
    
    #Accuracy with deleting stopwords : 0.7828858838322561
    #Accuracy without deleting stopwords : 0.790011250579074
    #As the Accuracy is equal, we should not keep this process
    
    
    return None

test_stopwords(path_dataset_neg, path_dataset_pos)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy with deleting stopwords : 0.7857757384571264
Accuracy without deleting stopwords : 0.7922172464759215


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [19]:
def test_normalization(path_dataset_neg, path_dataset_pos):
    """
    This function compares the accuracy of a logistic model, with and without a given processing method : normalization methods : Porter Stemming, Snowball stemming, Lemmatizing
    We keep the methods : cleaning data, remove stopwords, that are relevant according to the precedant test
    Input : path of negative data set, path of postive dataset
    Output:  None, but prints accuracy of different processing method
    """  
    
    #Prediction accuracy without  deleting normalization
    df1 = import_data(path_dataset_neg, path_dataset_pos)
    df1 = cleaning_data(df1)
    df1 = remove_stopwords(df1)#voir si on garde
    X1, y1 = Basic_Vectorizer(df1)
    evaluate_method(X1, y1, "without normalization")
    
    #Prediction accuracy with Porter
    df2 = import_data(path_dataset_neg, path_dataset_pos)
    df2 = cleaning_data(df2)
    df2 = remove_stopwords(df2) #voir si on garde
    df2 = Porter_stemmer(df2)
    X2, y2 = Basic_Vectorizer(df2)
    evaluate_method(X2, y2, "with Porter Stemmer")
    
    
    #Prediction accuracy with SnowBall Stemmer
    df3 = import_data(path_dataset_neg, path_dataset_pos)
    df3 = cleaning_data(df3)
    df3 = remove_stopwords(df3) #voir si on garde
    df3 = snow_ball_stemmer(df3)
    X3, y3 = Basic_Vectorizer(df3)
    evaluate_method(X3, y3, "with Snowball Stemmer")   
    
    
    #Prediction accuracy with Lemmatizer
    df4 = import_data(path_dataset_neg, path_dataset_pos)
    df4 = cleaning_data(df4)
    df4 = remove_stopwords(df4) #voir si on garde
    #df4 = lemmatize_text(df4)
    
    X4, y4 = Basic_Vectorizer(df4)
    evaluate_method(X4, y4, "with lemmatization")   
    
    
    #Solution of test
    
    #Accuracy without normalization : 0.782091725309391
    #Accuracy with Porter Stemmer : 0.7836359224371843
    #Accuracy with Snowball Stemmer : 0.7831285433809093
    #Accuracy with lemmatization : XXX
    #As the Accuracy is nearly equal, we keep method Porter Stemmer, that is the best for reproductibility of results and is short to implement
    
    return None

test_normalization(path_dataset_neg, path_dataset_pos)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy without normalization : 0.7856654386622841


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy with Porter Stemmer : 0.7826432242836028


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy with Snowball Stemmer : 0.781076967196841
Accuracy with lemmatization : 0.7837462222320266


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [20]:
def test_vectorization(path_dataset_neg, path_dataset_pos):
    
    """
    This function compares the accuracy of a logistic model, with and without a given vectorization method : 2-Grams, 3-Grams.
    We keep the methods : cleaning data, remove stopwords and Porter Stemmer, that are relevant according to the precedant test
    Input : path of negative data set, path of postive dataset
    Output:  None, but prints accuracy of different processing method
    """  
    
    #Prediction accuracy with basic vectorization
    df1 = import_data(path_dataset_neg, path_dataset_pos)
    df1 = cleaning_data(df1)
    df1 = remove_stopwords(df1)#voir si on garde
    df1 = Porter_stemmer(df1)
    X1, y1 = Basic_Vectorizer(df1)
    evaluate_method(X1, y1, "with basic vectorization")
    
    
    #Prediction accuracy with 2-Grams vectorization
    df1 = import_data(path_dataset_neg, path_dataset_pos)
    df1 = cleaning_data(df1)
    df1 = remove_stopwords(df1)#voir si on garde
    df1 = Porter_stemmer(df1)
    X1, y1 = N_Gram_Vectorizer(df1,2)
    evaluate_method(X1, y1, "with 2-grams vectorization")
    
    
    
    #Prediction accuracy with 3-Grams vectorization
    df1 = import_data(path_dataset_neg, path_dataset_pos)
    df1 = cleaning_data(df1)
    df1 = remove_stopwords(df1)#voir si on garde
    df1 = Porter_stemmer(df1)
    X1, y1 = N_Gram_Vectorizer(df1,3)
    evaluate_method(X1, y1, "with 3-grams vectorization")
    
    
    #Solution of test
    
    #Accuracy with basic vectorization : 0.7799739692484172
    #Accuracy with 2-grams vectorization : 0.7945776620855485
    #Accuracy with 3-grams vectorization : 0.7926143257373541
    

    return None

test_vectorization(path_dataset_neg, path_dataset_pos)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy with basic vectorization : 0.7809887273609671


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy with 2-grams vectorization : 0.7919304670093313
Accuracy with 3-grams vectorization : 0.792283426352827


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [22]:
def test_SVD(path_dataset_neg, path_dataset_pos, N):
    """
    This function compares the accuracy of a logistic model, with and without a given processing method : SVD 
    We keep the methods : cleaning data, remove stopwords and Porter Stemmer, 2-Grams Vectorizer that are relevant according to the precedant test
    Input : path of negative data set, path of postive dataset, Dimension of residual training set 
    Output:  Nonen, but prints accuracy of different processing method
    """  
    
    #Prediction accuracy with basic vectorization
    df1 = import_data(path_dataset_neg, path_dataset_pos)
    df1 = cleaning_data(df1)
    df1 = remove_stopwords(df1)#voir si on garde
    df1 = Porter_stemmer(df1)
    X1, y1 = N_Gram_Vectorizer(df1,2)
    X1, y1 = SVD_preprocessing(X1, y1, N)
    message = "with PCA " +str(N)
    evaluate_method(X1, y1, message)
    return None

#Accuracy with PCA 10 000 : 0.7068010853499812

test_SVD(path_dataset_neg, path_dataset_pos, 10000)

Accuracy with PCA 10000 : 0.7016611149103262
