In [50]:
import pandas as pd
import plotly.express as px
import nltk
import re
import matplotlib.pyplot as plt
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split, learning_curve

In [51]:
# Load the raw CSV. header=None tells pandas the file has no header row, so the
# six columns are numbered 0..5 until they are renamed in the next cell.
df = pd.read_csv("sentiment.csv", header=None, encoding='latin')
print(df.shape)
df.head()


(1600000, 6)


Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [52]:
# Give the six positional columns descriptive names (same order as in the file).
df.columns = ['targets', 'ids', 'date', 'flag', 'user', 'text']
df.head()

Unnamed: 0,targets,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [53]:
# Replace all occurrences of 4 with 1 in the "targets" column so the labels are
# 0/1; a later cell decodes them as 0 -> "Negative" and 1 -> "Positive".
df['targets'] = df['targets'].replace(4, 1)

# Data Preprocessing

In [54]:
# Columns that are not used as feature or target further down.
columns_to_drop = ['ids', 'date', 'flag', 'user']

# Fail fast: stop with an AssertionError on the first expected column that is missing.
for col in columns_to_drop:
    assert col in df.columns, f"La colonne '{col}' n'existe pas dans le DataFrame."

# Every column is present at this point, so remove them from df in place.
df.drop(columns=columns_to_drop, inplace=True)

# Display the first rows of the DataFrame
df.head()


Unnamed: 0,targets,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [55]:
import random

# Draw 10 random row positions. random.randrange excludes its upper bound, so
# every position from 0 to len(df) - 1 can be picked and none is out of range.
# (random.randint(1, len(df)) includes len(df), which is not a valid row, and
# it can never return 0.)
random_idx_list = [random.randrange(len(df)) for _ in range(10)]

# Each draw is independent, so the same row may occasionally appear twice.
df.iloc[random_idx_list]

Unnamed: 0,targets,text
1011174,1,its just one of those days.... where things do...
1244088,1,is guna live in a tiny zoo and grow a giant af...
748910,0,@Boogaloo1 so it was circuit. Dunno meercat. S...
416463,0,SO FREAKING HOT! I dont wanna go manz. Being ...
134231,0,i can't change my background
1229312,1,"I might seem weird, but for some reason I love..."
448287,0,I miss @corkcampusradio already.
1059850,1,@bjgardner B!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ...
815326,1,@autumn_dreamer : thank you its all because o...
103622,0,"@Julieannex oh no! its my 1 day off! grrr, of ..."


# Text Preprocessing

In [56]:
# English stop words, kept in a set so the membership test in preprocess() is cheap.
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

# Spans that get replaced by a single space during cleaning, tried in this order:
#   @\S+           -> @mentions
#   https?:\S+     -> http/https URLs
#   [^A-Za-z0-9]+  -> any run of characters that are not ASCII letters or digits
# Written as a raw string so "\S" reaches the regex engine instead of being an
# invalid string escape. The former extra alternative `http?:\S` was removed:
# regular URLs are already covered by `https?:\S+`, and that alternative only
# swallowed one extra character after a stray "htt:" / "http:".
text_cleaning_re = r"@\S+|https?:\S+|[^A-Za-z0-9]+"

In [57]:
def preprocess(text, stem=False):
    """Clean one tweet and return it as a space-separated string of tokens.

    The input is converted to ``str`` and lower-cased, every span matched by
    ``text_cleaning_re`` is replaced by a space, and the result is split on
    whitespace. Tokens found in ``stop_words`` are dropped.

    Parameters
    ----------
    text : object
        The raw text; it is passed through ``str()`` first.
    stem : bool, default False
        When true, each remaining token is reduced with ``stemmer.stem``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    cleaned = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)

Lemmatization and stemming are two text normalization techniques used in natural language processing (NLP) to reduce words to their base or truncated form.

The choice between lemmatization and stemming depends on the specific needs of your NLP application. If accuracy and semantic understanding are important, lemmatization is often preferred. If processing speed is a key factor, stemming may be more appropriate.  

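I wanted to try lemmatization first, but the code takes an extremely long time to run because of the large size of the dataset. We therefore opted for stemming instead. Note that `preprocess` only stems when it is called with `stem=True`; its default is `stem=False`, so a call without that argument removes stop words but does not stem.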



In [58]:
# Show the tweet with row label 0 before and after cleaning. preprocess is
# called with its default stem=False, so tokens are filtered but not stemmed.
print("Text before preprocessing : ", df.loc[0, 'text'])
df['text'] = df['text'].map(preprocess)
print("Text after preprocessing : ", df.loc[0, 'text'])

Text before preprocessing :  @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
Text after preprocessing :  awww bummer shoulda got david carr third day


In [59]:
# Numeric label -> human-readable sentiment name.
lab_to_sentiment = {0: "Negative", 1: "Positive"}


def label_decoder(label):
    """Return the sentiment name for a numeric label.

    Raises ``KeyError`` when ``label`` is not a key of ``lab_to_sentiment``.
    """
    return lab_to_sentiment[label]


# Decode every target; the column holds strings afterwards.
df['targets'] = df['targets'].apply(label_decoder)
df.head()

Unnamed: 0,targets,text
0,Negative,awww bummer shoulda got david carr third day
1,Negative,upset update facebook texting might cry result...
2,Negative,dived many times ball managed save 50 rest go ...
3,Negative,whole body feels itchy like fire
4,Negative,behaving mad see


### Train Test Split

In [60]:
# Features are the cleaned tweets, the target is the decoded sentiment label.
X, y = df['text'], df['targets']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train Data shape:", X_train.shape)
print("Test Data shape", X_test.shape)

Train Data shape: (1280000,)
Test Data shape (320000,)


In [61]:
# Peek at the first training tweets; the split keeps the original row labels.
X_train.head()

1374558    ya quot like palm pre touchstone charger ready...
1389115            felt earthquake afternoon seems epicenter
1137831                            ruffles shirts like likey
790714     pretty bad night crappy morning fml buttface d...
1117911                                      yeah clear view
Name: text, dtype: object

# Model Definition

In [62]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Baseline pipeline: bag-of-words counts fed into a multinomial Naive Bayes.
# NOTE(review): df['text'] was already run through preprocess() in an earlier
# cell, so passing it again as the vectorizer's preprocessor repeats the same
# cleaning on every fit/predict - confirm whether that second pass is wanted.
text_clf = Pipeline(steps=[
    ('count_vectorizer', CountVectorizer(preprocessor=preprocess)),
    ('classifier', MultinomialNB()),
])

## Training

In [63]:
# Fit both pipeline steps (vectorizer, then classifier) on the training data
text_clf.fit(X_train, y_train)

We use CountVectorizer and the MultinomialNB model for our first pipeline.

## Prediction

In [64]:
# Use the fitted pipeline to predict a label for every tweet in the test data
y_pred = text_clf.predict(X_test)
y_pred

array(['Positive', 'Positive', 'Positive', ..., 'Positive', 'Negative',
       'Negative'], dtype='<U8')

## Results visualization

In [65]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
class_labels = ["Negative","Positive"]

# Default styling of the confusion-matrix heatmap. The x / y entries are only
# defaults: report() overrides them with the labels it receives.
confusion_matrix_kwargs = dict(
    text_auto=True,
    title="Confusion Matrix", width=1000, height=800,
    labels=dict(x="Predicted", y="True Label"),
    x=class_labels,
    y=class_labels,
    color_continuous_scale='Blues'
)


def report(y_true, y_pred, class_labels):
    """Print a classification report and show the confusion matrix as a heatmap.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.
    class_labels : list of str
        Display names of the classes, in the sorted order of the label values.
        They are used both as ``target_names`` of the text report and as the
        axis tick labels of the heatmap.

    Returns
    -------
    None
        The report is printed and the figure is displayed.
    """
    print(classification_report(y_true, y_pred, target_names=class_labels))

    # sklearn builds a square matrix over every label seen in y_true or y_pred,
    # in sorted order (rows = true, columns = predicted). That is the ordering
    # classification_report uses too, and the shape stays square even when a
    # class is never predicted, so it always lines up with class_labels.
    counts = confusion_matrix(y_true, y_pred)

    # Label the axes with the names passed to this call rather than with
    # whatever the module-level defaults captured.
    heatmap_kwargs = {**confusion_matrix_kwargs, 'x': class_labels, 'y': class_labels}
    fig = px.imshow(counts, **heatmap_kwargs)
    fig.show()
    

In [66]:
# Print the classification report and show the confusion-matrix heatmap for the
# baseline predictions, using the class_labels defined above
report(y_test,y_pred, class_labels)

              precision    recall  f1-score   support

    Negative       0.76      0.79      0.77    159494
    Positive       0.78      0.75      0.76    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



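We will examine the F1 score for each class in the classification report to evaluate the model's performance, as it serves as a relevant indicator. The F1-score represents the harmonic mean of precision and recall and ranges from 0 to 1. In this case, the F1-scores are about 0.77 for both classes, which indicates that our model performs reasonably well on this nearly balanced test set (159,494 negative vs. 160,506 positive tweets), while still leaving room for improvement.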

Our first pipeline takes raw text data as input, transforms it into numeric vectors using CountVectorizer, and then classifies it using the MultinomialNB classification model.  
We have an f1-score of 0.77 for the negative class and an f1-score of 0.76 for the positive class. That is pretty good for a first model; we are going to try to make some improvements.

We have 125.453k good predictions for the Negative class and 120.215k good predictions for the Positive class.

# Improve on the baseline

## Let's try with TF-IDF

But first, let's define a [class](https://www.w3schools.com/python/python_classes.asp) that will make iterating through different architectures and vectorizers a lot faster

In [67]:
class Model:
    def __init__(self, X, y, model_architecture, vectorizer, random_seed=42, test_size=0.2) -> None:
        self.X = X
        self.y = y
        self.model_instance = model_architecture
        self.vectorizer = vectorizer
        self.random_seed = random_seed
        self.test_size = test_size

        # Create the pipeline
        self.pipeline = Pipeline([
            ('vectorizer', self.vectorizer),
            ('model', self.model_instance)
        ])

        # Split the data into training and testing sets
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=self.test_size, random_state=self.random_seed)

    def preprocess(text, stem=False):
        text = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
        tokens = []
        for token in text.split():
            if token not in stop_words:
                if stem:
                    tokens.append(stemmer.stem(token))
                else:
                    tokens.append(token)
        return " ".join(tokens)
    
    def fit(self):
        # Fit the pipeline to the training data
        self.pipeline.fit(self.X_train, self.y_train)

    def predict(self):
        # Predict on the test data
        return self.pipeline.predict(self.X_test)

    def predict_proba(self):
        # Predict probabilities on the test data
        return self.pipeline.predict_proba(self.X_test)

    def report(self, y_true, y_pred, class_labels):
        # Print classification report
        print(classification_report(y_true, y_pred, target_names=class_labels))

        # Create a confusion matrix
        confusion_matrix_df = pd.crosstab(y_true, y_pred, rownames=['Actual'], colnames=['Predicted'])

        # Styling the confusion matrix
        confusion_matrix_kwargs = dict(
            text_auto=True, 
            title="Confusion Matrix", width=1000, height=800,
            labels=dict(x="Predicted", y="True Label"),
            x=class_labels,
            y=class_labels,
            color_continuous_scale='Blues'
        )

        # Display the confusion matrix
        fig = px.imshow(
            confusion_matrix_df, 
            **confusion_matrix_kwargs
        )
        fig.show()
    


In [68]:
# Multinomial Naive Bayes on TF-IDF features. Model performs its own split of
# X, y with its default test_size=0.2 and random_seed=42.
nb_model = Model(
    X, y,
    model_architecture=MultinomialNB(),
    vectorizer=TfidfVectorizer(preprocessor=preprocess),
)
nb_model.fit()

# Predictions for the model's internal test split
y_nb_pred = nb_model.predict()

# Classification report + confusion-matrix heatmap
nb_model.report(nb_model.y_test, y_nb_pred, class_labels)

              precision    recall  f1-score   support

    Negative       0.75      0.78      0.77    159494
    Positive       0.77      0.75      0.76    160506

    accuracy                           0.76    320000
   macro avg       0.76      0.76      0.76    320000
weighted avg       0.76      0.76      0.76    320000



For our second model, we train the same text classification model using the multinomial Naïve Bayes classifier (MultinomialNB), but this time with the TfidfVectorizer transformer, and check whether we obtain better results.

We have the same f1-score for the negative class and the positive class as before. We have an f1-score of 0.77 for the negative class and an f1-score of 0.76 for the positive class.   
Looking at the exact counts of correct predictions on the heatmap, we can see that, despite the same f1-scores as before, we have fewer correct predictions in both the Negative and the Positive class than when we used CountVectorizer.

We have 124.462k good predictions for the Negative class and 119.659k good predictions for the Positive class.

We can conclude that CountVectorizer is better than TfidfVectorizer transformer

## GridSearchCV

We are now going to use GridSearchCV to find the best hyperparameters!

In [69]:
from sklearn.linear_model import LogisticRegression

# Same bag-of-words features as the baseline, with logistic regression as the
# classifier (default hyperparameters).
# NOTE(review): the recorded output of this cell shows lbfgs stopping at its
# iteration limit, so the reported scores come from a solver that had not
# converged.
model = Model(
    X, y,
    model_architecture=LogisticRegression(),
    vectorizer=CountVectorizer(preprocessor=preprocess),
)

# Train on the model's internal training split
model.fit()

# Predictions for the model's internal test split
y_pred = model.predict()

# Classification report + confusion-matrix heatmap
model.report(model.y_test, y_pred, class_labels)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



              precision    recall  f1-score   support

    Negative       0.79      0.75      0.77    159494
    Positive       0.76      0.80      0.78    160506

    accuracy                           0.78    320000
   macro avg       0.78      0.78      0.78    320000
weighted avg       0.78      0.78      0.78    320000



We will therefore keep the CountVectorizer Transformer.

In this third model we tried a new classification model which is Logistic Regression while keeping the CountVectorizer Transformer


We have an f1-score of 0.77 for the negative class and an f1-score of 0.78 for the positive class. 



We have a better f1-score for the Positive class. We can conclude that Logistic Regression is better than multinomial Naïve Bayes classifier (MultinomialNB).

In [70]:
from sklearn.model_selection import GridSearchCV

# Draw a stratified 10% sample to keep the searches tractable: test_size=0.1 of
# the 1,600,000 rows, i.e. 160,000 tweets, with stratify=y keeping the class
# proportions of the full data.
# The remaining 90% goes into throwaway names (deleted right away) so that the
# X_train / y_train created by the earlier train/test split are NOT overwritten;
# overwriting them would leave an X_train that overlaps the earlier X_test.
X_rest, X_sample, y_rest, y_sample = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42)
del X_rest, y_rest

# Set the hyperparameters to search for the CountVectorizer
param_grid_cv = {
    'count_vectorizer__max_features': [1000, 5000, 10000, None],
    'count_vectorizer__ngram_range': [(1, 1), (1, 2)],
}

# Define hyperparameters to search for logistic regression
param_grid_lr = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10],
    'classifier__max_iter': [100, 200, 300],
}

# Create the pipeline with the CountVectorizer and logistic regression
text_clf = Pipeline([
    ('count_vectorizer', CountVectorizer(preprocessor=preprocess)),
    ('classifier', LogisticRegression())
])

# Two independent 5-fold searches over the same pipeline: each one varies only
# its own step and leaves the other step at its defaults.
# NOTE(review): because the two grids are searched separately, the combination
# of both "best" settings is never cross-validated together.
grid_search_cv = GridSearchCV(text_clf, param_grid_cv, cv=5, scoring='accuracy')
grid_search_lr = GridSearchCV(text_clf, param_grid_lr, cv=5, scoring='accuracy')

# Fit the models
grid_search_cv.fit(X_sample, y_sample)
grid_search_lr.fit(X_sample, y_sample)

# Get the best hyperparameters and the best score for the CountVectorizer
best_params_cv = grid_search_cv.best_params_
best_score_cv = grid_search_cv.best_score_

# Get the best hyperparameters and score for logistic regression
best_params_lr = grid_search_lr.best_params_
best_score_lr = grid_search_lr.best_score_

print("Meilleurs hyperparamètres pour CountVectorizer :")
print(best_params_cv)
print("Meilleur score pour CountVectorizer :")
print(best_score_cv)

print("Meilleurs hyperparamètres pour la régression logistique :")
print(best_params_lr)
print("Meilleur score pour la régression logistique :")
print(best_score_lr)



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to th

Meilleurs hyperparamètres pour CountVectorizer :
{'count_vectorizer__max_features': None, 'count_vectorizer__ngram_range': (1, 2)}
Meilleur score pour CountVectorizer :
0.7688312500000001
Meilleurs hyperparamètres pour la régression logistique :
{'classifier__C': 0.1, 'classifier__max_iter': 100}
Meilleur score pour la régression logistique :
0.7617375



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



We used GridSearchCV to find the best hyperparameters for our LogisticRegression() model and our CountVectorizer() transformer.  
We had to reduce our dataset and take a sample to be able to have a correct execution time.

The execution time is quite long but we have the best hyperparameters for our model as results. We will then apply these parameters to our model and see if the accuracy of the model will increase as expected.

In [71]:
from sklearn.linear_model import LogisticRegression

# Rebuild the logistic-regression model with the values selected by the two
# grid searches (classifier settings and vectorizer settings respectively).
tuned_classifier = LogisticRegression(
    C=best_params_lr['classifier__C'],
    max_iter=best_params_lr['classifier__max_iter'],
)
tuned_vectorizer = CountVectorizer(
    preprocessor=preprocess,
    max_features=best_params_cv['count_vectorizer__max_features'],
    ngram_range=best_params_cv['count_vectorizer__ngram_range'],
)
model = Model(X, y, tuned_classifier, tuned_vectorizer)

# Train on the model's internal training split
model.fit()

# Predictions for the model's internal test split
y_pred = model.predict()

# Classification report + confusion-matrix heatmap
model.report(model.y_test, y_pred, class_labels)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



              precision    recall  f1-score   support

    Negative       0.81      0.76      0.78    159494
    Positive       0.77      0.82      0.80    160506

    accuracy                           0.79    320000
   macro avg       0.79      0.79      0.79    320000
weighted avg       0.79      0.79      0.79    320000



We apply the best hyperparameters to our model.


We have an f1-score of 0.78 for the negative class and an f1-score of 0.80 for the positive class. 

We have a better f1-score for the Positive class and the Negative Class as expected. 

To conclude this part, we managed to improve our first model by trying different transformers and different classification models, and by tuning the hyperparameters with GridSearchCV.