# Sentiment Analysis

In [1]:
import sys
from os import chdir

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.ticker import FuncFormatter
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud

sys.path.insert(0, '..')

from src import TEST_DATA_PATH, TRAIN_DATA_PATH, CACHE_DIR
from src.preprocessing.DataCleaner import DataCleaner
from src.preprocessing.Stemmer import Stemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
# import libraries
import string
import sys
from gc import collect
from os import chdir

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import FuncFormatter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from wordcloud import WordCloud

sys.path.insert(0, '..')

from src import TEST_DATA_PATH, TRAIN_DATA_PATH
from src.preprocessing.DataCleaner import DataCleaner
from src.preprocessing.Stemmer import Stemmer


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/krystian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Vocabulary cap handed to CountVectorizer(max_features=...).
MAX_FEATURES: int = 200_000
# Seed passed as random_state when subsampling, so the sample is repeatable.
RANDOM_SEED: int = 42
# Fraction of each dataset that is kept by the subsampling step (0.5 %).
SAMPLE_SIZE: float = 0.005

In [3]:
# Change the working directory to the project root (the parent of the notebook folder).
# The guard keeps this cell idempotent: the project's `src` package is imported from '..'
# at the top of the notebook, so the parent only contains `src` while we are still in the
# notebook folder. Re-running the cell therefore does not climb another directory level.
from pathlib import Path

_parent_dir = Path.cwd().parent
if (_parent_dir / 'src').is_dir():
    chdir(_parent_dir)

# Load & Prepare Data

In [4]:
# Load both splits. Only CSV columns 0 and 2 are read (named 'label' and 'review');
# column 1 is skipped.
_csv_options = dict(names=['label', 'review'], usecols=[0, 2])
train_data = pd.read_csv(TRAIN_DATA_PATH, **_csv_options)
test_data = pd.read_csv(TEST_DATA_PATH, **_csv_options)

print(f'Train data shape: {train_data.shape}')
print(f'Test data shape: {test_data.shape}')

Train data shape: (3000000, 2)
Test data shape: (650000, 2)


In [5]:
def _binarize_labels(reviews):
    """Drop neutral reviews and map the remaining labels to 0 / 1.

    Rows whose 'label' equals 3 are removed. Of the rest, labels below 3 become 0
    (negative) and all others become 1 (positive).

    A new DataFrame is returned instead of assigning into a filtered slice, which
    avoids pandas' SettingWithCopyWarning, and the mapping is vectorised rather than
    applied row by row.

    NOTE: intended to run once on freshly loaded data. Applied to labels that are
    already 0 / 1 it would map every row to 0.
    """
    polar = reviews.loc[reviews['label'] != 3]
    return polar.assign(label=np.where(polar['label'] < 3, 0, 1))


train_data = _binarize_labels(train_data)
test_data = _binarize_labels(test_data)

# Sample Data

In [6]:
def _stratified_subsample(frame):
    """Return a SAMPLE_SIZE fraction of `frame`, preserving the label distribution.

    train_test_split is used purely as a stratified sampler: the "train" part is the
    sample that is kept and the "test" part (1 - SAMPLE_SIZE of the rows) is dropped.
    The dropped part stays local to this function, so it is released on return instead
    of being kept alive in a notebook-level `_` variable (which would also shadow
    IPython's last-output variable).
    """
    kept, _discarded = train_test_split(
        frame,
        test_size=(1 - SAMPLE_SIZE),
        stratify=frame['label'],
        random_state=RANDOM_SEED,
    )
    return kept


# Work on a small stratified sample to speed up the experiments.
train_data = _stratified_subsample(train_data)
test_data = _stratified_subsample(test_data)

# Preprocessing Pipeline

The pipeline will consist of the following steps:

1. Data Cleaning:
    - Convert all words to lowercase.
    - Remove stopwords.
    - Remove punctuation.
    - Remove URLs.
    - Remove handles (e.g., Twitter handles).
    - Remove emojis.
    - Remove extra spaces.

2. Stemming:
    - Reduce words to their root form using a stemming algorithm.

3. Vectorization:
    - Convert text into a matrix of token counts.
    - Set the ngram_range parameter to (1, 2) to include both individual words and pairs of consecutive words.
    - Use the predefined constant MAX_FEATURES to cap the vocabulary: only the MAX_FEATURES most frequent terms (single words and word pairs) are kept, and less frequent terms are discarded.

4. TF-IDF Transformation:
    - Transform the matrix of token counts into a normalized TF-IDF representation.
    - This step down-weights terms that occur in many reviews and up-weights terms that occur in only a few, which tend to be more informative.

In [7]:
# The notebook works on the sampled subsets only, to keep the analysis and the model
# comparison fast. Separate the review text (features) from the labels (targets)
# for each split.
X_train, y_train = train_data['review'], train_data['label']
X_test, y_test = test_data['review'], test_data['label']

In [8]:
# Assemble the preprocessing pipeline: clean -> stem -> count unigrams/bigrams -> TF-IDF.
_preprocessing_steps = [
    ('cleaner', DataCleaner()),
    ('stemmer', Stemmer()),
    ('vectorizer', CountVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)),
    ('tfidf', TfidfTransformer()),
]

# memory=CACHE_DIR lets scikit-learn cache fitted transformers on disk;
# verbose=True prints the time spent in each step.
preprocessing_pipeline = Pipeline(
    steps=_preprocessing_steps,
    memory=CACHE_DIR,
    verbose=True,
)

In [9]:
# Preprocess the data.
# The pipeline is fitted on the training reviews only (fit_transform); the test reviews
# are then transformed with the already-fitted pipeline, so nothing is fitted on them.
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train)
X_test_preprocessed = preprocessing_pipeline.transform(X_test)

[Pipeline] ........... (step 1 of 4) Processing cleaner, total=   6.8s
[Pipeline] ........... (step 2 of 4) Processing stemmer, total=   7.5s
[Pipeline] ........ (step 3 of 4) Processing vectorizer, total=   1.7s
[Pipeline] ............. (step 4 of 4) Processing tfidf, total=   0.1s


# Compare different models

In [None]:
def score_model(model):
    """Evaluate a fitted binary classifier on the notebook's test split.

    Reads the notebook-level `X_test_preprocessed` and `y_test`.

    Parameters
    ----------
    model : fitted estimator
        Must implement `predict` and at least one of `decision_function` or
        `predict_proba`.

    Returns
    -------
    tuple
        (precision, recall, f1, accuracy, roc_auc) computed with `sklearn.metrics`.
    """
    y_pred = model.predict(X_test_preprocessed)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred)

    # ROC AUC needs a continuous score. Choose the source with an explicit capability
    # check instead of a bare `except`, which would also swallow KeyboardInterrupt and
    # hide genuine errors raised while scoring.
    if hasattr(model, 'decision_function'):
        ranking_scores = model.decision_function(X_test_preprocessed)
    else:
        # Column 1 is the predicted probability of the positive class.
        ranking_scores = model.predict_proba(X_test_preprocessed)[:, 1]
    roc_auc = metrics.roc_auc_score(y_test, ranking_scores)

    return precision, recall, f1, accuracy, roc_auc

In [None]:
def plot_confusion_matrix(model, model_name, accuracy, ax):
    """Draw the confusion matrix of `model` on the test split as a heatmap.

    Each cell shows its share of all test samples (counts divided by the grand total),
    formatted as a percentage. Reads the notebook-level `X_test_preprocessed` and
    `y_test`.

    Parameters
    ----------
    model : fitted estimator implementing `predict`.
    model_name : str, shown in the plot title.
    accuracy : float, shown in the plot title with three decimals.
    ax : matplotlib Axes to draw on.
    """
    predictions = model.predict(X_test_preprocessed)
    counts = metrics.confusion_matrix(y_test, predictions)
    shares = counts / counts.sum()

    heatmap_options = dict(annot=True, fmt='.2%', cmap='Blues', cbar=False)
    sns.heatmap(shares, ax=ax, **heatmap_options)

    ax.set_title(f"{model_name}\nAccuracy: {accuracy:.3f}")
    ax.set(xlabel='Predicted', ylabel='True')


In [None]:
# Candidate classifiers, keyed by the display name used in the results table and plots.
# Estimators that involve randomness are seeded with RANDOM_SEED so that a
# Restart & Run All reproduces the same models.
classifiers = {
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(),
    # random_state only matters for the solvers that shuffle the data; harmless otherwise.
    'Logistic Regression': LogisticRegression(random_state=RANDOM_SEED),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_SEED),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_SEED)
}

In [None]:
# Hyperparameter search space for each classifier (keys match `classifiers`).
param_distributions = {
    # Empty space: Naive Bayes is evaluated with its default settings.
    'Naive Bayes': {},
    'SVM': {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf', 'poly', 'sigmoid']},
    # Only penalty/solver pairs that can actually be fitted are listed:
    #   * the default 'lbfgs' solver rejects 'l1' and 'elasticnet', so solvers that
    #     support both 'l1' and 'l2' are sampled explicitly;
    #   * 'elasticnet' additionally needs an `l1_ratio`, which this space does not provide;
    #   * the string 'none' is rejected by newer scikit-learn releases and makes `C`
    #     meaningless.
    # Unfittable candidates would only waste search iterations and yield NaN scores.
    'Logistic Regression': {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
    },
    'Random Forest': {'n_estimators': [10, 50, 100, 200], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]},
    'Gradient Boosting': {'n_estimators': [10, 50, 100, 200], 'learning_rate': [0.1, 0.01, 0.001], 'max_depth': [3, 10, 20]}
}

In [None]:
# Tune every classifier with a randomized search and score its best estimator on the
# test split. One record per classifier is collected and the DataFrame is built once
# at the end: DataFrame.append was removed in pandas 2.0 and growing a frame row by
# row is quadratic.
score_records = []

for classifier_name, classifier in classifiers.items():
    random_search = RandomizedSearchCV(
        classifier,
        param_distributions=param_distributions[classifier_name],
        n_iter=10,
        cv=5,
        n_jobs=-1,
        random_state=RANDOM_SEED,  # makes the sampled candidates reproducible
    )
    random_search.fit(X_train_preprocessed, y_train)

    precision, recall, f1, accuracy, roc_auc = score_model(random_search.best_estimator_)
    score_records.append({
        'Classifier': classifier_name,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Accuracy': accuracy,
        'ROC AUC Score': roc_auc
    })

results = pd.DataFrame(
    score_records,
    columns=['Classifier', 'Precision', 'Recall', 'F1 Score', 'Accuracy', 'ROC AUC Score'],
)
results

In [None]:
# Create one subplot per classifier. squeeze=False always returns a 2-D array of Axes,
# so the loop below also works when only a single classifier is configured.
fig, axes = plt.subplots(nrows=len(classifiers), figsize=(5, 5 * len(classifiers)), squeeze=False)
axes = axes.ravel()

# Run the randomized hyperparameter search for every classifier.
for ax, (classifier_name, classifier) in zip(axes, classifiers.items()):
    random_search = RandomizedSearchCV(
        classifier,
        param_distributions=param_distributions[classifier_name],
        n_iter=10,
        cv=5,
        n_jobs=-1,
        random_state=RANDOM_SEED,  # same candidates on every run
    )
    random_search.fit(X_train_preprocessed, y_train)

    # Accuracy of the best model found by the search, on the test split.
    accuracy = metrics.accuracy_score(y_test, random_search.predict(X_test_preprocessed))

    # Draw the confusion matrix of the best model.
    plot_confusion_matrix(random_search.best_estimator_, classifier_name, accuracy, ax)

# Show the figure.
plt.tight_layout()
plt.show()