In [7]:
# Import libraries
import numpy as np
import optuna
import matplotlib
import matplotlib.image as mpimg
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

%matplotlib inline

2024-04-02 21:34:57.461880: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Train data (with images)
traindata = pd.read_csv('multimodal_train.tsv',sep='\t')

# Validation data (with images)
validata = pd.read_csv('multimodal_validate.tsv',sep='\t')

# Test data (with  images)
testdata = pd.read_csv('multimodal_test_public.tsv',sep='\t')

# Train data (with and without images)
traindata_all = pd.read_csv('all_train.tsv',sep='\t')

# Validation data (with and without images)
validata_all = pd.read_csv('all_validate.tsv',sep='\t')

# Test data (with and without images)
testdata_all = pd.read_csv('all_test_public.tsv',sep='\t')

In [14]:

# MULTIMODAL DATASET
# Train data with no missing values
train_data = traindata[traindata['clean_title'].notnull().to_numpy()]

# Validation data with no missing values
valid_data = validata[validata['clean_title'].notnull().to_numpy()]

# Test data with no missing values
test_data = testdata[testdata['clean_title'].notnull().to_numpy()]

# UNIMODAL DATASET
train_data_all = traindata_all[traindata_all['clean_title'].notnull().to_numpy()]

# Validation data with no missing values
valid_data_all = validata_all[validata_all['clean_title'].notnull().to_numpy()]

# Test data with no missing values
test_data_all = testdata_all[testdata_all['clean_title'].notnull().to_numpy()]

In [15]:
# Get series object with the necessary columns

# MULTIMODAL DATASET
# Train data (text and labels)
train_frame = train_data["clean_title"]
train_labels = train_data["6_way_label"]

# Validation data (text and labels)
valid_frame = valid_data["clean_title"]
valid_labels = valid_data["6_way_label"]

# Test data (text and labels)
test_frame = test_data["clean_title"]
test_labels = test_data["6_way_label"]

# UNIMODAL DATASET
train_frame_all = train_data_all["clean_title"]
train_labels_all = train_data_all["6_way_label"]

# Validation data (text and labels)
valid_frame_all = valid_data_all["clean_title"]
valid_labels_all = valid_data_all["6_way_label"]

# Test data (text and labels)
test_frame_all = test_data_all["clean_title"]
test_labels_all = test_data_all["6_way_label"]

## Multi-Class Text Classification

In [23]:
# Get series object with the necessary columns

# Train data (text and labels)
train_frame = train_data["clean_title"]
train_labels = train_data["6_way_label"]

# Validation data (text and labels)
valid_frame = valid_data["clean_title"]
valid_labels = valid_data["6_way_label"]

# Test data (text and labels)
test_frame = test_data["clean_title"]
test_labels = test_data["6_way_label"]

In [24]:
labels = ['True','Satire','False connection','Imposter content','Manipulated content','Misleading content']
# Train data
train_0 = sum(train_labels == 0)/len(train_labels)
train_1 = sum(train_labels == 1)/len(train_labels)
train_2 = sum(train_labels == 2)/len(train_labels)
train_3 = sum(train_labels == 3)/len(train_labels)
train_4 = sum(train_labels == 4)/len(train_labels)
train_5 = sum(train_labels == 5)/len(train_labels)
# Validation data
validation_0 = sum(valid_labels == 0)/len(valid_labels)
validation_1 = sum(valid_labels == 1)/len(valid_labels)
validation_2 = sum(valid_labels == 2)/len(valid_labels)
validation_3 = sum(valid_labels == 3)/len(valid_labels)
validation_4 = sum(valid_labels == 4)/len(valid_labels)
validation_5 = sum(valid_labels == 5)/len(valid_labels)
# Test data
test_0 = sum(test_labels == 0)/len(test_labels)
test_1 = sum(test_labels == 1)/len(test_labels)
test_2 = sum(test_labels == 2)/len(test_labels)
test_3 = sum(test_labels == 3)/len(test_labels)
test_4 = sum(test_labels == 4)/len(test_labels)
test_5 = sum(test_labels == 5)/len(test_labels)

d = {"Label":labels, "Traindata": [train_0, train_1, train_2, train_3, train_4, train_5], 
                  "Validata": [validation_0, validation_1, validation_2, validation_3, validation_4, validation_5],
                  "Testdata": [test_0, test_1, test_2, test_3, test_4, test_5]}

proportion_data = pd.DataFrame(data = d)
proportion_data

Unnamed: 0,Label,Traindata,Validata,Testdata
0,True,0.393761,0.392976,0.396281
1,Satire,0.059363,0.059334,0.059239
2,False connection,0.190108,0.190034,0.190445
3,Imposter content,0.020894,0.020862,0.020634
4,Manipulated content,0.297619,0.300125,0.294543
5,Misleading content,0.038255,0.036669,0.038858


## Hyperparameter tuning

### Preprocessing


In [25]:
# Convert 'series' data to list

## Texts ##

# Train
train_list = list(train_frame)
# Valid
valid_list = list(valid_frame)
# Test
test_list = list(test_frame)

## Labels ##

# Train
train_labels_list = list(train_labels)
# Valid
valid_labels_list = list(valid_labels)
# Test
test_labels_list = list(test_labels)

In [26]:
def preprocess_text(sen):
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [27]:
# Remove puntuations and numbers and multiple spaces

train_news_clean_1 = []
valid_news_clean_1 = []
test_news_clean_1 = []
# Train
for new in train_list:
    train_news_clean_1.append(preprocess_text(new))
# Validation
for new in valid_list:
    valid_news_clean_1.append(preprocess_text(new))
# Test
for new in test_list:
    test_news_clean_1.append(preprocess_text(new))

### Stemmed and lemmatized data

#### Stemmed data

In [28]:
# Initialize stemmer and stop_words
import nltk
nltk.download('stopwords')
nltk.download('punkt')
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english')) 

# Function to remove stopwords
def remove_stopwords_stem(text):
    text = word_tokenize(text)
    # Stop words removal
    text = [word for word in text if word not in stop_words]
    # Stemming
    stemmed_text = [stemmer.stem(word) for word in text]    
    text_done = ' '.join(stemmed_text)
    return text_done

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/davendra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/davendra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


We remove stop words and perform stemming

In [29]:
# Train stemmed
train_stemmed = [remove_stopwords_stem(text) for text in train_news_clean_1]
# Validation stemmed
valid_stemmed = [remove_stopwords_stem(text) for text in valid_news_clean_1]
# Test stemmed
test_stemmed = [remove_stopwords_stem(text) for text in test_news_clean_1]

#### Lemmatized data

Function to remove stop words and perform lemmatization

In [30]:
# Initialize lemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Initialize stemmer and stop_words
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english')) 

# Function to remove stopwords
def remove_stopwords_lem(text):
    text = word_tokenize(text)
    # Stop words removal
    text = [word for word in text if word not in stop_words]
    # Lematization
    lemmatized_text = []
    for word in text:
        word1 = lemmatizer.lemmatize(word, pos = "n")
        word2 = lemmatizer.lemmatize(word1, pos = "v")
        word3 = lemmatizer.lemmatize(word2, pos = ("a"))
        lemmatized_text.append(word3) 
    text_done = ' '.join(lemmatized_text)
    return text_done

[nltk_data] Downloading package wordnet to /home/davendra/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


We remove stop words and perform lemmatiztion

In [35]:
# Train lemmatized
train_lemmatized = [remove_stopwords_lem(text) for text in train_news_clean_1]
# Validation lemmatized
valid_lemmatized = [remove_stopwords_lem(text) for text in valid_news_clean_1]
# Test lemmatized
test_lemmatized = [remove_stopwords_lem(text) for text in test_news_clean_1]

### Stemming

#### Multinomial Naive Bayes

In [51]:
class NaiveBayesOptimizer:
    def __init__(self, train_stemmed, train_labels, valid_stemmed, valid_labels):
        self.train_stemmed = train_stemmed
        self.train_labels = train_labels
        self.valid_stemmed = valid_stemmed
        self.valid_labels = valid_labels

    def create_model(self, n, sub_tf, min_df):
        return Pipeline([
            ('vect', CountVectorizer(ngram_range=(1, n), min_df=min_df)),
            ('tfidf', TfidfTransformer(sublinear_tf=sub_tf)),
            ('classifier', MultinomialNB())
        ])

    def objective(self, trial):
        n = trial.suggest_int("n", 1, 2)
        sub_tf = trial.suggest_categorical("sub_tf", ["True", "False"])
        min_df = trial.suggest_int("min_df", 5, 25)

        model = self.create_model(n, sub_tf, min_df)
        model.fit(self.train_stemmed, self.train_labels)
        predictions = model.predict(self.valid_stemmed)
        acc = accuracy_score(self.valid_labels, predictions)

        return acc

    def optimize(self, budget):
        np.random.seed(0)
        study = optuna.create_study(direction="maximize")
        study.optimize(self.objective, n_trials=budget, show_progress_bar=False)
        return study.best_params, study.best_value

# Initialize optimizer
optimizer = NaiveBayesOptimizer(train_stemmed, train_labels_list, valid_stemmed, valid_labels_list)

# Set budget
budget = 40

# Optimize hyper-parameters
best_params, best_score = optimizer.optimize(budget)

# Print results
print("Best hyper-parameters: ")
print(best_params)
print("Best score: ")
print(best_score)

[I 2024-04-02 22:20:45,434] A new study created in memory with name: no-name-3bff0c8a-fd42-4b58-98c2-e387cd928596


[I 2024-04-02 22:20:50,043] Trial 0 finished with value: 0.648663678339119 and parameters: {'n': 2, 'min_df': 6, 'sub_tf': 0}. Best is trial 0 with value: 0.648663678339119.
[I 2024-04-02 22:20:51,917] Trial 1 finished with value: 0.6272623099996629 and parameters: {'n': 1, 'min_df': 17, 'sub_tf': 1}. Best is trial 0 with value: 0.648663678339119.
[I 2024-04-02 22:20:53,836] Trial 2 finished with value: 0.6253243908193186 and parameters: {'n': 1, 'min_df': 21, 'sub_tf': 0}. Best is trial 0 with value: 0.648663678339119.
[I 2024-04-02 22:20:55,735] Trial 3 finished with value: 0.6288463482862053 and parameters: {'n': 1, 'min_df': 15, 'sub_tf': 0}. Best is trial 0 with value: 0.648663678339119.
[I 2024-04-02 22:20:57,669] Trial 4 finished with value: 0.6302618718614135 and parameters: {'n': 1, 'min_df': 12, 'sub_tf': 1}. Best is trial 0 with value: 0.648663678339119.
[I 2024-04-02 22:20:59,617] Trial 5 finished with value: 0.6232685113410401 and parameters: {'n': 1, 'min_df': 25, 'sub_tf

#### Logistic Regression

In [48]:
class LogisticRegressionOptimizer:
    def __init__(self, train_stemmed, train_labels, valid_stemmed, valid_labels):
        self.train_stemmed = train_stemmed
        self.train_labels = train_labels
        self.valid_stemmed = valid_stemmed
        self.valid_labels = valid_labels

    def create_model(self, max_iter, solver, multi_class, n, min_df, sub_tf):
        return Pipeline([
            ('vect', CountVectorizer(ngram_range=(1, n), min_df=min_df)),
            ('tfidf', TfidfTransformer(sublinear_tf=sub_tf)),
            ('classifier', LogisticRegression(random_state=3, solver=solver, multi_class=multi_class, max_iter=max_iter))
        ])

    def objective(self, trial):
        max_iter = trial.suggest_int("max_iter", 320, 420)
        solver = trial.suggest_categorical("solver", ["newton-cg"])
        multi_class = trial.suggest_categorical("multi_class", ["ovr", "multinomial"])
        n = trial.suggest_int("n", 1, 2)
        min_df = trial.suggest_int("min_df", 5, 25)
        sub_tf = trial.suggest_categorical("sub_tf", ["True", "False"])

        model = self.create_model(max_iter, solver, multi_class, n, min_df, sub_tf)
        model.fit(self.train_stemmed, self.train_labels)
        predictions = model.predict(self.valid_stemmed)
        acc = accuracy_score(self.valid_labels, predictions)

        return acc

    def optimize(self, budget):
        np.random.seed(0)
        study = optuna.create_study(direction="maximize")
        study.optimize(self.objective, n_trials=budget, show_progress_bar=False)
        return study.best_params, study.best_value

# Initialize optimizer
optimizer = LogisticRegressionOptimizer(train_stemmed, train_labels_list, valid_stemmed, valid_labels_list)

# Set budget
budget = 40

# Optimize hyper-parameters
best_params, best_score = optimizer.optimize(budget)

# Print results
print("Best hyper-parameters: ")
print(best_params)
print("Best score: ")
print(best_score)

[I 2024-04-02 22:03:41,874] A new study created in memory with name: no-name-64704bfc-611b-4a65-8e3a-5d600814ed2a
[I 2024-04-02 22:03:51,184] Trial 0 finished with value: 0.6948367092447171 and parameters: {'max_iter': 404, 'solver': 'newton-cg', 'multi_class': 'ovr', 'n': 1, 'min_df': 18, 'sub_tf': 1}. Best is trial 0 with value: 0.6948367092447171.
[I 2024-04-02 22:04:00,597] Trial 1 finished with value: 0.6965050048869267 and parameters: {'max_iter': 366, 'solver': 'newton-cg', 'multi_class': 'ovr', 'n': 1, 'min_df': 7, 'sub_tf': 0}. Best is trial 1 with value: 0.6965050048869267.
[I 2024-04-02 22:04:09,883] Trial 2 finished with value: 0.6950220754271847 and parameters: {'max_iter': 332, 'solver': 'newton-cg', 'multi_class': 'ovr', 'n': 1, 'min_df': 17, 'sub_tf': 1}. Best is trial 1 with value: 0.6965050048869267.
[I 2024-04-02 22:04:19,453] Trial 3 finished with value: 0.6963364901755923 and parameters: {'max_iter': 356, 'solver': 'newton-cg', 'multi_class': 'ovr', 'n': 1, 'min_df

Best hyper-parameters: 
{'max_iter': 411, 'solver': 'newton-cg', 'multi_class': 'multinomial', 'n': 2, 'min_df': 5, 'sub_tf': 0}
Best score: 
0.7144855245862964


#### Random Forest

In [None]:
class RandomForestOptimizer:
    def __init__(self, train_stemmed, train_labels, valid_stemmed, valid_labels):
        self.train_stemmed = train_stemmed
        self.train_labels = train_labels
        self.valid_stemmed = valid_stemmed
        self.valid_labels = valid_labels

    def create_model(self, n_estimators, criterion, max_depth, n, min_df, sub_tf):
        return Pipeline([
            ('vect', CountVectorizer(ngram_range=(1, n), min_df=min_df)),
            ('tfidf', TfidfTransformer(sublinear_tf=sub_tf)),
            ('classifier', RandomForestClassifier(
                random_state=3, n_estimators=n_estimators, criterion=criterion,
                max_depth=max_depth
            ))
        ])

    def objective(self, trial):
        n_estimators = trial.suggest_int("n_estimators", 100, 300)
        criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
        max_depth = trial.suggest_int("max_depth", 3, 6)
        n = trial.suggest_int("n", 1, 2)
        min_df = trial.suggest_int("min_df", 5, 25)
        sub_tf = trial.suggest_categorical("sub_tf", ["True", "False"])

        model = self.create_model(n_estimators, criterion, max_depth, n, min_df, sub_tf)
        model.fit(self.train_stemmed, self.train_labels)
        predictions = model.predict(self.valid_stemmed)
        accuracy = accuracy_score(self.valid_labels, predictions)

        return accuracy

    def optimize(self, budget):
        np.random.seed(0)
        study = optuna.create_study(direction="maximize")
        study.optimize(self.objective, n_trials=budget, show_progress_bar=False)
        return study.best_params, study.best_value

# Initialize optimizer
optimizer = RandomForestOptimizer(train_stemmed, train_labels_list, valid_stemmed, valid_labels_list)

# Set budget
budget = 40

# Optimize hyper-parameters
best_params, best_score = optimizer.optimize(budget)

# Print results
print("Best hyper-parameters: ")
print(best_params)
print("Best score: ")
print(best_score)


### Lemmatization

#### Naive Bayes

In [None]:
class BayesModel:
    def __init__(self, train_lemmatized, train_labels_list, valid_lemmatized, valid_labels_list):
        self.train_lemmatized = train_lemmatized
        self.train_labels_list = train_labels_list
        self.valid_lemmatized = valid_lemmatized
        self.valid_labels_list = valid_labels_list

    def create_model(self, n, sub_tf, min_df):
        Bayes_pipe = Pipeline([
            ('vect', CountVectorizer(ngram_range = (1, n), min_df = min_df)),
            ('tfidf', TfidfTransformer(sublinear_tf = sub_tf)),
            ('classifier', MultinomialNB() )
        ])
        return Bayes_pipe

    def train_and_evaluate(self, trial):
        # Sample values for the hyper-parameters
        n = trial.suggest_int("n", 1, 2)
        sub_tf = trial.suggest_int("sub_tf", 0, 1) == 1
        min_df = trial.suggest_int("min_df",5,25)

        # Create and fit model
        clf_Bayes = self.create_model(n, sub_tf, min_df)
        clf_Bayes.fit(self.train_lemmatized, self.train_labels_list)

        # Obtain the predictions and accuracy
        predictions = clf_Bayes.predict(self.valid_lemmatized)
        acc = accuracy_score(self.valid_labels_list, predictions)

        return acc

    def optimize(self, budget):
        # Select budget and set seed
        np.random.seed(0)

        # Optimize hyper-parameters
        study_Bayes = optuna.create_study(direction="maximize")
        study_Bayes.optimize(lambda trial: self.train_and_evaluate(trial), n_trials=budget, show_progress_bar=False)

        # Best hyper-parameters
        print("Best hyper-parameters: ")
        print(study_Bayes.best_params)
        # Best score
        print("Best score: ")
        print(study_Bayes.best_value)

# Initialize the model
bayes_model = BayesModel(train_lemmatized, train_labels_list, valid_lemmatized, valid_labels_list)

# Optimize hyper-parameters
bayes_model.optimize(budget=40)

[32m[I 2021-07-20 10:47:58,150][0m A new study created in memory with name: no-name-4796fa05-0079-4e15-b342-d759c4dec0b5[0m
[32m[I 2021-07-20 10:48:03,727][0m Trial 0 finished with value: 0.6253243908193186 and parameters: {'n': 1, 'sub_tf': 'False', 'min_df': 22}. Best is trial 0 with value: 0.6253243908193186.[0m
[32m[I 2021-07-20 10:48:09,156][0m Trial 1 finished with value: 0.6284587644501365 and parameters: {'n': 1, 'sub_tf': 'True', 'min_df': 16}. Best is trial 1 with value: 0.6284587644501365.[0m
[32m[I 2021-07-20 10:48:26,419][0m Trial 2 finished with value: 0.6436250884702235 and parameters: {'n': 2, 'sub_tf': 'True', 'min_df': 11}. Best is trial 2 with value: 0.6436250884702235.[0m
[32m[I 2021-07-20 10:48:43,660][0m Trial 3 finished with value: 0.6447204340938963 and parameters: {'n': 2, 'sub_tf': 'True', 'min_df': 10}. Best is trial 3 with value: 0.6447204340938963.[0m
[32m[I 2021-07-20 10:49:00,813][0m Trial 4 finished with value: 0.6348960264231067 and par

Best hyper-parameters: 
{'n': 2, 'sub_tf': 'True', 'min_df': 5}
Best score: 
0.650230865154528


#### Logistic Regression

In [None]:
class LogisticRegressionOptimizer:
    def __init__(self, train_lemmatized, train_labels_list, valid_lemmatized, valid_labels_list):
        self.train_lemmatized = train_lemmatized
        self.train_labels_list = train_labels_list
        self.valid_lemmatized = valid_lemmatized
        self.valid_labels_list = valid_labels_list

    def create_model(self, n, min_df, sub_tf, max_iter, solver, multi_class):
        return Pipeline([('vect', CountVectorizer(ngram_range = (1, n), min_df = min_df)),
                        ('tfidf', TfidfTransformer(sublinear_tf = sub_tf)),
                        ('classifier', LogisticRegression(random_state = 3,
                                        solver = solver, multi_class = multi_class,   max_iter = max_iter ))])

    def objective(self, trial):
        max_iter = trial.suggest_int("max_iter", 320, 420)
        solver = trial.suggest_categorical("solver", ["newton-cg"])
        multi_class = trial.suggest_categorical("multi_class",["ovr", "multinomial"])
        n = trial.suggest_int("n", 1, 2)
        min_df = trial.suggest_int("min_df",5,25)
        sub_tf = trial.suggest_categorical("sub_tf", ["True", "False"])

        clf_Logistic = self.create_model(n, min_df, sub_tf, max_iter, solver, multi_class)
        clf_Logistic.fit(self.train_lemmatized, self.train_labels_list)
        predictions = clf_Logistic.predict(self.valid_lemmatized)
        acc = accuracy_score(self.valid_labels_list, predictions)

        return acc

    def optimize(self, budget):
        np.random.seed(0)
        study = optuna.create_study(direction="maximize")
        study.optimize(self.objective, n_trials=budget, show_progress_bar=False)
        return study.best_params, study.best_value

# Initialize optimizer
optimizer = LogisticRegressionOptimizer(train_lemmatized, train_labels_list, valid_lemmatized, valid_labels_list)

# Set budget
budget = 40

# Optimize hyper-parameters
best_params, best_score = optimizer.optimize(budget)

# Print results
print("Best hyper-parameters: ")
print(best_params)
print("Best score: ")
print(best_score)

[32m[I 2021-07-20 10:58:40,681][0m A new study created in memory with name: no-name-e26fe76b-6220-4817-a426-a720ba3e5d93[0m
[32m[I 2021-07-20 10:59:48,879][0m Trial 0 finished with value: 0.6963027872333255 and parameters: {'max_iter': 370, 'solver': 'newton-cg', 'multi_class': 'ovr', 'n': 1, 'min_df': 8, 'sub_tf': 'True'}. Best is trial 0 with value: 0.6963027872333255.[0m
[32m[I 2021-07-20 11:01:59,987][0m Trial 1 finished with value: 0.6993360520373428 and parameters: {'max_iter': 329, 'solver': 'newton-cg', 'multi_class': 'multinomial', 'n': 1, 'min_df': 14, 'sub_tf': 'False'}. Best is trial 1 with value: 0.6993360520373428.[0m
[32m[I 2021-07-20 11:04:12,043][0m Trial 2 finished with value: 0.7006673182568838 and parameters: {'max_iter': 362, 'solver': 'newton-cg', 'multi_class': 'multinomial', 'n': 1, 'min_df': 5, 'sub_tf': 'False'}. Best is trial 2 with value: 0.7006673182568838.[0m
[32m[I 2021-07-20 11:06:28,408][0m Trial 3 finished with value: 0.7081156684978599 a

Best hyper-parameters: 
{'max_iter': 390, 'solver': 'newton-cg', 'multi_class': 'multinomial', 'n': 2, 'min_df': 5, 'sub_tf': 'True'}
Best score: 
0.7150921775470999


#### Random Forest

In [None]:
class ForestModel:
    def __init__(self, train_lemmatized, train_labels_list, valid_lemmatized, valid_labels_list):
        self.train_lemmatized = train_lemmatized
        self.train_labels_list = train_labels_list
        self.valid_lemmatized = valid_lemmatized
        self.valid_labels_list = valid_labels_list

    def create_model(self, n_estimators, criterion, max_depth, n, min_df, sub_tf):
        Forest_pipe = Pipeline([
            ('vect', CountVectorizer(ngram_range = (1, n), min_df = min_df)),
            ('tfidf', TfidfTransformer(sublinear_tf = sub_tf)),
            ('classifier', RandomForestClassifier(
                random_state = 3, n_estimators = n_estimators, criterion = criterion,
                max_depth = max_depth ) )
        ])
        return Forest_pipe

    def train_and_evaluate(self, trial):
        # Sample values for the hyper-parameters
        n_estimators = trial.suggest_int("n_estimators", 100, 300)
        criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
        max_depth = trial.suggest_int("max_depth", 3, 6)
        n = trial.suggest_int("n", 1, 2)
        min_df = trial.suggest_int("min_df",5,25)
        sub_tf = trial.suggest_categorical("sub_tf", ["True","False"])

        # Create and fit model
        clf_Forest = self.create_model(n_estimators, criterion, max_depth, n, min_df, sub_tf)
        clf_Forest.fit(self.train_lemmatized, self.train_labels_list)

        # Obtain the predictions and accuracy
        predictions = clf_Forest.predict(self.valid_lemmatized)
        acc = accuracy_score(self.valid_labels_list, predictions)

        return acc

    def optimize(self, budget):
        # Select budget and set seed
        np.random.seed(0)

        # Optimize hyper-parameters
        study_Forest = optuna.create_study(direction="maximize")
        study_Forest.optimize(lambda trial: self.train_and_evaluate(trial), n_trials=budget, show_progress_bar=False)

        # Best hyper-parameters
        print("Best hyper-parameters: ")
        print(study_Forest.best_params)
        # Best score
        print("Best score: ")
        print(study_Forest.best_value)

# Initialize the model
forest_model = ForestModel(train_lemmatized, train_labels_list, valid_lemmatized, valid_labels_list)

# Optimize hyper-parameters
forest_model.optimize(budget=40)

[32m[I 2021-07-20 13:44:29,932][0m A new study created in memory with name: no-name-1de3f99b-1a25-4e07-8a7f-0a788f62457b[0m
[32m[I 2021-07-20 13:44:58,103][0m Trial 0 finished with value: 0.3929763068315864 and parameters: {'n_estimators': 158, 'criterion': 'gini', 'max_depth': 4, 'n': 1, 'min_df': 25, 'sub_tf': 'False'}. Best is trial 0 with value: 0.3929763068315864.[0m
[32m[I 2021-07-20 13:45:38,170][0m Trial 1 finished with value: 0.3929763068315864 and parameters: {'n_estimators': 258, 'criterion': 'gini', 'max_depth': 4, 'n': 1, 'min_df': 10, 'sub_tf': 'True'}. Best is trial 0 with value: 0.3929763068315864.[0m
[32m[I 2021-07-20 13:46:15,293][0m Trial 2 finished with value: 0.3929763068315864 and parameters: {'n_estimators': 110, 'criterion': 'entropy', 'max_depth': 6, 'n': 2, 'min_df': 15, 'sub_tf': 'False'}. Best is trial 0 with value: 0.3929763068315864.[0m
[32m[I 2021-07-20 13:46:49,297][0m Trial 3 finished with value: 0.3929763068315864 and parameters: {'n_esti

Best hyper-parameters: 
{'n_estimators': 118, 'criterion': 'entropy', 'max_depth': 6, 'n': 1, 'min_df': 14, 'sub_tf': 'True'}
Best score: 
0.4065754440362644


## Model Evaluation

### Stemming 



In [None]:
# Join train and validation sets
training_stemmed = train_stemmed + valid_stemmed
training_lemmatized =  train_lemmatized + valid_lemmatized

# Joining train and validation labels
training_labels = train_labels_list + valid_labels_list

#### Naive Bayes

In [None]:
## Best hyper-parameters
#'n': 2, 'sub_tf': 'False', 'min_df': 5

# Training
Bayes_pipe = Pipeline([('vect', CountVectorizer(ngram_range = (1, 2), min_df = 5)),
                            ('tfidf', TfidfTransformer(sublinear_tf = 'False')),('classifier', MultinomialNB() )])
Bayes_pipe.fit(training_stemmed, training_labels)

# Evaluation of the model
predictions_Bayes_stem = Bayes_pipe.predict(test_stemmed)
print(classification_report(np.array(test_labels_list).reshape(len(test_labels),1),predictions_Bayes_stem))

              precision    recall  f1-score   support

           0       0.61      0.90      0.73     23507
           1       0.84      0.10      0.17      3514
           2       0.72      0.44      0.55     11297
           3       0.91      0.01      0.02      1224
           4       0.69      0.66      0.68     17472
           5       0.92      0.33      0.48      2305

    accuracy                           0.65     59319
   macro avg       0.78      0.41      0.44     59319
weighted avg       0.69      0.65      0.62     59319



In [None]:
# Classification report without 0 label
print(classification_report(np.array(test_labels).reshape(len(test_labels),1),predictions_Bayes_stem, labels = [1,2,3,4,5]))

              precision    recall  f1-score   support

           1       0.84      0.10      0.17      3514
           2       0.72      0.44      0.55     11297
           3       0.91      0.01      0.02      1224
           4       0.69      0.66      0.68     17472
           5       0.92      0.33      0.48      2305

   micro avg       0.71      0.49      0.58     35812
   macro avg       0.82      0.31      0.38     35812
weighted avg       0.74      0.49      0.55     35812



In [None]:
# Confusion matrix
print(confusion_matrix(np.array(test_labels).reshape(len(test_labels),1),predictions_Bayes_stem))

[[21047    21   826     0  1605     8]
 [ 1927   334   213     0  1033     7]
 [ 4476    17  4994     0  1772    38]
 [  782     2    71    10   355     4]
 [ 5242    18   616     1 11584    11]
 [  979     6   230     0   334   756]]


#### Logistic Regression

In [None]:
## Best hyper-parameters
# 'max_iter': 373, 'solver': 'newton-cg', 'multi_class': 'multinomial', 'n': 2, 'min_df': 5, 'sub_tf': 'True'

# Training
Logistic_pipe = Pipeline([('vect', CountVectorizer(ngram_range = (1, 2), min_df = 5)),
                            ('tfidf', TfidfTransformer(sublinear_tf = 'True')),('classifier', LogisticRegression(random_state = 3,
                                    solver = 'newton-cg', multi_class = 'multinomial',   max_iter = 373 ) )])
Logistic_pipe.fit(training_stemmed, training_labels)

# Evaluation of the model
predictions_Logistic_stem = Logistic_pipe.predict(test_stemmed)
print(classification_report(np.array(test_labels_list).reshape(len(test_labels),1),predictions_Logistic_stem))

              precision    recall  f1-score   support

           0       0.72      0.86      0.79     23507
           1       0.64      0.24      0.34      3514
           2       0.69      0.54      0.60     11297
           3       0.63      0.12      0.20      1224
           4       0.73      0.81      0.77     17472
           5       0.78      0.52      0.62      2305

    accuracy                           0.72     59319
   macro avg       0.70      0.51      0.55     59319
weighted avg       0.71      0.72      0.70     59319



In [None]:
# Classification report without 0 label
print(classification_report(np.array(test_labels).reshape(len(test_labels),1),predictions_Logistic_stem, labels = [1,2,3,4,5]))

              precision    recall  f1-score   support

           1       0.64      0.24      0.34      3514
           2       0.69      0.54      0.60     11297
           3       0.63      0.12      0.20      1224
           4       0.73      0.81      0.77     17472
           5       0.78      0.52      0.62      2305

   micro avg       0.72      0.63      0.67     35812
   macro avg       0.69      0.44      0.51     35812
weighted avg       0.71      0.63      0.65     35812



In [None]:
# Confusion matrix
print(confusion_matrix(np.array(test_labels).reshape(len(test_labels),1),predictions_Logistic_stem))

[[20236   164  1255    29  1696   127]
 [ 1238   827   272    12  1125    40]
 [ 3104    85  6072    22  1921    93]
 [  690    33   102   147   236    16]
 [ 2080   144   956    16 14222    54]
 [  634    47   180     6   248  1190]]


#### Random Forest

In [None]:
## Best hyper-parameters
# 'n_estimators': 142, 'criterion': 'entropy', 'max_depth': 6, 'n': 1, 'min_df': 24, 'sub_tf': 'True'

# Training
Forest_pipe = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1), min_df = 24)),
                            ('tfidf', TfidfTransformer(sublinear_tf = 'True')),('classifier', RandomForestClassifier(
                                random_state = 3, n_estimators = 142, criterion = 'entropy',
                                max_depth = 6) )])
Forest_pipe.fit(training_stemmed, training_labels)

# Evaluation of the model
predictions_Forest_stem = Forest_pipe.predict(test_stemmed)
print(classification_report(np.array(test_labels_list).reshape(len(test_labels),1),predictions_Forest_stem))

              precision    recall  f1-score   support

           0       0.40      1.00      0.57     23507
           1       0.00      0.00      0.00      3514
           2       0.00      0.00      0.00     11297
           3       0.00      0.00      0.00      1224
           4       1.00      0.00      0.00     17472
           5       0.00      0.00      0.00      2305

    accuracy                           0.40     59319
   macro avg       0.23      0.17      0.09     59319
weighted avg       0.45      0.40      0.22     59319




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
# Classification report without 0 label
print(classification_report(np.array(test_labels).reshape(len(test_labels),1),predictions_Forest_stem, labels = [1,2,3,4,5]))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00      3514
           2       0.00      0.00      0.00     11297
           3       0.00      0.00      0.00      1224
           4       1.00      0.00      0.00     17472
           5       0.00      0.00      0.00      2305

   micro avg       1.00      0.00      0.00     35812
   macro avg       0.20      0.00      0.00     35812
weighted avg       0.49      0.00      0.00     35812




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
# Confusion matrix
print(confusion_matrix(np.array(test_labels).reshape(len(test_labels),1),predictions_Forest_stem))

[[23507     0     0     0     0     0]
 [ 3514     0     0     0     0     0]
 [11297     0     0     0     0     0]
 [ 1224     0     0     0     0     0]
 [17471     0     0     0     1     0]
 [ 2305     0     0     0     0     0]]


### Lemmatization


#### Naive Bayes

In [None]:
## Best hyper-parameters
# 'n': 2, 'sub_tf': 'True', 'min_df': 5

# Training
Bayes_pipe_lem = Pipeline([('vect', CountVectorizer(ngram_range = (1, 2), min_df = 5)),
                            ('tfidf', TfidfTransformer(sublinear_tf = 'True')),('classifier', MultinomialNB() )])
Bayes_pipe_lem.fit(training_lemmatized, training_labels)

# Evaluation of the model
predictions_Bayes_lem = Bayes_pipe_lem.predict(test_lemmatized)
print(classification_report(np.array(test_labels).reshape(len(test_labels),1),predictions_Bayes_lem))

              precision    recall  f1-score   support

           0       0.61      0.90      0.73     23507
           1       0.83      0.10      0.17      3514
           2       0.72      0.44      0.55     11297
           3       0.92      0.01      0.02      1224
           4       0.70      0.66      0.68     17472
           5       0.92      0.33      0.49      2305

    accuracy                           0.65     59319
   macro avg       0.78      0.41      0.44     59319
weighted avg       0.69      0.65      0.62     59319



In [None]:
# Classification report without 0 label
print(classification_report(np.array(test_labels).reshape(len(test_labels),1),predictions_Bayes_lem, labels = [1,2,3,4,5]))

              precision    recall  f1-score   support

           1       0.83      0.10      0.17      3514
           2       0.72      0.44      0.55     11297
           3       0.92      0.01      0.02      1224
           4       0.70      0.66      0.68     17472
           5       0.92      0.33      0.49      2305

   micro avg       0.71      0.49      0.58     35812
   macro avg       0.82      0.31      0.38     35812
weighted avg       0.74      0.49      0.55     35812



In [None]:
# Confusion matrix
print(confusion_matrix(np.array(test_labels).reshape(len(test_labels),1),predictions_Bayes_lem))

[[21055    21   817     0  1606     8]
 [ 1953   335   196     0  1025     5]
 [ 4470    19  5014     0  1759    35]
 [  786     3    71    12   348     4]
 [ 5238    22   614     1 11584    13]
 [  977     5   230     0   327   766]]


#### Logistic Regression

In [None]:
## Best hyper-parameters
# 'max_iter': 390, 'solver': 'newton-cg', 'multi_class': 'multinomial', 'n': 2, 'min_df': 5, 'sub_tf': 'True'

# Training
Logistic_pipe_lem = Pipeline([('vect', CountVectorizer(ngram_range = (1, 2), min_df = 5)),
                            ('tfidf', TfidfTransformer(sublinear_tf = 'True')),('classifier', LogisticRegression(random_state = 3,
                                    solver = 'newton-cg', multi_class = 'multinomial',   max_iter = 390 ) )])
Logistic_pipe_lem.fit(training_lemmatized, training_labels)

# Evaluation of the model
predictions_Logistic_lem = Logistic_pipe_lem.predict(test_lemmatized)
print(classification_report(np.array(test_labels).reshape(len(test_labels),1),predictions_Logistic_lem))

              precision    recall  f1-score   support

           0       0.72      0.86      0.79     23507
           1       0.63      0.23      0.34      3514
           2       0.69      0.54      0.60     11297
           3       0.68      0.12      0.21      1224
           4       0.73      0.81      0.77     17472
           5       0.79      0.51      0.62      2305

    accuracy                           0.72     59319
   macro avg       0.71      0.51      0.55     59319
weighted avg       0.71      0.72      0.70     59319



In [None]:
# Classification report without 0 label
print(classification_report(np.array(test_labels).reshape(len(test_labels),1),predictions_Logistic_lem, labels = [1,2,3,4,5]))

              precision    recall  f1-score   support

           1       0.63      0.23      0.34      3514
           2       0.69      0.54      0.60     11297
           3       0.68      0.12      0.21      1224
           4       0.73      0.81      0.77     17472
           5       0.79      0.51      0.62      2305

   micro avg       0.72      0.63      0.67     35812
   macro avg       0.70      0.44      0.51     35812
weighted avg       0.71      0.63      0.65     35812



In [None]:
# Confusion matrix
print(confusion_matrix(np.array(test_labels).reshape(len(test_labels),1),predictions_Logistic_lem))

[[20235   170  1252    27  1706   117]
 [ 1241   825   256     9  1149    34]
 [ 3116    82  6056    19  1932    92]
 [  682    32    99   150   245    16]
 [ 2111   153   953    10 14194    51]
 [  647    46   179     5   249  1179]]


#### Random Forest

In [None]:
## Best hyper-parameters
# 'n_estimators': 118, 'criterion': 'entropy', 'max_depth': 6, 'n': 1, 'min_df': 14, 'sub_tf': 'True'

# Training
Forest_pipe_lem = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1), min_df = 14)),
                            ('tfidf', TfidfTransformer(sublinear_tf = 'True')),('classifier', RandomForestClassifier(
                                random_state = 3, n_estimators = 118, criterion = 'entropy',
                                max_depth = 6) )])
Forest_pipe_lem.fit(training_lemmatized, training_labels)

# Evaluation of the model
predictions_Forest_lem = Forest_pipe_lem.predict(test_lemmatized)
print(classification_report(np.array(test_labels).reshape(len(test_labels),1),predictions_Forest_lem))

              precision    recall  f1-score   support

           0       0.40      1.00      0.57     23507
           1       0.00      0.00      0.00      3514
           2       0.00      0.00      0.00     11297
           3       0.00      0.00      0.00      1224
           4       1.00      0.04      0.08     17472
           5       0.00      0.00      0.00      2305

    accuracy                           0.41     59319
   macro avg       0.23      0.17      0.11     59319
weighted avg       0.45      0.41      0.25     59319




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
# Classification report without 0 label
print(classification_report(np.array(test_labels).reshape(len(test_labels),1),predictions_Forest_lem, labels = [1,2,3,4,5]))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00      3514
           2       0.00      0.00      0.00     11297
           3       0.00      0.00      0.00      1224
           4       1.00      0.04      0.08     17472
           5       0.00      0.00      0.00      2305

   micro avg       1.00      0.02      0.04     35812
   macro avg       0.20      0.01      0.02     35812
weighted avg       0.49      0.02      0.04     35812




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
# Confusion matrix
print(confusion_matrix(np.array(test_labels).reshape(len(test_labels),1),predictions_Forest_lem))

[[23507     0     0     0     0     0]
 [ 3514     0     0     0     0     0]
 [11297     0     0     0     0     0]
 [ 1224     0     0     0     0     0]
 [16743     0     0     0   729     0]
 [ 2305     0     0     0     0     0]]
