# Imports


In [1]:
# For Data
import numpy as np
import pandas as pd
import re
from datetime import datetime
from tqdm.notebook import tqdm

#  For Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.offline as pyo 
import plotly.graph_objects as go
import plotly.figure_factory as ff
import missingno as msno
from wordcloud import WordCloud
import random

# For models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier


# For NLP
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from spellchecker import SpellChecker

import pickle
from imblearn.over_sampling import SMOTE
from collections import Counter

# For Styling
plt.style.use('fivethirtyeight')

# Downloading periphrals
nltk.download('vader_lexicon')
nltk.download('stopwords')

from preprocessing import *
from plot import *
from feature_extractor import *
from data_balance import *
from model import *

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mosel\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mosel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mosel\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mosel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Import the dataset

In [2]:
train_file = 'Dataset/train.csv'
devFile = 'Dataset/dev.csv'
train_df = pd.read_csv(train_file)
dev_df = pd.read_csv(devFile)
print(f"Training dataset size = {train_df.shape}")
print(f"Dev dataset size = {dev_df.shape}")

Training dataset size = (6988, 3)
Dev dataset size = (1000, 3)


# Data Preprocessing

In [3]:
# Data cleaning
training_data = cleanData(train_df, 'training', clean = False, clearData = True)
print(f"Cleaned Training dataset size = {training_data.shape}")
# Data cleaning
dev_data = cleanData(dev_df, 'dev', clean = False, clearData = True)
print(f"Cleaned Dev dataset size = {dev_data.shape}")

Cleaned Training dataset size = (6988, 3)
Cleaned Dev dataset size = (1000, 3)


In [4]:
# Some preprocessing steps, like extracting limmitization
training_data = training_data.pipe(processing)    
print(f"Processed Training dataset size = {training_data.shape}")
print(training_data.columns)
# Some preprocessing steps, like extracting limmitization
dev_data = processing(dev_data)   
print(f"Processed dev dataset size = {dev_data.shape}")
print(dev_data.columns)

Processed Training dataset size = (6988, 5)
Index(['text', 'category', 'stance', 'Lemmatization', 'sentiment'], dtype='object')
Processed dev dataset size = (1000, 5)
Index(['text', 'category', 'stance', 'Lemmatization', 'sentiment'], dtype='object')


In [5]:
# Save the out
training_data.to_csv('out/training_data_processed.csv', index=False) # print the df in a csv file
# Save the out
dev_data.to_csv('out/dev_data_processed.csv', index=False) # print the df in a csv file

# Feature Engineering

In [6]:
## Word2Vec embeddings
model = extractWordEmbeddings(training_data['Lemmatization']) # use word2vec to extract the word embeddings
training_data['features'] = getTweetsEmbeddings(model, training_data['Lemmatization']) # get the word embeddings for each tweet
trainingFeatures = training_data['features'].to_numpy()

## Word2Vec embeddings
model = extractWordEmbeddings(dev_data['Lemmatization']) # use word2vec to extract the word embeddings
dev_data['features'] = getTweetsEmbeddings(model, dev_data['Lemmatization']) # get the word embeddings for each tweet
devFeatures = dev_data['features'].to_numpy()


In [7]:
# XLM embeddings
# features = np.load('out/train_embeddings.npy')
# XLM embeddings
# dev_data['features'] = np.load('out/test_embeddings.npy').tolist()

# Balancing the training dataset

In [8]:
# convert features into 2d array
trainingFeatures = np.array([np.array(xi) for xi in trainingFeatures])
categories = training_data['category'].to_numpy()
columns = ["f" + str(i + 1) for i in range(len(trainingFeatures[0]))]

df = pd.DataFrame(trainingFeatures, columns=columns)
df = pd.DataFrame(trainingFeatures)
df['category'] = categories

X_train, y_train = balance_data(df)
print("Some notes about dimensions of the data")
print(f"X_train size before cleaning = {trainingFeatures.shape}")
print(f"X_train size = {X_train.shape}")
print(f"y_train size = {y_train.shape}")

Before balancing:
Class=1, n=975 (13.952%)
Class=2, n=3616 (51.746%)
Class=4, n=1025 (14.668%)
Class=9, n=323 (4.622%)
Class=5, n=606 (8.672%)
Class=6, n=112 (1.603%)
Class=3, n=167 (2.390%)
Class=8, n=79 (1.131%)
Class=0, n=67 (0.959%)
Class=7, n=18 (0.258%)
After balancing:
Class=1, n=3616 (10.000%)
Class=2, n=3616 (10.000%)
Class=4, n=3616 (10.000%)
Class=9, n=3616 (10.000%)
Class=5, n=3616 (10.000%)
Class=6, n=3616 (10.000%)
Class=3, n=3616 (10.000%)
Class=8, n=3616 (10.000%)
Class=0, n=3616 (10.000%)
Class=7, n=3616 (10.000%)
Some notes about dimensions of the data
X_train size before cleaning = (6988, 100)
X_train size = (36160, 100)
y_train size = (36160,)


In [10]:
split = 'y'
if split == 'y':
    # Split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(trainingFeatures, categories, test_size=0.2, random_state=42)
else:
    X_test = np.array([np.array(xi) for xi in dev_data['features'].to_numpy()])
    # X_test = np.load('out/test_embeddings.npy')
    y_test = dev_data['category'].to_numpy()

print(f"X_test size = {X_test.shape}")
print(f"y_test size = {y_test.shape}")
# print unique values in the dataset
print(f"Unique values in the dataset = {np.unique(y_test)}")
print(f"Unique values in the dataset = {np.unique(y_train)}")

X_test size = (1398, 100)
y_test size = (1398,)
Unique values in the dataset = ['advice' 'celebrity' 'info_news' 'others' 'personal' 'plan' 'requests'
 'restrictions' 'rumors' 'unrelated']
Unique values in the dataset = ['advice' 'celebrity' 'info_news' 'others' 'personal' 'plan' 'requests'
 'restrictions' 'rumors' 'unrelated']


# Model Building
**Build a multi-class classifier to predict the category of the tweet**

In [12]:
# RandomForestClassifier
clf = RandomForestClassifier(n_estimators=500, max_depth=15, random_state=0)

model, report = modelPipeline(X_train, y_train, X_test, y_test, clf, 'out/models/clf.model')
report

              precision    recall  f1-score   support

      advice       0.00      0.00      0.00        12
   celebrity       0.76      0.44      0.56       194
   info_news       0.58      0.93      0.71       718
      others       0.00      0.00      0.00        43
    personal       0.41      0.21      0.27       208
        plan       0.33      0.02      0.03       113
    requests       0.33      0.05      0.08        21
restrictions       0.00      0.00      0.00         2
      rumors       0.00      0.00      0.00        18
   unrelated       0.43      0.09      0.14        69

    accuracy                           0.58      1398
   macro avg       0.28      0.17      0.18      1398
weighted avg       0.52      0.58      0.50      1398



0.575107296137339

In [15]:
# Xgboost
xgb = XGBClassifier()
model, report = modelPipeline(X_train, y_train, X_test, y_test, xgb, 'out/models/xgb.model')
report

['advice' 'celebrity' 'info_news' 'others' 'personal' 'plan' 'requests'
 'restrictions' 'rumors' 'unrelated']
['advice' 'celebrity' 'info_news' 'others' 'personal' 'plan' 'requests'
 'restrictions' 'rumors' 'unrelated']


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4 5 6 7 8 9], got ['advice' 'celebrity' 'info_news' 'others' 'personal' 'plan' 'requests'
 'restrictions' 'rumors' 'unrelated']

In [13]:
# Naive Bayes
gnb = GaussianNB(var_smoothing=10)
model, report = modelPipeline(X_train, y_train, X_test, y_test, gnb, 'out/models/gnb.model')
report

              precision    recall  f1-score   support

      advice       0.00      0.00      0.00        12
   celebrity       0.53      0.14      0.23       194
   info_news       0.53      0.99      0.69       718
      others       0.00      0.00      0.00        43
    personal       0.00      0.00      0.00       208
        plan       0.00      0.00      0.00       113
    requests       0.00      0.00      0.00        21
restrictions       0.00      0.00      0.00         2
      rumors       0.00      0.00      0.00        18
   unrelated       0.00      0.00      0.00        69

    accuracy                           0.53      1398
   macro avg       0.11      0.11      0.09      1398
weighted avg       0.34      0.53      0.38      1398



0.5271816881258942

In [None]:
# SVM
svm = svm.SVC()
model, report = modelPipeline(X_train, y_train, X_test, y_test, svm, 'out/models/svm.model')
report

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        93
           0       0.00      0.00      0.00       208
           1       0.78      1.00      0.88      1097

    accuracy                           0.78      1398
   macro avg       0.26      0.33      0.29      1398
weighted avg       0.62      0.78      0.69      1398



0.7846924177396281

# Auto ML
Check out the [Auto SKlearn](https://automl.github.io/auto-sklearn/master/index.html)

In [None]:
import autosklearn.classification
import sklearn.datasets
import sklearn.metrics
from sklearn.utils.multiclass import type_of_target

In [None]:
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=1900, # overall time in seconds
    per_run_time_limit=1300, # time per model in seconds
    initial_configurations_via_metalearning=0,
    ensemble_size=10,
    n_jobs=8,
    smac_scenario_args={"runcount_limit": 1},
)
automl.fit(X_train, y_train)
y_pred = automl.predict(X_test)



In [None]:
print(automl.sprint_statistics())
print(automl.leaderboard())

auto-sklearn results:
  Dataset name: 093be1d2-848f-11ed-983d-1da6501c908a
  Metric: accuracy
  Best validation score: 0.795664
  Number of target algorithm runs: 1
  Number of successful target algorithm runs: 1
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 0
  Number of target algorithms that exceeded the memory limit: 0

          rank  ensemble_weight           type      cost   duration
model_id                                                           
2            1              1.0  random_forest  0.204336  20.671741


In [None]:
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_pred)) #  get the Score of the final ensemble

Accuracy score 0.7761087267525035
