# Imports


In [1]:
# For Data
import numpy as np
import pandas as pd
import re
from datetime import datetime
from tqdm.notebook import tqdm

#  For Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.offline as pyo 
import plotly.graph_objects as go
import plotly.figure_factory as ff
import missingno as msno
from wordcloud import WordCloud
import random

# For models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier


# For NLP
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from spellchecker import SpellChecker

import pickle
from imblearn.over_sampling import SMOTE
from collections import Counter

# For Styling: apply matplotlib's built-in 'fivethirtyeight' style to every figure
plt.style.use('fivethirtyeight')

# Downloading peripherals: NLTK resources (VADER lexicon and stop-word lists)
nltk.download('vader_lexicon')
nltk.download('stopwords')

# Project-local helper modules, star-imported into the notebook namespace.
# NOTE(review): names used further down that are not defined in this notebook
# (cleanData, get_feature_models, get_features, balance_data, modelPipeline)
# presumably come from these modules - confirm, and prefer explicit imports so
# the origin of each name is visible.
from preprocessing import *
from plot import *
from feature_extractor import *
from data_balance import *
from model import *

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation / convergence warnings from the
# modelling libraries - consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/makrion/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/makrion/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/makrion/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/makrion/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Path prefix for the per-model files handed to modelPipeline in the model cells.
# The trailing slash matters: file names are appended by plain string
# concatenation (e.g. MODELS_PATH + 'clf.model').
MODELS_PATH = 'out/models/category/'

# Import the dataset

In [3]:
# Locations of the two CSV splits, relative to the notebook's working directory.
train_file, devFile = 'Dataset/train.csv', 'Dataset/dev.csv'

# Read the training split first, then the dev split.
train_df, dev_df = (pd.read_csv(csv_path) for csv_path in (train_file, devFile))

# Report (rows, columns) for each split.
print(f"Training dataset size = {train_df.shape}")
print(f"Dev dataset size = {dev_df.shape}")

Training dataset size = (6988, 3)
Dev dataset size = (1000, 3)


# Data Preprocessing

In [4]:
# Data cleaning
training_data = cleanData(train_df, 'training', clean = False, clearData = False)
print(f"Cleaned Training dataset size = {training_data.shape}")
# Data cleaning
dev_data = cleanData(dev_df, 'dev', clean = False, clearData = False)
print(f"Cleaned Dev dataset size = {dev_data.shape}")

Cleaned Training dataset size = (6557, 3)
Cleaned Dev dataset size = (1000, 3)


In [5]:
def edit_categories(x):
    """Translate a category label into its integer class id.

    The id of a label is its position in the fixed ordering below
    ('info_news' -> 0 ... 'others' -> 9).

    Args:
        x: the raw category value, normally a string.

    Returns:
        int: the class id in 0..9, or -1 when ``x`` equals none of the
        known labels (comparison is exact and case-sensitive).
    """
    ordered_labels = (
        'info_news',
        'celebrity',
        'plan',
        'requests',
        'rumors',
        'advice',
        'restrictions',
        'personal',
        'unrelated',
        'others',
    )
    # Compare against each label in turn; the first match wins.
    for class_id, label in enumerate(ordered_labels):
        if x == label:
            return class_id
    return -1

# Encode the string labels as integer class ids on both splits.
# The encoding overwrites the 'category' column in place, so running this cell a
# second time would pass integers to edit_categories and silently turn every
# label into -1. Columns that are already numeric are therefore left alone,
# which makes the cell safe to re-run.
for labelled_split in (training_data, dev_data):
    if not pd.api.types.is_numeric_dtype(labelled_split['category']):
        labelled_split['category'] = labelled_split['category'].apply(edit_categories)

# Class distribution of the training split (shows the heavy imbalance).
training_data['category'].value_counts()

0    3422
7     972
1     897
2     577
8     293
9     150
3      88
4      75
5      67
6      16
Name: category, dtype: int64

In [6]:
def processing_categories(data):
    """Add the token and label helper columns used by the feature cells.

    Two columns are written onto ``data`` itself (the frame is modified in
    place and also returned):

    * ``'Lemmatization'`` - for each tweet, the list of whitespace-separated
      tokens after ``ISRIStemmer.stem``. Despite the column name this is
      stemming, not lemmatization; the name is kept because later cells read
      this column.
    * ``'sentiment'`` - a plain copy of the ``'category'`` column. No sentiment
      score is computed here.

    Args:
        data: DataFrame with a ``'text'`` column of strings and a
            ``'category'`` column.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    # Imported here so the function does not depend on a star-import happening
    # to expose ISRIStemmer in the notebook namespace.
    from nltk.stem.isri import ISRIStemmer

    stemmer = ISRIStemmer()
    data['Lemmatization'] = data['text'].apply(
        lambda tweet: [stemmer.stem(token) for token in tweet.split()]
    )
    data['sentiment'] = data['category']
    return data

In [7]:
# Some preprocessing steps, like extracting limmitization
training_data = training_data.pipe(processing_categories)    
print(f"Processed Training dataset size = {training_data.shape}")
print(training_data.columns)
# Some preprocessing steps, like extracting limmitization
dev_data = processing_categories(dev_data)   
print(f"Processed dev dataset size = {dev_data.shape}")
print(dev_data.columns)

Processed Training dataset size = (6557, 5)
Index(['text', 'category', 'stance', 'Lemmatization', 'sentiment'], dtype='object')
Processed dev dataset size = (1000, 5)
Index(['text', 'category', 'stance', 'Lemmatization', 'sentiment'], dtype='object')


In [8]:
# Save the out
training_data.to_csv('out/training_data_processed.csv', index=False) # print the df in a csv file
# Save the out
dev_data.to_csv('out/dev_data_processed.csv', index=False) # print the df in a csv file

# Feature Engineering

In [9]:
# Fit the three feature models (word2vec, bag-of-words, TF-IDF) on the training
# tokens only; the dev tokens are not part of the fit.
# NOTE(review): the original comment said "w2v of size 5000"; the recorded
# combined feature length is 13750 - confirm the individual sizes in the helper.
w2v_model, bow_model, tfidf_model = get_feature_models(
    training_data['Lemmatization'].tolist()
)

# Training features: one vector per tweet, stored on the frame and also kept as
# an array of per-row vectors.
# The trailing 1, 1, 1 are handed to get_features unchanged (presumably on/off
# switches for the three feature groups - TODO confirm).
training_data['features'] = get_features(
    w2v_model, bow_model, tfidf_model, training_data['Lemmatization'], 1, 1, 1
)
trainingFeatures = training_data['features'].to_numpy()

# Dev features: built with the models fitted on the training split above.
dev_data['features'] = get_features(
    w2v_model, bow_model, tfidf_model, dev_data['Lemmatization'], 1, 1, 1
)
devFeatures = dev_data['features'].to_numpy()

In [10]:
# Sanity check: number of rows and per-tweet feature-vector length per split.
# .iloc[0] takes the first row by position; a plain [0] is a label lookup and
# raises KeyError when the row labelled 0 is missing from the index (cleaning
# drops rows from the training split).
print(len(dev_data['features']))
print(len(dev_data['features'].iloc[0]))
print(len(training_data['features']))
print(len(training_data['features'].iloc[0]))

1000
13750
6557
13750


In [11]:
# XLM embeddings
# features = np.load('out/train_embeddings.npy')
# XLM embeddings
# dev_data['features'] = np.load('out/test_embeddings.npy').tolist()

# Balancing the training dataset

In [12]:
# convert features into 2d array
trainingFeatures = np.array([np.array(xi) for xi in trainingFeatures])
categories = training_data['category'].to_numpy()
columns = ["f" + str(i + 1) for i in range(len(trainingFeatures[0]))]

df = pd.DataFrame(trainingFeatures, columns=columns)
df = pd.DataFrame(trainingFeatures)
df['category'] = categories

X_train_balanced, y_train_balanced = balance_data(df)
print("Some notes about dimensions of the data")
print(f"X_train size before cleaning = {trainingFeatures.shape}")
print(f"X_train size = {X_train_balanced.shape}")
print(f"y_train size = {y_train_balanced.shape}")

Before balancing:
Class=1, n=897 (13.680%)
Class=0, n=3422 (52.189%)
Class=7, n=972 (14.824%)
Class=8, n=293 (4.469%)
Class=2, n=577 (8.800%)
Class=3, n=88 (1.342%)
Class=9, n=150 (2.288%)
Class=4, n=75 (1.144%)
Class=5, n=67 (1.022%)
Class=6, n=16 (0.244%)
After balancing:
Class=1, n=3422 (10.000%)
Class=0, n=3422 (10.000%)
Class=7, n=3422 (10.000%)
Class=8, n=3422 (10.000%)
Class=2, n=3422 (10.000%)
Class=3, n=3422 (10.000%)
Class=9, n=3422 (10.000%)
Class=4, n=3422 (10.000%)
Class=5, n=3422 (10.000%)
Class=6, n=3422 (10.000%)
Some notes about dimensions of the data
X_train size before cleaning = (6557, 13750)
X_train size = (34220, 13750)
y_train size = (34220,)


In [13]:
# 'y' -> hold out 20% of the balanced training data as the test set
# anything else -> train on all balanced data and evaluate on the dev split
split = 'n'
if split == 'y':
    # Split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(X_train_balanced, y_train_balanced, test_size=0.2, random_state=42)
else:
    X_train, y_train = X_train_balanced, y_train_balanced
    X_test = np.array([np.array(xi) for xi in dev_data['features'].to_numpy()])
    # X_test = np.load('out/test_embeddings.npy')
    y_test = dev_data['category'].to_numpy()
    # The former `y_train += 1` was removed: it shifted only the training labels
    # (1..10 vs 0..9 for y_test), it modified y_train_balanced through the alias
    # above, and it shifted the labels again on every re-run of this cell.

print(f"X_test size = {X_test.shape}")
print(f"y_test size = {y_test.shape}")
# print unique values in the dataset (first line: y_test, second line: y_train)
print(f"Unique values in the dataset = {np.unique(y_test)}")
print(f"Unique values in the dataset = {np.unique(y_train)}")

X_test size = (1000, 13750)
y_test size = (1000,)
Unique values in the dataset = [0 1 2 3 4 5 6 7 8 9]
Unique values in the dataset = [0 1 2 3 4 5 6 7 8 9]


# Model Building
**Build a multi-class classifier to predict the category of the tweet**

In [14]:
# Re-check row counts and feature-vector lengths right before model training.
# .iloc[0] takes the first row by position; a plain [0] is a label lookup and
# raises KeyError when the row labelled 0 is missing from the index (cleaning
# drops rows from the training split).
print(len(dev_data['features']))
print(len(dev_data['features'].iloc[0]))
print(len(training_data['features']))
print(len(training_data['features'].iloc[0]))

1000
13750
6557
13750


In [15]:
# RandomForestClassifier
clf = RandomForestClassifier(n_estimators=500, max_depth=15, random_state=0, n_jobs=10)
_y_train = y_train + 1
_y_test = y_test + 1
model, report = modelPipeline(X_train, y_train, X_test, y_test, clf, MODELS_PATH+'clf.model')
report

              precision    recall  f1-score   support

           0       0.77      0.35      0.48       545
           1       0.70      0.67      0.69       145
           2       0.18      0.44      0.26        82
           3       0.06      0.25      0.10        20
           4       0.07      0.20      0.10        15
           5       0.06      0.20      0.09        10
           6       0.12      0.50      0.20         2
           7       0.48      0.35      0.41       128
           8       0.19      0.56      0.28        36
           9       0.02      0.06      0.03        17

    accuracy                           0.40      1000
   macro avg       0.26      0.36      0.26      1000
weighted avg       0.61      0.40      0.45      1000



0.399

In [16]:
# Xgboost
xgb = XGBClassifier()
_y_train = y_train + 1
_y_test = y_test + 1
model, report = modelPipeline(X_train, _y_train, X_test, _y_test, xgb, MODELS_PATH+ 'xgb.model')
report

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4 5 6 7 8 9], got [ 1  2  3  4  5  6  7  8  9 10]

In [None]:
# Naive Bayes
gnb = GaussianNB(var_smoothing=10)
model, report = modelPipeline(X_train, y_train, X_test, y_test, gnb,MODELS_PATH+ 'gnb.model')
report

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        93
           0       0.00      0.00      0.00       208
           1       0.78      1.00      0.88      1097

    accuracy                           0.78      1398
   macro avg       0.26      0.33      0.29      1398
weighted avg       0.62      0.78      0.69      1398



0.7846924177396281

In [None]:
# SVM
svm = svm.SVC()
model, report = modelPipeline(X_train, y_train, X_test, y_test, svm, MODELS_PATH+'svm.model')
report

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        93
           0       0.00      0.00      0.00       208
           1       0.78      1.00      0.88      1097

    accuracy                           0.78      1398
   macro avg       0.26      0.33      0.29      1398
weighted avg       0.62      0.78      0.69      1398



0.7846924177396281

# Auto ML
Check out the [Auto SKlearn](https://automl.github.io/auto-sklearn/master/index.html)

In [None]:
import autosklearn.classification
import sklearn.datasets
import sklearn.metrics
from sklearn.utils.multiclass import type_of_target

In [None]:
# Search settings for auto-sklearn, collected in one place.
automl_settings = {
    "time_left_for_this_task": 1900,  # overall time in seconds
    "per_run_time_limit": 1300,  # time per model in seconds
    "initial_configurations_via_metalearning": 0,
    "ensemble_size": 10,
    "n_jobs": 8,
    "smac_scenario_args": {"runcount_limit": 1},
}

# Build the classifier, fit it on the training data and predict the test set.
automl = autosklearn.classification.AutoSklearnClassifier(**automl_settings)
automl.fit(X_train, y_train)
y_pred = automl.predict(X_test)



In [None]:
# Print the run statistics first, then the model leaderboard.
for automl_summary in (automl.sprint_statistics, automl.leaderboard):
    print(automl_summary())

auto-sklearn results:
  Dataset name: 093be1d2-848f-11ed-983d-1da6501c908a
  Metric: accuracy
  Best validation score: 0.795664
  Number of target algorithm runs: 1
  Number of successful target algorithm runs: 1
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 0
  Number of target algorithms that exceeded the memory limit: 0

          rank  ensemble_weight           type      cost   duration
model_id                                                           
2            1              1.0  random_forest  0.204336  20.671741


In [None]:
# Accuracy of the final auto-sklearn ensemble on the test labels
automl_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
print("Accuracy score", automl_accuracy)

Accuracy score 0.7761087267525035
