In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
     

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from time import perf_counter

In [None]:
from google.colab import files
# Opens Colab's file picker. The result is kept under its own name so that
# `data` only ever refers to the DataFrame loaded in the next cell.
# NOTE(review): presumably the upload leaves 'final_data.csv' in the working
# directory for the read_csv call that follows - confirm the file name.
uploaded = files.upload()

In [None]:
# Load the dataset from the current working directory into a DataFrame.
# Later cells read the 'label', 'label_num' and 'text' columns from it.
data = pd.read_csv('final_data.csv')

In [None]:
# Preview the first rows to sanity-check the load.
data.head()

In [None]:
# Number of rows per class; shows how imbalanced the labels are.
data['label'].value_counts()

In [None]:
# Bar chart of the class frequencies. Colours are applied in bar order
# (value_counts sorts by descending count), so the most frequent label is green.
data['label'].value_counts().plot(kind='bar', color=['g', 'r'])

In [None]:
import re
import nltk
     

In [None]:
from nltk.corpus import stopwords

In [None]:
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance; lemmatize_data() below uses it for every token.
lemmatizer = WordNetLemmatizer()

In [None]:
def clean_data(feature):
    """Strip URLs and punctuation from a single text value.

    Parameters
    ----------
    feature : str
        Raw text of one row. Non-string values (for example the NaN pandas
        uses for an empty CSV cell) are treated as empty text.

    Returns
    -------
    str
        The text with every ``http://`` / ``https://`` URL removed and every
        character that is not an ASCII letter, a digit or whitespace dropped.
        Letter case and whitespace are left untouched.
    """
    if not isinstance(feature, str):
        return ''
    # Remove the whole URL, not just the scheme; otherwise host and path
    # survive and get glued into a junk token by the next substitution.
    feature = re.sub(r'https?://\S+', '', feature)
    # Raw string: '\s' is not a valid escape in a normal string literal.
    feature = re.sub(r'[^0-9a-zA-Z\s]', '', feature)
    return feature

# Clean every document; the 'text' column is overwritten with the cleaned version.
data['text'] = data['text'].apply(clean_data)

In [None]:
# Fetch the NLTK resources used below: the stop-word lists for
# stopwords.words('english') and the WordNet data for the lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# Built once when this cell runs (the stop-word corpus must already be
# downloaded). Re-reading the list for every token is needlessly slow.
_STOP_WORDS = frozenset(stopwords.words('english'))


def lemmatize_data(rev):
    """Lower-case a document, drop English stop words and lemmatize the rest.

    Parameters
    ----------
    rev : str
        One document.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    tokens = rev.lower().split()
    # Stop words are filtered on the raw lower-cased token, before lemmatization.
    # NOTE(review): clean_data() strips apostrophes first, so contractions such
    # as "dont" will not match the stop-word entry "don't" - confirm if intended.
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in _STOP_WORDS
    )

# Normalise every document; the 'text' column is overwritten again.
data['text'] = data['text'].apply(lemmatize_data)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned documents into TF-IDF features with default settings.
# NOTE(review): the vocabulary and IDF weights are fitted on every row, including
# rows that the later train/test split assigns to the test set.
vectorizer = TfidfVectorizer()
X_sparse = vectorizer.fit_transform(data['text'])
# Dense copy of the feature matrix; this can be memory-hungry for a large vocabulary.
X = X_sparse.toarray()

In [None]:
# Target vector taken from the 'label_num' column.
# NOTE(review): presumably the numeric encoding of 'label' - confirm against the CSV.
y = np.array(data['label_num'])

In [None]:
# Hold out 20% of the rows for testing. The split is seeded for reproducibility
# and stratified so that train and test keep the same class ratio as the full
# (imbalanced) dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

In [None]:
import plotly.express as px
from sklearn.decomposition import PCA

In [None]:
# Project the training features onto two principal components for a quick look
# at class separation. The seed keeps the projection identical between runs
# when scikit-learn selects its randomized SVD solver.
pca2D = PCA(n_components=2, random_state=0)
components = pca2D.fit_transform(X_train)

fig = px.scatter(
    components, x=0, y=1, color=y_train,
    title='Training set: first two principal components',
)
fig.show()

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
from collections import Counter

In [None]:
# Candidate classifiers keyed by display name. Each entry holds the estimator
# ("model") and its training time in seconds ("perf", filled in by the training
# loop). Estimators with internal randomness are seeded so results can be
# reproduced, matching the seeded train/test split.
models = {
    "Random Forest": {"model": RandomForestClassifier(random_state=0), "perf": 0},
    "MultinomialNB": {"model": MultinomialNB(), "perf": 0},
    "Logistic Regr.": {"model": LogisticRegression(), "perf": 0},
    "KNN": {"model": KNeighborsClassifier(), "perf": 0},
    "SVM (Linear)": {"model": LinearSVC(random_state=0), "perf": 0},
    "SVM (RBF)": {"model": SVC(), "perf": 0},
}

In [None]:
# NOTE(review): this list is never read in the visible notebook; the metrics
# cell builds a separate `model_stats` (singular) list instead.
models_stats = []

In [None]:
# Oversample the minority class of the training split only; the test split is
# left untouched. SMOTE draws synthetic samples at random, so it is seeded to
# keep the resampled set, and every score computed from it, reproducible.
oversampler_smote = SMOTE(random_state=0)
X_smote, y_smote = oversampler_smote.fit_resample(X_train, y_train)

In [None]:
# Class counts after oversampling, displayed to check the resulting balance.
counter_smote = Counter(y_smote)
counter_smote

In [None]:
# Same 2-D PCA view as for the raw training split, now on the oversampled data.
# The seed keeps the projection identical between runs when scikit-learn
# selects its randomized SVD solver.
pca2D = PCA(n_components=2, random_state=0)
components = pca2D.fit_transform(X_smote)

fig = px.scatter(
    components, x=0, y=1, color=y_smote,
    title='SMOTE-oversampled training set: first two principal components',
)
fig.show()

In [None]:
# Fit every candidate on the oversampled training data and record how long each
# fit takes (wall-clock seconds, rounded to two decimals) under its "perf" key.
for label, entry in models.items():
    t0 = perf_counter()
    entry['model'].fit(X_smote, y_smote)
    elapsed = round(perf_counter() - t0, 2)
    entry["perf"] = elapsed
    print(f"{label:20} trained in {elapsed} sec")

In [None]:
# One row per model: [name, accuracy on the held-out test split, training time].
models_acc = [
    [label, entry["model"].score(X_test, y_test), entry["perf"]]
    for label, entry in models.items()
]

In [None]:
# Tabulate the accuracy rows, best model first, with a fresh 0..n-1 index.
df_acc = (
    pd.DataFrame(
        models_acc,
        columns=['Model', 'Accuracy without scaling (test set)', 'Training time (sec)'],
    )
    .sort_values(by='Accuracy without scaling (test set)', ascending=False)
    .reset_index(drop=True)
)
df_acc

In [None]:
# Test-set accuracy per model. The y-axis is zoomed to 0.8-1.0, so a model
# scoring below 0.8 would show no visible bar.
fig_acc, ax_acc = plt.subplots(figsize=(15, 5))
sns.barplot(data=df_acc, x='Model', y='Accuracy without scaling (test set)', ax=ax_acc)
ax_acc.set_title('Accuracy on the test set\n(Training set oversampled with SMOTE)', fontsize=15)
ax_acc.set_ylim(0.8, 1)
plt.show()
     

In [None]:
# Training time per model. The y-axis is capped at 100 seconds, so any bar for
# a slower fit is cut off at the top of the plot.
fig_time, ax_time = plt.subplots(figsize=(15, 5))
sns.barplot(data=df_acc, x='Model', y='Training time (sec)', ax=ax_time)
ax_time.set_title('Training time for each model in sec with oversampled data', fontsize=15)
ax_time.set_ylim(0, 100)
plt.show()

In [None]:
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import fbeta_score, f1_score
from sklearn.metrics import roc_auc_score

In [None]:
# Collect one row of test-set metrics per trained model, in the column order
# expected by the DataFrame built in the next cell:
# [name, accuracy, training time, precision, recall, F1, F2, ROC-AUC].
model_stats = []
for name, model in models.items():
    clf = model["model"]
    y_pred = clf.predict(X_test)

    # ROC-AUC ranks samples by a continuous score. Feeding it hard 0/1
    # predictions collapses the curve to a single operating point, so use the
    # decision function when the estimator has one and the positive-class
    # probability otherwise.
    if hasattr(clf, "decision_function"):
        y_score = clf.decision_function(X_test)
    else:
        y_score = clf.predict_proba(X_test)[:, 1]

    model_stats.append([
        name,
        # Same value as clf.score(X_test, y_test) without predicting a second time.
        float(np.mean(y_pred == y_test)),
        model["perf"],
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        fbeta_score(y_test, y_pred, beta=2),
        roc_auc_score(y_test, y_score),
    ])
    print(f'{name} appended to stats')

In [None]:
# Tabulate the full metric rows, best accuracy first, with a fresh 0..n-1 index.
df_stats = (
    pd.DataFrame(
        model_stats,
        columns=[
            'Model', 'Accuracy without scaling (test set)', 'Training time (sec)',
            'Precision', 'Recall', 'F1 score', 'F2 score', 'ROC-AUC score',
        ],
    )
    .sort_values(by='Accuracy without scaling (test set)', ascending=False)
    .reset_index(drop=True)
)
df_stats

In [None]:
# Persist the metrics table to the working directory. With pandas' default
# settings the row index is written too, as an unnamed first column.
df_stats.to_csv('Oversampled_stats.csv')