# Data Exploration

## Import Required Packages

In [None]:
import pandas as pd
import json
import os
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import functions
import re
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob
from textblob.translate import NotTranslated
import random
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
import pickle


### Load Dataset

In [None]:
# Load the JSON file into a DataFrame.
json_file = "kone_classification.json"
# JSON is UTF-8 by specification; naming the encoding explicitly keeps
# non-ASCII text from being decoded with the platform's default encoding.
with open(json_file, encoding="utf-8") as f:
    data = json.load(f)
# The frame is built after the file is closed; the handle is no longer needed.
df_json = pd.DataFrame(data)

In [None]:
# Preview the first rows of the loaded frame.
df_json.head()

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() also emitted a stray "None" line.
df_json.info()
print("The number of labels in the dataset is: ",df_json['label'].nunique())
# Count the non-null values of every column for each culture.
print(df_json.groupby('culture').count())

In [None]:
# Per-label non-null counts, ordered from the most to the least frequent label
# (sorted on the count of the 'text' column).
df_json.groupby('label').count().sort_values(by=['text'], ascending=False)

In [None]:
# Map each label to its number of rows, then tally the labels that have
# fewer than 10 rows.
classes_count = df_json.label.value_counts().to_dict()
count = sum(1 for n_rows in classes_count.values() if n_rows < 10)
print(f'This dataset has {count} minority classes with less than 10 sampels')

#### We observe that the dataset is imbalanced and has a large proportion of minority classes with only one or two samples

In [None]:
# Keep only the rows whose source is 'TRAINING'; rows from any other source
# are left out.
is_training = df_json['source'] == 'TRAINING'
df_json_training = df_json[is_training]
df_json_training

In [None]:
# Restrict the training rows to the French culture ('fr-fr').
is_french = df_json_training['culture'] == 'fr-fr'
df_json_training_fr = df_json_training[is_french]

In [None]:
# Per-label non-null counts for the French training rows, most frequent first.
df_json_training_fr.groupby('label').count().sort_values(by=['text'], ascending=False)

### Visualize Distribution of Classes

In [None]:
# Horizontal bar chart of the number of texts per label, sorted by count.
fig = plt.figure(figsize=(15, 10))

# Ten grey entries followed by three dark-blue ones.
colors = ['grey'] * 10 + ['darkblue'] * 3

texts_per_label = df_json_training_fr.groupby('label').text.count().sort_values()
texts_per_label.plot.barh(
    ylim=0,
    color=colors,
    title='NUMBER OF Samples IN EACH label',
)
plt.xlabel('Number of ocurrences', fontsize=10)

In [None]:
# Number of distinct labels in the French training rows.
df_json_training_fr.label.nunique()

In [None]:
# Map each French-training label to its number of rows, then tally the labels
# that have fewer than 10 rows.
classes_count = df_json_training_fr.label.value_counts().to_dict()
count = sum(1 for n_rows in classes_count.values() if n_rows < 10)
print(f'This dataset has {count} minority classes with less than 10 sampels')

## Text Data Preprocessing


In [None]:
# Clean the French training rows with the project helper functions.clean_data
# (defined in the local `functions` module; its exact steps are not shown here).
df_cleaned = functions.clean_data(df_json_training_fr)

In [None]:
# Work on a copy so that later steps do not modify df_cleaned.
df = df_cleaned.copy()

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Text Augmentation

In [None]:
## Language codes intended for augmenting data through language translation.
## No free translation service was found; use a paid service such as Azure or
## Google Translate.
# sr = random.SystemRandom()

# NOTE(review): `language` is not referenced by the other visible cells —
# presumably the augmentation helper keeps its own list; confirm.
language = ["es", "de", "fr", "ar", "te", "hi", "ja", "fa", "sq", "bg", "nl", "gu", "ig", "kk", "mt", "ps"]

In [None]:
# Map each label to its number of rows in the cleaned frame.
label_count = df.label.value_counts().to_dict()

In [None]:
# Row count that under-represented labels are topped up towards in the first
# augmentation pass.
max_label_count = 7

In [None]:
## First augmentation pass: every label with fewer than max_label_count rows
## gets extra texts produced by the project helper functions.data_Aug.
##
## DataFrame.append was removed in pandas 2.0, so the pieces are collected in
## lists and joined with pd.concat (this also avoids re-copying the growing
## frame on every iteration).
augmented_parts = []   # pieces of the augmented dataframe
for label, count in label_count.items():
    count_diff = max_label_count - count    ## Rows missing to reach the target
    ## Number of augmented texts requested per existing text; this is 0 when
    ## the label already has at least max_label_count rows.
    multiplication_count = math.ceil(count_diff / count)
    if multiplication_count:
        old_parts = []
        new_parts = []
        for message in df.loc[df["label"] == label, "text"]:
            ## Existing row of the minority class
            dummy1 = pd.DataFrame([message], columns=['text'])
            dummy1["label"] = label
            old_parts.append(dummy1)

            ## Augmented rows derived from that message
            new_messages = functions.data_Aug(message, multiplication_count)
            dummy2 = pd.DataFrame(new_messages, columns=['text'])
            dummy2["label"] = label
            new_parts.append(dummy2)

        old_message_df = pd.concat(old_parts)
        new_message_df = pd.concat(new_parts)

        ## Keep a random selection of at most count_diff augmented rows
        new_message_df = new_message_df.take(
            np.random.permutation(len(new_message_df))[:count_diff])

        ## Existing rows first, then the selected augmented rows
        augmented_parts.extend([old_message_df, new_message_df])
    else:
        ## Label is already large enough: keep its rows unchanged
        augmented_parts.append(df[df["label"] == label])

## pd.concat rejects an empty list, so fall back to an empty frame
newdf = pd.concat(augmented_parts) if augmented_parts else pd.DataFrame()

In [None]:
# (rows, columns) of the frame produced by the first augmentation pass.
newdf.shape

In [None]:
# Recount the rows per label in the augmented frame; this overwrites the
# label_count computed from the original data.
label_count = newdf.label.value_counts().to_dict()
label_count

In [None]:
# Row count that under-represented labels are topped up towards in the second
# augmentation pass.
max_label_count= 15

In [None]:
# Second augmentation pass: starting from newdf, every label with fewer than
# max_label_count rows gets extra texts produced by functions.data_Aug.
#
# DataFrame.append was removed in pandas 2.0, so the pieces are collected in
# lists and joined with pd.concat.
augmented_parts = []   # pieces of the augmented dataframe
for label, count in label_count.items():
    count_diff = max_label_count - count    ## Rows missing to reach the target
    ## Number of augmented texts requested per existing text; this is 0 when
    ## the label already has at least max_label_count rows.
    multiplication_count = math.ceil(count_diff / count)
    if multiplication_count:
        old_parts = []
        new_parts = []
        for message in newdf.loc[newdf["label"] == label, "text"]:
            ## Existing row of the minority class
            dummy1 = pd.DataFrame([message], columns=['text'])
            dummy1["label"] = label
            old_parts.append(dummy1)

            ## Augmented rows derived from that message
            new_messages = functions.data_Aug(message, multiplication_count)
            dummy2 = pd.DataFrame(new_messages, columns=['text'])
            dummy2["label"] = label
            new_parts.append(dummy2)

        old_message_df = pd.concat(old_parts)
        new_message_df = pd.concat(new_parts)

        ## Keep a random selection of at most count_diff augmented rows
        new_message_df = new_message_df.take(
            np.random.permutation(len(new_message_df))[:count_diff])

        ## Existing rows first, then the selected augmented rows
        augmented_parts.extend([old_message_df, new_message_df])
    else:
        ## Label is already large enough: keep its rows unchanged. They are
        ## taken from newdf — the frame this pass iterates and counts — rather
        ## than from df, so rows added by the first pass cannot be dropped
        ## if the two targets are ever changed.
        augmented_parts.append(newdf[newdf["label"] == label])

## pd.concat rejects an empty list, so fall back to an empty frame
newdf2 = pd.concat(augmented_parts) if augmented_parts else pd.DataFrame()

##### We avoid fully balancing the classes because our goal was to narrow the huge gap between classes while keeping the distribution as close to reality as possible

In [None]:
## Rows per label after the second augmentation pass.
newdf2.label.value_counts()

In [None]:
# Report how many fully duplicated rows exist after augmentation, then drop
# them (the first occurrence of each row is kept).
print(newdf2.duplicated().sum())
new_clean_df = newdf2.drop_duplicates()

In [None]:
# Optional export of the de-duplicated augmented data (currently disabled).
# new_clean_df.to_csv('augmented_capital_dataset.csv',index= False)

# Models Training

In [None]:
# Load a previously saved augmented dataset from CSV.
# NOTE(review): new_df is not used by the other visible cells (training
# continues from new_clean_df) — confirm whether this load is still needed.
new_df = pd.read_csv('augmented_dataset.csv', sep=',')

In [None]:
# Continue with a copy of the de-duplicated augmented data; this rebinds df.
df = new_clean_df.copy()
df.head()

In [None]:
# Split the frame into model inputs via the project helper
# functions.load_dataset. Presumably X holds the texts, y_label the label
# names and y their numeric ids — confirm against the helper.
X, y_label,y = functions.load_dataset(df)

In [None]:
# Put the two label outputs of functions.load_dataset side by side so the
# name-to-id mapping can be inspected.
labels_frame = pd.DataFrame()
labels_frame['label'] = y_label
labels_frame['label_id']= y

In [None]:
# Show the distinct (label, label_id) pairs, ordered by id.
labels_frame.groupby(['label','label_id']).count().sort_values(by=['label_id'])

In [None]:
# Embed the texts with the 'all-MiniLM-L6-v2' SentenceTransformer model through
# the project helper functions.embed; X is replaced by the helper's result.
model_embed = SentenceTransformer('all-MiniLM-L6-v2')
X= functions.embed(X,model_embed)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples as a test set; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25,
                                                    random_state = 0)

In [None]:
# Candidate classifiers to compare; random_state is fixed where it is set so
# repeated runs give the same result.
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    LogisticRegression(random_state=0),
]

In [None]:
# Number of cross-validation folds.
CV = 5
# Placeholder frame with one row per (model, fold) combination; cv_df is
# rebuilt from the collected scores once cross-validation has run.
cv_df = pd.DataFrame(index=range(CV * len(models)))

In [None]:
# Cross-validate every candidate model on the training split and collect one
# (model name, fold index, accuracy) row per fold.
entries = []
for model in models:
    model_name = type(model).__name__
    fold_scores = cross_val_score(
        model, X_train, y_train, scoring='accuracy', cv=CV, error_score='raise')
    entries.extend(
        (model_name, fold_idx, score) for fold_idx, score in enumerate(fold_scores))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
# Summarise the cross-validation accuracy of each model: mean and standard
# deviation over its folds.
accuracy_by_model = cv_df.groupby('model_name').accuracy
mean_accuracy = accuracy_by_model.mean()
std_accuracy = accuracy_by_model.std()

acc = pd.DataFrame({
    'Mean Accuracy': mean_accuracy,
    'Standard deviation': std_accuracy,
})
acc

In [None]:

# Box plot of the per-fold accuracies of each model; showmeans adds a marker
# for the mean of each box.
plt.figure(figsize=(8,5))
sns.boxplot(x='model_name', y='accuracy',
            data=cv_df,
            color='lightblue',
            showmeans=True)
# The title ends with a real newline (the backslash had been lost, leaving a
# literal trailing "n"), and the fold count is read from CV instead of being
# hard-coded.
plt.title(f"MEAN ACCURACY (cv = {CV})\n", size=14);

### Implement LinearSVC

In [None]:
# First split: 80% for training, 20% held out. The label ids and label names
# are split together with X so the three stay aligned row by row.
# This rebinds X_train / y_train from the earlier 75/25 split.
X_train, X_test1, y_train, y_test1, y_train_label, y_test1_label = train_test_split(X, y, y_label,
                                                    test_size=0.20,
                                                    random_state = 15)


In [None]:
# Second split: halve the held-out part into a validation set and a test set
# (each 10% of the full data given the 20% hold-out).
X_val, X_test, y_val, y_test, y_val_label, y_test_label =  train_test_split(X_test1, y_test1, y_test1_label,
                                                    test_size=0.50,
                                                    random_state = 15)


In [None]:
# Fit a LinearSVC on the training split and predict the validation set.
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

In [None]:
# Persist the fitted model with pickle.
# filename = 'Models\\LinearSVC_main_model.sav'
# os.path.join keeps the path portable: a hard-coded backslash is only a
# directory separator on Windows.
filename = os.path.join('Models', 'LinearSVC_capital_model.sav')
# Create the target directory if it is missing so the dump cannot fail on it.
os.makedirs('Models', exist_ok=True)
# The context manager closes the file; the bare open() call never closed its
# handle explicitly.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Collect label names, true label ids and predictions for the validation set.
# NOTE: the 'y_test' column holds the validation targets (y_val).
predicted_df = pd.DataFrame()
predicted_df['class']= y_val_label
predicted_df['y_test']= y_val
predicted_df['y_pred']=y_pred

In [None]:
# Inspect rows 30-49 of the prediction frame (positional slice).
predicted_df.iloc[30:50]

In [None]:
# Classification report for the validation-set predictions.
# The heading's trailing newline is restored (the backslash had been lost,
# leaving a literal "n") and its spelling corrected.
print('CLASSIFICATION METRICS\n')
print(metrics.classification_report(y_val, y_pred))

### Implement LogisticRegression Model

In [None]:
# Fit a LogisticRegression on the training split and predict the validation
# set; this rebinds model and y_pred.
model = LogisticRegression(random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

In [None]:
# Persist the fitted model with pickle.
# filename = 'Models\\LogisticRegrission_main_model.sav'
# os.path.join keeps the path portable: a hard-coded backslash is only a
# directory separator on Windows. The file name itself is unchanged.
filename = os.path.join('Models', 'LogisticRegrission_sub_model.sav')
# Create the target directory if it is missing so the dump cannot fail on it.
os.makedirs('Models', exist_ok=True)
# The context manager closes the file; the bare open() call never closed its
# handle explicitly.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Collect label names, true label ids and predictions for the validation set.
# NOTE: the 'y_test' column holds the validation targets (y_val).
predicted_df = pd.DataFrame()
predicted_df['class']= y_val_label
predicted_df['y_test']= y_val
predicted_df['y_pred']=y_pred

In [None]:
# Classification report for the validation-set predictions.
# The heading's trailing newline is restored (the backslash had been lost,
# leaving a literal "n") and its spelling corrected.
print('CLASSIFICATION METRICS\n')
print(metrics.classification_report(y_val, y_pred))

In [None]:
# Inspect rows 20-39 of the prediction frame (positional slice).
predicted_df[20:40]

### Implement RandomForestClassifier Model

In [None]:
# Fit a RandomForestClassifier on the training split and predict the
# validation set.
# The estimator used to be assigned to a misspelled name (`mosel`) with a
# trailing comma, which made it a one-element tuple; `model` therefore still
# referred to the previously fitted classifier and the forest was never
# trained, evaluated or saved.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

In [None]:
# Persist the fitted model with pickle.
# os.path.join keeps the path portable: a hard-coded backslash is only a
# directory separator on Windows.
filename = os.path.join('Models', 'RandomForestClassifier_main_model.sav')
# Create the target directory if it is missing so the dump cannot fail on it.
os.makedirs('Models', exist_ok=True)
# The context manager closes the file; the bare open() call never closed its
# handle explicitly.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Collect label names, true label ids and predictions for the validation set.
# NOTE: the 'y_test' column holds the validation targets (y_val).
predicted_df = pd.DataFrame()
predicted_df['class']= y_val_label
predicted_df['y_test']= y_val
predicted_df['y_pred']=y_pred

In [None]:
# Classification report for the validation-set predictions.
# The heading's trailing newline is restored (the backslash had been lost,
# leaving a literal "n") and its spelling corrected.
print('CLASSIFICATION METRICS\n')
print(metrics.classification_report(y_val, y_pred))

In [None]:
# Inspect rows 20-39 of the prediction frame (positional slice).
predicted_df[20:40]