# Exploring the dataset

In [None]:
import pandas as pd
import json
import os

In [None]:
# Combine every CSV of codes (labels) in the project folder into one CSV file.
path = 'C:\\Users\\user\\Desktop\\AI projects\\nlp_project_files\\'
combined_csv_name = 'all_codes.csv'

# Read only real CSV inputs: the folder also holds the JSON dataset, and the
# combined file written below must not be read back in on a re-run.
csv_files = [
    file for file in sorted(os.listdir(path))
    if file.lower().endswith('.csv') and file != combined_csv_name
]
frames = [pd.read_csv(os.path.join(path, file)) for file in csv_files]

# Concatenate once instead of growing the frame inside the loop.
all_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(all_df.shape)
# `all_df.to_csv` was previously referenced without being called, so nothing
# was ever written to disk.
all_df.to_csv(os.path.join(path, combined_csv_name), index=False)


In [None]:
# Load the JSON file into a DataFrame.
json_file = "C:\\Users\\user\\Desktop\\AI projects\\nlp_project_files\\kone_classification.json"
# JSON text is UTF-8 by specification; stating it explicitly keeps the platform
# default codec from mangling accented characters.
with open(json_file, encoding="utf-8") as f:
    data = json.load(f)
# The frame does not need the open file, so build it after the handle is closed.
df_json = pd.DataFrame(data)

In [None]:
# Preview the first 15 rows of the loaded dataset.
df_json.head(15)

In [None]:
# DataFrame.info() prints its own report and returns None, so it is called on
# its own line; passing it to print() appended a stray "None" to the output.
print("The description of the dataset is: ")
df_json.info()
print("The number of labels in the dataset is: ", df_json['label'].nunique())
# Non-null entries per column for each language (culture).
df_json.groupby('culture').count()

In [None]:
# Per-label non-null counts, ordered from the most to the least frequent label.
per_label_counts = df_json.groupby('label').count()
per_label_counts.sort_values(by=['text'], ascending=False)

#### We observe that the dataset is imbalanced: a large share of the labels are minority classes with only one or two samples

In [None]:
# Keep only the rows whose source is TRAINING (this leaves out the workflow rows).
is_training = df_json['source'] == 'TRAINING'
df_json_training = df_json.loc[is_training, :]
df_json_training

In [None]:
# Restrict the training rows to the French (fr-fr) culture.
is_french = df_json_training['culture'] == 'fr-fr'
df_json_training_fr = df_json_training.loc[is_french, :]

In [None]:
# Per-label non-null counts in the French training subset, most frequent first.
per_label_counts_fr = df_json_training_fr.groupby('label').count()
per_label_counts_fr.sort_values(by=['text'], ascending=False)

# Preprocessing the text data


In [None]:

import re

# some text cleaning functions
def convert_to_lower(text):
    """Return *text* with every cased character lowercased."""
    lowered = text.lower()
    return lowered

def remove_numbers(text):
    """Replace each run of consecutive digits in *text* with a single space."""
    return re.sub(r'\d+', " ", text)

def remove_extra_white_spaces(text):
    """Drop isolated single letters and collapse runs of whitespace.

    parameters: text: string to clean
    return: the string with every single ASCII letter that stands between
            whitespace removed, and each whitespace run reduced to one space
    """
    # Lookarounds keep the surrounding whitespace out of the match, so adjacent
    # single letters ("x a b c y") are all removed. A pattern that consumes the
    # whitespace skips every other one.
    without_sc = re.sub(r'(?<=\s)[a-zA-Z](?=\s)', "", text)
    # Collapse whitespace runs, including the gaps left by the removals above.
    return re.sub(r'\s+', " ", without_sc)

def remove_special_char(text):
    """Replace every character that is neither a word character nor whitespace.

    parameters: text: string to clean
    return: the string with each such character replaced by one space
    """
    # The former second alternative (.:,*") was parsed as "any character, a
    # colon, optional commas, a quote" and deleted a legitimate letter in front
    # of ':"'. The character class alone already covers . : , * and ".
    special_char = r'[^\w\s]'
    return re.sub(pattern=special_char, repl=" ", string=text)


In [None]:
def clean_data(df):
    """Normalize the 'text' column and drop duplicated rows.

    parameters: df: dataframe with a string 'text' column
    return: a new, cleaned dataframe; the dataframe passed in is left untouched
    """
    # Work on a copy: callers pass a .loc slice of a larger frame, and writing
    # to it in place alters the caller's data and raises SettingWithCopyWarning.
    df = df.copy()
    # Order matters: each step runs on the output of the previous one.
    cleaning_steps = (
        convert_to_lower,
        remove_numbers,
        remove_extra_white_spaces,
        remove_special_char,
    )
    for step in cleaning_steps:
        df['text'] = df['text'].apply(step)
    return df.drop_duplicates()

In [None]:
# Clean the text of the French training subset.
df_cleaned = clean_data(df_json_training_fr)

In [None]:
# Independent copy of the cleaned data, used as `df` by the augmentation cells.
df = df_cleaned.copy()

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Text Augmentation

In [None]:
# Smoke-test the Contextual Word Embeddings Augmenter (BERT) on one sentence of the dataset
import nlpaug.augmenter.word as naw

aug = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-uncased', aug_p=0.1)
text = df['text'].iloc[51]
# Produce three independent augmentations of the same sentence.
for _ in range(3):
    augmented_text = aug.augment(text)
    print("Augmented Text:", augmented_text, sep="\n")
print("Original:", text, sep="\n")


In [None]:
import nlpaug.augmenter.word as naw
# Module-level augmenter read by data_Aug; aug_p=0.1 is passed through to nlpaug.
aug = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-uncased', aug_p=0.1)

def data_Aug(messege,aug_range=1):
    """ Function for augmenting data using Contextual Word Embeddings Augmenter (BERT)
    parameters: messege: text from the dataset
                aug_range: number of augmentation passes to run

    return : list of augmented messages as plain strings (empty if aug_range < 1)   """

    augmented_messages = []
    for _ in range(aug_range):
        augmented = aug.augment(messege)
        # augment() may hand back a single string or a list of strings; calling
        # str() on a list would store the literal "['...']" as the message text.
        if isinstance(augmented, str):
            augmented_messages.append(augmented)
        elif augmented is not None:
            augmented_messages.extend(str(item) for item in augmented)

    return augmented_messages

In [None]:
## Function for augmenting data using language translation
## Could not find a free service for language translation; use a paid service such as Azure or Google Translate

from textblob import TextBlob
from textblob.translate import NotTranslated
import random
# Random source used to pick a pivot language for each translation.
sr = random.SystemRandom()

# Language codes that can be chosen as the pivot language.
language = [
    "es", "de", "fr", "ar",
    "te", "hi", "ja", "fa",
    "sq", "bg", "nl", "gu",
    "ig", "kk", "mt", "ps",
]

def data_augmentation(message, language, aug_range=1):
    """ Function for augmenting data using round-trip (back) translation
    parameters: message: text to augment (str, or bytes encoded as UTF-8)
                language: sequence of language codes to choose the pivot from
                aug_range: number of augmented messages to produce

    return : list of aug_range strings   """
    # NOTE(review): the return hop is hard-coded to "en" while the notebook
    # keeps fr-fr rows -- confirm the intended target language.
    augmented_messages = []
    if hasattr(message, "decode"):
        message = message.decode("utf-8")

    for _ in range(aug_range):
        try:
            ## Converting to random language for meaningful variation
            pivot = TextBlob(message).translate(to=sr.choice(language))
            text = pivot.translate(to="en")
        except NotTranslated:
            # Fall back to the untouched message when either hop fails; before,
            # a failed return hop left the pivot-language text in the output.
            text = message
        augmented_messages.append(str(text))

    return augmented_messages

In [None]:
# Map each label to its number of rows in the cleaned data.
label_count = df.label.value_counts().to_dict()

In [None]:
# Row count each label is topped up to in the first augmentation pass.
max_label_count = 7

In [None]:
## First augmentation pass: top up every label that has fewer than max_label_count rows
import numpy as np
import math

## DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
## call, so the pieces are collected here and concatenated once at the end.
frames = []
for label, count in label_count.items():
    count_diff = max_label_count - count    ## Difference to fill
    if count_diff <= 0:
        ## Label already has enough rows: keep them unchanged
        frames.append(df[df["label"] == label])
        continue

    ## Augmented copies requested per existing message so the difference is covered
    multiplication_count = math.ceil(count_diff / count)
    messages = df.loc[df["label"] == label, "text"]

    ## Existing minority class batch
    old_message_df = pd.DataFrame(list(messages), columns=['text'])
    old_message_df["label"] = label

    ## New augmented batch generated from the existing minority class
    new_messages = [
        augmented
        for message in messages
        for augmented in data_Aug(message, multiplication_count)
    ]
    new_message_df = pd.DataFrame(new_messages, columns=['text'])
    new_message_df["label"] = label

    ## Select random data points from augmented data
    new_message_df = new_message_df.take(np.random.permutation(len(new_message_df))[:count_diff])

    ## Merge existing and augmented data points
    frames.extend([old_message_df, new_message_df])

## The augmented dataframe (fresh 0..n-1 index)
newdf = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Size of the data after the first augmentation pass.
newdf.shape

In [None]:
# Recount the rows per label after the first augmentation pass.
label_count = newdf.label.value_counts().to_dict()
label_count

In [None]:
## Row count each label is topped up to in the second augmentation pass.
## A fixed value is used; the commented-out lines would instead take the size of the largest class.
# import operator
# max_label_count = max(label_count.items(), key=operator.itemgetter(1))[1]
max_label_count= 15

In [None]:
import numpy as np
import math

## Second augmentation pass: top up every label of newdf that has fewer than max_label_count rows.
## DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
## call, so the pieces are collected here and concatenated once at the end.
frames = []
for label, count in label_count.items():
    count_diff = max_label_count - count    ## Difference to fill
    if count_diff <= 0:
        ## Label already has enough rows: keep them unchanged. The rows are read
        ## from newdf (not df) because label_count was computed from newdf.
        frames.append(newdf[newdf["label"] == label])
        continue

    ## Augmented copies requested per existing message so the difference is covered
    multiplication_count = math.ceil(count_diff / count)
    messages = newdf.loc[newdf["label"] == label, "text"]

    ## Existing minority class batch
    old_message_df = pd.DataFrame(list(messages), columns=['text'])
    old_message_df["label"] = label

    ## New augmented batch generated from the existing minority class
    new_messages = [
        augmented
        for message in messages
        for augmented in data_Aug(message, multiplication_count)
    ]
    new_message_df = pd.DataFrame(new_messages, columns=['text'])
    new_message_df["label"] = label

    ## Select random data points from augmented data
    new_message_df = new_message_df.take(np.random.permutation(len(new_message_df))[:count_diff])

    ## Merge existing and augmented data points
    frames.extend([old_message_df, new_message_df])

## The augmented dataframe (fresh 0..n-1 index)
newdf2 = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

##### We avoid making the classes completely balanced: the goal was to narrow the huge gap between classes while keeping the distribution as close to reality as possible

In [None]:
## Show the number of rows per label after the second augmentation pass
newdf2.label.value_counts()

In [None]:
# Number of rows that exactly repeat an earlier row.
newdf2.duplicated().sum()

In [None]:
# Drop the repeated rows counted above.
new_clean_df = newdf2.drop_duplicates()

In [None]:
# Integer id per label, numbered in order of first appearance. assign() builds a
# new frame, so no column is written into the object returned by
# drop_duplicates (which raises SettingWithCopyWarning).
new_clean_df = new_clean_df.assign(label_id=new_clean_df['label'].factorize()[0])

------------------------------------------------------------------------------------------------------------------------------------------------------------

## Sentence embedding using Sentence Transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used by the cells below.
# NOTE(review): all-MiniLM-L6-v2 is presumably an English-focused checkpoint, while
# the rows kept earlier are fr-fr -- confirm whether a multilingual checkpoint fits better.
model_embed = SentenceTransformer('all-MiniLM-L6-v2')

def embed(data, model_embed):
    """Encode the values of *data* with the given embedding model.

    parameters: data: object exposing its sentences through ``.values``
                model_embed: object with an ``encode`` method
    return: whatever ``model_embed.encode`` returns for those values
    """
    return model_embed.encode(data.values)

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Train the model

In [None]:
from sklearn.preprocessing import LabelEncoder
# load the dataset
def load_dataset(df):
    """Split a dataframe into its texts and integer-encoded labels.

    parameters: df: dataframe with 'text' and 'label' columns
    return: (X, y) where X is the 'text' column and y is the result of
            LabelEncoder().fit_transform on the 'label' column
    """
    text_and_label = df[['text', 'label']]
    X = text_and_label['text']
    encoder = LabelEncoder()
    y = encoder.fit_transform(text_and_label['label'])
    return X, y

In [None]:
# From here on `df` is the augmented, de-duplicated data.
df = new_clean_df.copy()
df.head()

In [None]:
# X: text column, y: encoded labels.
X, y = load_dataset(df)

In [None]:
# Replace the raw texts with their sentence embeddings.
X= embed(X,model_embed)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing, with a fixed seed.
# NOTE(review): the split happens after augmentation, so augmented variants of one
# original message can fall on both sides -- confirm test scores are not inflated.
split = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

# Candidate classifiers compared by cross-validation.
random_forest = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
linear_svc = LinearSVC()
logistic_regression = LogisticRegression(random_state=0)
models = [random_forest, linear_svc, logistic_regression]

In [None]:
# Number of cross-validation folds
CV = 5
# Empty frame with one row per (model, fold) pair
n_rows = CV * len(models)
cv_df = pd.DataFrame(index=range(n_rows))

In [None]:
# One (model name, fold index, accuracy) record per model and fold.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(
        model,
        X_train,
        y_train,
        scoring='accuracy',
        cv=CV,
        error_score='raise',
    )
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
# Mean and standard deviation of the fold accuracies for each model.
accuracy_by_model = cv_df.groupby('model_name').accuracy
mean_accuracy = accuracy_by_model.mean()
std_accuracy = accuracy_by_model.std()

# Side-by-side summary table; the keys become the column names.
acc = pd.concat(
    [mean_accuracy, std_accuracy],
    axis=1,
    keys=['Mean Accuracy', 'Standard deviation'],
)
acc

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Box plot of the per-fold accuracies of each model, with the mean marked.
plt.figure(figsize=(8,5))
sns.boxplot(x='model_name', y='accuracy',
            data=cv_df,
            color='lightblue',
            showmeans=True)
# The title used to end in a literal "n" (a newline escape that had lost its
# backslash) and hard-coded the fold count; it now follows CV.
plt.title(f"MEAN ACCURACY (cv = {CV})\n", size=14);

In [None]:
# Fresh 75/25 split (seed 15) for fitting and evaluating the chosen model.
final_split = train_test_split(X, y, test_size=0.25, random_state=15)
X_train, X_test, y_train, y_test = final_split


In [None]:
# Fit a LinearSVC on the training split (fit returns the estimator itself),
# then predict the held-out samples.
model = LinearSVC().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
import pickle

In [None]:
# Persist the fitted classifier to disk.
filename = 'C:\\Users\\user\\Desktop\\AI projects\\nlp_project_files\\LinearSVC_model.sav'
# The context manager closes the file handle even if pickling fails; the bare
# open() call left it open.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Actual and predicted classes of the test split, side by side.
predicted_df = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

In [None]:
# Display the actual vs. predicted classes.
predicted_df

In [None]:
# Classification report (precision, recall and F1 per class)
# The header had a typo and ended in a literal "n" (a newline escape without its backslash).
print('CLASSIFICATION METRICS\n')
print(metrics.classification_report(y_test, y_pred))

In [None]:
def predict(file):
    """ Function to predict the classes for each entity using saved LinearSVC model
    parameter: file: path of a JSON file with 'text', 'label', 'source' and 'culture' fields
    return: dataframe has the actual classes (y_test) and the predicted classes (y_pred) """

    # Read the file the caller passed in; the parameter used to be ignored in
    # favour of the global json_file. UTF-8 is the encoding JSON specifies.
    with open(file, encoding="utf-8") as f:
        df_json = pd.DataFrame(json.load(f))

    # choose the training source and drop the workflow
    df_json_training = df_json.loc[df_json['source'] == 'TRAINING', :]
    # choose the French culture
    df_json_training_fr = df_json_training.loc[df_json_training['culture'] == 'fr-fr', :]
    # data preprocessing
    df_cleaned = clean_data(df_json_training_fr)

    # NOTE(review): load_dataset fits a fresh LabelEncoder on this file, so the
    # y_test ids only line up with the model's classes if the file holds the
    # same label set the model was trained on -- confirm.
    X, y_test = load_dataset(df_cleaned)
    # embeddings for the new test dataset
    new_X_test = embed(X, model_embed)

    # NOTE: unpickling runs arbitrary code; only load model files you created.
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    y_pred = loaded_model.predict(new_X_test)

    # data frame that shows the prediction result
    return pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

-------------------------------------------------------------------------------------------------------------------------------------------------------

## Sentence embedding using transformer

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings along dim 1, weighted by the attention mask.

    parameters: model_output: indexable whose first element holds the token embeddings
                attention_mask: mask tensor; it is expanded to the embeddings' size
    return: masked sum of the embeddings divided by the mask sum (floored at 1e-9)
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the embedding dimension so it can weight each value.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(1)
    # The floor avoids a division by zero when a row of the mask is all zeros.
    weight_total = weights.sum(1).clamp(min=1e-9)
    return weighted_sum / weight_total


# Load the tokenizer and the model of one checkpoint from the Hugging Face model repository
checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

In [None]:
# Collect the texts to embed as a plain list
sentences = list(df['text'].values)

# Tokenize with padding, truncation at 128 tokens and PyTorch tensors as output
tokenizer_options = dict(padding=True, truncation=True, max_length=128, return_tensors='pt')
encoded_input = tokenizer(sentences, **tokenizer_options)

# Forward pass without gradient tracking
with torch.no_grad():
    model_output = model(**encoded_input)

# Mask-aware mean pooling of the token embeddings
attention_mask = encoded_input['attention_mask']
sentence_embeddings = mean_pooling(model_output, attention_mask)

In [None]:
# Shape of the pooled sentence embeddings.
sentence_embeddings.shape

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------