# Exploring the dataset

In [None]:
import pandas as pd
import json
import os

In [None]:
# create a csv file for all codes (labels)
path = 'C:\\Users\\user\\Desktop\\AI projects\\nlp_project_files\\'
# NOTE(review): the original cell only referenced `all_df.to_csv` without
# calling it, so no destination was ever given. This file name is a
# placeholder -- adjust it to the intended output location.
combined_csv_name = 'all_labels.csv'
# Skip the JSON dataset and a previously written combined file, so that
# re-running the cell does not read its own output back in.
skipped_files = {'kone_classification.json', combined_csv_name}

# Read every file once and concatenate a single time; concatenating inside
# the loop re-copies the accumulated frame on each iteration.
frames = []
for file in os.listdir(path):
    if file not in skipped_files:
        # `df` stays bound to the last file read, as in the original cell.
        df = pd.read_csv(os.path.join(path, file))
        frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
all_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(all_df.shape)
all_df.to_csv(os.path.join(path, combined_csv_name), index=False)


In [None]:
# open the JSON file as a dataframe
json_file = "C:\\Users\\user\\Desktop\\AI projects\\nlp_project_files\\kone_classification.json"
# Decode explicitly as UTF-8: without an encoding, open() uses the platform
# default (cp1252 on Windows), which corrupts accented French characters.
with open(json_file, encoding='utf-8') as f:
    data = json.load(f)
# The parsed data is already in memory, so the frame is built after the file closes.
df_json = pd.DataFrame(data)

In [None]:
# Preview the first 15 rows of the frame built from the JSON file
df_json.head(15)

In [None]:
# Summary statistics and label cardinality for the whole dataset
dataset_summary = df_json.describe()
print("The description of the dataset is: \n", dataset_summary)
distinct_labels = df_json['label'].nunique()
print("The number of labels in the dataset is: ", distinct_labels)
# Non-null cell counts per column, one row per culture (language)
df_json.groupby('culture').count()

In [None]:
# Rank labels from most to least frequent (ordered by the count of 'text' values)
rows_per_label = df_json.groupby('label').count()
rows_per_label.sort_values(by='text', ascending=False)

In [None]:
# Keep only the rows whose source is TRAINING (this drops the workflow rows)
is_training = df_json['source'] == 'TRAINING'
df_json_training = df_json[is_training]
df_json_training

In [None]:
# Collect the training rows flagged as duplicates by DataFrame.duplicated()
is_repeat = df_json_training.duplicated()
duplicateRows = df_json_training.loc[is_repeat]


In [None]:
# drop duplicated rows
# drop_duplicates() returns a new frame and leaves the original untouched, so
# the result must be rebound or the duplicates stay in the data.
df_json_training = df_json_training.drop_duplicates()
df_json_training

In [None]:
# choose the French culture
# .copy() makes the subset an independent frame, so the later assignments to
# its 'text' column do not raise SettingWithCopyWarning on a slice.
df_json_training_fr = df_json_training.loc[df_json_training['culture']=='fr-fr',:].copy()

In [None]:
# Number of distinct document and annotation ids in the French training subset
for id_column in ('document_id', 'annotation_id'):
    print(df_json_training_fr[id_column].nunique())
# Number of distinct labels in the French training subset
fr_label_total = df_json_training_fr['label'].nunique()
print('the unique number of labels is: ', fr_label_total)

In [None]:
# Rank the French labels from most to least frequent (ordered by the count of 'text' values)
fr_rows_per_label = df_json_training_fr.groupby('label').count()
fr_rows_per_label.sort_values(by='text', ascending=False)

# Preprocessing the text data


In [None]:

import re

# some text cleaning functions
def convert_to_lower(text):
    """Return *text* with every cased character lower-cased."""
    lowered = text.lower()
    return lowered

def remove_numbers(text):
    """Replace each run of digits in *text* with a single space."""
    return re.sub(r'\d+', " ", text)

def remove_extra_white_spaces(text):
    """Drop stand-alone single ASCII letters and the whitespace around them.

    Despite its name, this function removes single-letter tokens: each run of
    one or more isolated ASCII letters (whitespace on both sides) is replaced,
    together with the surrounding whitespace, by a single space.

    The previous pattern consumed the whitespace after a letter as part of
    the match, so the next isolated letter no longer had leading whitespace
    and was kept ("x a b y" became "x b y"). Matching the whole run at once
    removes all of them.

    parameters: text: string to clean

    return : the cleaned string
    """
    # One or more "<whitespace><letter>" groups followed by whitespace.
    single_char_run_pattern = r'(?:\s+[a-zA-Z])+\s+'
    return re.sub(pattern=single_char_run_pattern, repl=" ", string=text)

def remove_special_char(text):
    """Replace every character that is neither a word character nor whitespace with a space.

    The previous pattern carried a second alternative, `.:,*"`, which is not
    a character class: it matched "any character, a colon, optional commas,
    a double quote" and so deleted the letter or digit standing before `:"`.
    All of the punctuation it was meant to list is already covered by
    `[^\\w\\s]`, so only that class is kept. Underscores count as word
    characters and are left in place.

    parameters: text: string to clean

    return : the cleaned string (same length as the input)
    """
    special_char_pattern = r'[^\w\s]'
    return re.sub(pattern=special_char_pattern, repl=" ", string=text)
# Run each cleaning step, in order, over the text column of the full dataset
for cleaning_step in (convert_to_lower, remove_numbers,
                      remove_extra_white_spaces, remove_special_char):
    df_json['text'] = df_json['text'].apply(cleaning_step)

In [None]:
# Apply the same cleaning pipeline to the French training subset:
# lower-case -> strip digits -> drop single letters -> strip punctuation
df_json_training_fr['text'] = (
    df_json_training_fr['text']
    .apply(convert_to_lower)
    .apply(remove_numbers)
    .apply(remove_extra_white_spaces)
    .apply(remove_special_char)
)

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Text Augmentation

In [None]:
# Try the augmenter on a single sentence from the dataset
import nlpaug.augmenter.word as naw

text = df_json_training_fr['text'].iloc[199]
aug = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-uncased', aug_p=0.1)
augmented_text = aug.augment(text)

# Each label is followed by its value on the next line
print("Original:", text, sep="\n")
print("Augmented Text:", augmented_text, sep="\n")

In [None]:
# Module-level augmenter read by data_Aug below; same model and aug_p as the
# single-sentence trial in the previous cell.
import nlpaug.augmenter.word as naw
aug = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-uncased', aug_p=0.1)

def data_Aug(messege,aug_range=1):
    """ Function for augmenting data using Contextual Word Embeddings Augmenter (BERT)

    Calls the module-level `aug` augmenter `aug_range` times on the message.

    parameters: messege: text from the dataset
                aug_range: number of augmentation passes to run

    return : list of augmented messages (strings)   """

    augmented_messages = []
    for _ in range(aug_range):
        augmented = aug.augment(messege)
        if isinstance(augmented, (list, tuple)):
            # Recent nlpaug releases return a list even for a single input;
            # str() of that list would store "['...']" instead of the text.
            augmented_messages.extend(str(item) for item in augmented)
        else:
            augmented_messages.append(str(augmented))

    return augmented_messages

In [None]:
## Map each label to its number of rows in the French training subset
label_count = df_json_training_fr['label'].value_counts().to_dict()

In [None]:
## Size of the largest class: the target that minority classes are augmented towards
import operator
max_label_count = max(label_count.values())

In [None]:
## Build the augmented dataframe: for every minority label, keep its existing
## messages and add augmented ones until it reaches max_label_count rows.
import numpy as np
import math

# Per-label frames are collected here and concatenated once at the end.
# DataFrame.append was removed in pandas 2.0, and appending row by row
# re-copies the accumulated frame on every call.
balanced_parts = []
for label, count in label_count.items():
    count_diff = max_label_count - count    ## Difference to fill
    ## Augmentations needed per existing message to cover the difference
    multiplication_count = math.ceil(count_diff / count)
    if not multiplication_count:
        # NOTE(review): labels already at max_label_count are skipped, as in
        # the original cell, so the largest class is absent from newdf --
        # confirm whether it should be added back.
        continue

    label_messages = df_json_training_fr.loc[df_json_training_fr["label"] == label, "text"]

    ## Existing minority class batch
    old_message_df = pd.DataFrame(list(label_messages), columns=['text'])
    old_message_df["label"] = label

    ## New augmented batch created from the existing minority class
    augmented_texts = []
    for message in label_messages:
        augmented_texts.extend(data_Aug(message, multiplication_count))
    new_message_df = pd.DataFrame(augmented_texts, columns=['text'])
    new_message_df["label"] = label

    ## Randomly keep at most count_diff of the augmented data points
    new_message_df = new_message_df.take(np.random.permutation(len(new_message_df))[:count_diff])

    ## Existing rows first, then the sampled augmented rows
    balanced_parts.extend([old_message_df, new_message_df])

# pd.concat raises on an empty list, so fall back to an empty frame.
# ignore_index gives newdf a clean 0..n-1 index instead of repeated labels.
newdf = pd.concat(balanced_parts, ignore_index=True) if balanced_parts else pd.DataFrame()

In [None]:
## Show the number of rows per label in the augmented dataframe
newdf.label.value_counts()

------------------------------------------------------------------------------------------------------------------------------------------------------------

## Sentence embedding using sentence transformer

In [None]:
from sentence_transformers import SentenceTransformer

# Load the pretrained 'all-MiniLM-L6-v2' sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# NOTE(review): in the visible cells `df` is only assigned inside the CSV-loading
# loop, so it holds the last label file read, not the JSON text data -- confirm
# that this is the intended frame and that it has a 'text' column.
sentences = df['text'].values
#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)


In [None]:
# Display the computed embeddings
embeddings

In [None]:
# Print each sentence with its embedding, followed by a blank line
for sentence, embedding in zip(sentences, embeddings):
    print(f"Sentence: {sentence}\nEmbedding: {embedding}\n")
    

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Sentence embedding using transformer

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch


# Mean pooling: average the token embeddings, weighted by the attention mask
def mean_pooling(model_output, attention_mask):
    """Return the mask-weighted mean of the token embeddings along dimension 1.

    parameters: model_output: sequence whose first element holds the token embeddings
                attention_mask: tensor multiplied into the embeddings after being
                                expanded to their shape; entries of 0 remove a token
                                from the average

    return : summed masked embeddings divided by the summed mask
    """
    per_token = model_output[0]
    # Add a trailing axis and expand so the mask matches the embeddings' shape.
    mask = attention_mask.unsqueeze(-1).expand(per_token.size()).float()
    masked_total = (per_token * mask).sum(dim=1)
    # Clamp so a fully masked row divides by 1e-9 rather than by zero.
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / mask_total


#Load the tokenizer and AutoModel from the huggingface model repository
# NOTE: this rebinds `model`, replacing the SentenceTransformer instance
# created in the earlier sentence-transformers section.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
#Sentences we want sentence embeddings for
# NOTE(review): in the visible cells `df` is only assigned inside the CSV-loading
# loop, so it holds the last label file read -- confirm this is the intended frame.
sentences = list(df['text'].values)
#Tokenize sentences (padded to a common length, truncated to at most 128 tokens, returned as PyTorch tensors)
# All sentences are tokenized and passed to the model in one call.
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')

#Compute token embeddings (gradient tracking disabled: inference only)
with torch.no_grad():
    model_output = model(**encoded_input)

#Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

In [None]:
# Show the shape of the pooled sentence embeddings
sentence_embeddings.shape

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Display the cleaned text of the row at position 199 of the French training subset
df_json_training_fr['text'].iloc[199]

In [None]:
import spacy
# Load spaCy's "fr_core_news_sm" French pipeline
nlp = spacy.load("fr_core_news_sm")

In [None]:
# Keep a reference to spaCy's French stop-word collection
# (not used by any other cell visible in this notebook)
spacy_stopwords = spacy.lang.fr.stop_words.STOP_WORDS

In [None]:
# Run the spaCy pipeline on the text at position 18 of the French training subset
doc = nlp(df_json_training_fr['text'].iloc[18])
# Print the text held by the resulting doc
print(doc.text)

In [None]:
# Keep only the tokens that spaCy does not flag as stop words
filtered_sent = [word for word in doc if not word.is_stop]
print(filtered_sent)