# Exploring the dataset

In [None]:
import pandas as pd
import json
import os

In [None]:
# Merge every label/code CSV in the project folder into a single CSV file.
path = 'C:\\Users\\user\\Desktop\\AI projects\\nlp_project_files\\'
# Name of the merged output; it is skipped on input so that re-running the
# cell does not feed the previous result back into itself.
merged_csv_name = 'all_codes.csv'
skipped_files = {'kone_classification.json', merged_csv_name}

# Read all frames first and concatenate once: calling pd.concat inside the
# loop re-copies the accumulated rows on every iteration.
frames = [
    pd.read_csv(os.path.join(path, file))
    for file in os.listdir(path)
    if file not in skipped_files
]
# pd.concat raises on an empty list, so fall back to an empty frame.
all_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(all_df.shape)
# The original line only referenced the method (`all_df.to_csv`) without
# calling it, so no file was ever written.
all_df.to_csv(os.path.join(path, merged_csv_name), index=False)


In [None]:
# Open the JSON file and load it into a DataFrame.
json_file = "C:\\Users\\user\\Desktop\\AI projects\\nlp_project_files\\kone_classification.json"
# JSON is UTF-8 by specification; without an explicit encoding open() uses
# the platform locale encoding, which can mangle accented (French) text.
with open(json_file, encoding="utf-8") as f:
    data = json.load(f)
# Only the parsed data is needed, so build the frame after the file is closed.
df_json = pd.DataFrame(data)

In [None]:
# Preview the first 15 rows of the loaded dataset.
df_json.head(15)

In [None]:
# Summary statistics and the number of distinct labels in the dataset.
print(f"The description of the dataset is: \n {df_json.describe()}")
print(f"The number of labels in the dataset is:  {df_json['label'].nunique()}")
# Per-column non-null counts for each value of `culture`.
(
    df_json
    .groupby('culture')
    .count()
)

In [None]:
# Per-label counts, ordered from the most to the least frequent label
# (ranked by the count of the `text` column).
(
    df_json
    .groupby('label')
    .count()
    .sort_values(by='text', ascending=False)
)

In [None]:
# Keep only the rows whose `source` is TRAINING (per the original note, this
# leaves out the workflow rows), then display the result.
df_json_training = df_json[df_json['source'].eq('TRAINING')]
df_json_training

In [None]:
# Collect the rows flagged by DataFrame.duplicated() as repeats of another row.
duplicateRows = df_json_training.loc[df_json_training.duplicated()]


In [None]:
# Drop exact duplicate rows. drop_duplicates() returns a new frame rather than
# modifying in place, so the result must be assigned back; the original call
# discarded it and left the duplicates in the training data.
df_json_training = df_json_training.drop_duplicates()
df_json_training

In [None]:
# Restrict the training data to the rows whose culture is 'fr-fr' (French).
df_json_training_fr = df_json_training[df_json_training['culture'].eq('fr-fr')]

In [None]:
# Number of distinct document ids and annotation ids in the French training set.
for id_column in ('document_id', 'annotation_id'):
    print(df_json_training_fr[id_column].nunique())
# Number of distinct labels in the French training set.
print('the unique number of labels is: ', df_json_training_fr['label'].nunique())

In [None]:
# Per-label counts in the French training set, ordered from the most to the
# least frequent label (ranked by the count of the `text` column).
(
    df_json_training_fr
    .groupby('label')
    .count()
    .sort_values(by='text', ascending=False)
)

# Preprocessing the text data


In [None]:

import re

# some text cleaning functions
def convert_to_lower(text):
    """Return ``text`` with every cased character converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_numbers(text):
    """Replace each run of consecutive digits in ``text`` with a single space."""
    return re.sub(r'\d+', " ", text)

def remove_extra_white_spaces(text):
    """Drop stand-alone single ASCII letters from ``text``.

    Despite its name, this helper removes one-letter "words": a single
    ``a-z``/``A-Z`` character with whitespace on both sides. Each such
    stretch, together with its surrounding whitespace, becomes one space.

    The previous pattern matched one letter at a time and consumed the
    whitespace after it, so in a run such as ``"x a b y"`` the next letter
    had no leading whitespace left to match and survived. Matching the whole
    run of single letters at once fixes that, and gives the same result as
    before for isolated single letters.
    """
    single_char_run = r'(?:\s+[a-zA-Z])+\s+'
    return re.sub(pattern=single_char_run, repl=" ", string=text)

def remove_special_char(text):
    """Replace every character that is neither a word character nor
    whitespace with a single space.

    The previous pattern carried a second alternative, ``.:,*"``, which is
    not a character class: it matched *any* character followed by a colon,
    optional commas and a double quote, so input such as ``a:"b`` lost the
    letter ``a`` as well. ``[^\\w\\s]`` alone already covers all punctuation.
    """
    special_char = r'[^\w\s]'
    return re.sub(pattern=special_char, repl=" ", string=text)
# Clean the `text` column of the full dataset by applying the helpers in the
# same order as before: lower-casing, digits, single letters, special chars.
for clean_step in (
    convert_to_lower,
    remove_numbers,
    remove_extra_white_spaces,
    remove_special_char,
):
    df_json['text'] = df_json['text'].apply(clean_step)

In [None]:
# Clean the French training texts with the same helper sequence used for the
# full dataset: lower-casing, digits, single letters, special characters.
#
# df_json_training_fr is a filtered slice of another frame, so assigning to
# one of its columns in place triggers pandas' SettingWithCopyWarning. Build
# the cleaned column separately and rebind the name to a new frame instead.
cleaned_text = df_json_training_fr['text']
for clean_step in (
    convert_to_lower,
    remove_numbers,
    remove_extra_white_spaces,
    remove_special_char,
):
    cleaned_text = cleaned_text.apply(clean_step)
df_json_training_fr = df_json_training_fr.assign(text=cleaned_text)

------------------------------------------------------------------------------------------------------------------------------------------------------------

## Implement ML classification

In [None]:
# Work on a copy of the French training frame for the first example, which follows
# https://www.analyticsvidhya.com/blog/2021/11/a-guide-to-building-an-end-to-end-multiclass-text-classification-model/
df = df_json_training_fr.copy()
# Encode each label as an integer code stored in a new 'category_id' column.
df['category_id'] = pd.factorize(df['label'])[0]

In [None]:
# Label <-> id lookup table with one row per distinct label. Without
# drop_duplicates() the table repeats the same pair once for every document.
category_id_df = df[['label', 'category_id']].drop_duplicates()

In [None]:
# Display the label / category_id table.
category_id_df

In [None]:
# Lookup dictionaries for later use: label -> id and id -> label.
category_to_id = dict(category_id_df.to_numpy())
id_to_category = dict(category_id_df.loc[:, ['category_id', 'label']].to_numpy())
# Preview the frame, which now carries the category_id column.
df.head()

In [None]:
import matplotlib.pyplot as plt

# Visualize how many texts there are per label.

fig = plt.figure(figsize=(15,10))
texts_per_label = df.groupby('label').text.count().sort_values()
# The counts are sorted ascending, so the last bars belong to the most
# frequent labels: highlight up to three of them and grey out the rest.
# The colour list is sized from the data; the previous hard-coded list only
# lined up when there were exactly 13 labels.
n_highlight = min(3, len(texts_per_label))
colors = (['grey'] * (len(texts_per_label) - n_highlight)
          + ['darkblue'] * n_highlight)
texts_per_label.plot.barh(
    ylim=0, color=colors, title= 'NUMBER OF texts IN EACH label')
plt.xlabel('Number of ocurrences', fontsize = 10)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

# French stop words (imported above as fr_stop), as a list for scikit-learn.
final_stopwords_list = [*fr_stop]
tfidf = TfidfVectorizer(
    stop_words=final_stopwords_list,
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
)
# Turn every text into a dense TF-IDF vector; the encoded labels are the target.
features = tfidf.fit_transform(df.text).toarray()
labels = df.category_id
print(
    "Each of the {:d} text is represented by {:d} features (TF-IDF score of unigrams and bigrams)"
    .format(*features.shape)
)

In [None]:
# Show the distinct category ids present in the target column.
labels.unique()


In [None]:
from sklearn.feature_selection import chi2
import numpy as np

# Find the N terms most correlated with each label (chi-squared test of every
# TF-IDF feature against a one-vs-rest indicator of the label).
N = 3
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); prefer the new name and fall back only where it
# does not exist. The vocabulary does not depend on the label, so fetch it
# once instead of on every iteration.
_get_feature_names = (getattr(tfidf, 'get_feature_names_out', None)
                      or tfidf.get_feature_names)
all_feature_names = np.array(_get_feature_names())
for Product, category_id in sorted(category_to_id.items()):
    features_chi2 = chi2(features, labels == category_id)
    # Ascending sort: the most correlated terms end up last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    # The leading "\n" had lost its backslash and printed a literal "n".
    print("\n==> %s:" %(Product))
    print("  * Most Correlated Unigrams are: %s" %(', '.join(unigrams[-N:])))
    print("  * Most Correlated Bigrams are: %s" %(', '.join(bigrams[-N:])))

In [None]:

from sklearn.model_selection import train_test_split
# Documents (X) and the encoded labels to predict (y).
X, y = df['text'], df['category_id']
# Hold out 25% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

# Candidate classifiers compared in the cross-validation below.
models = [
    RandomForestClassifier(random_state=0, max_depth=5, n_estimators=100),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

In [None]:
# Number of cross-validation folds.
CV = 5
# Empty placeholder frame with one row per (model, fold) pair.
cv_df = pd.DataFrame(index=range(len(models) * CV))

In [None]:
# Score every candidate model with CV-fold cross-validation on the TF-IDF
# features and record one (model name, fold index, accuracy) row per fold.
entries = []
for model in models:
    model_name = type(model).__name__
    accuracies = cross_val_score(model, features, labels, cv=CV, scoring='accuracy')
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
# Mean and standard deviation of the fold accuracies for each model.
accuracy_by_model = cv_df.groupby('model_name')['accuracy']
mean_accuracy = accuracy_by_model.mean()
std_accuracy = accuracy_by_model.std()

# Side-by-side summary table, one row per model.
acc = pd.concat(
    [mean_accuracy, std_accuracy],
    axis=1,
    keys=['Mean Accuracy', 'Standard deviation'],
)
acc

In [None]:
import seaborn as sns
# Box plot of the per-fold accuracies of every model, with the mean marked.
plt.figure(figsize=(8,5))
sns.boxplot(x='model_name', y='accuracy',
            data=cv_df,
            color='lightblue',
            showmeans=True)
# The trailing "\n" had lost its backslash, so the title ended in a literal "n".
plt.title("MEAN ACCURACY (cv = 5)\n", size=14);

In [None]:
# Re-split using the TF-IDF matrix (this rebinds the X_train/X_test names set
# by the earlier text-based split) and carry the row index through the split
# as indices_train / indices_test.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features, labels, df.index, test_size=0.25, random_state=1
)
# Fit a linear SVM on the training part and predict the held-out rows.
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Display the category ids predicted for the test rows.
y_pred

In [None]:
# Classification report for the test-set predictions.
# The heading was misspelled and its trailing "\n" had lost the backslash,
# which printed a literal "n" at the end of the line.
print('CLASSIFICATION METRICS\n')
print(metrics.classification_report(y_test, y_pred))

-----------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Start the second example from a fresh copy of the French training frame.
df = df_json_training_fr.copy()
# Encode each label as an integer code stored in a new 'category_id' column.
df['category_id'] = pd.factorize(df['label'])[0]

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Inspect the text at position 199 of the French training set.
df_json_training_fr['text'].iloc[199]

In [None]:
import spacy
# Load the spaCy pipeline named "fr_core_news_sm".
nlp = spacy.load("fr_core_news_sm")

In [None]:
# Import the stop-word set explicitly. `import spacy` by itself does not
# guarantee that the `spacy.lang.fr.stop_words` submodule has been loaded, so
# the attribute chain only works when something else imported it earlier.
from spacy.lang.fr.stop_words import STOP_WORDS as spacy_stopwords

In [None]:
# Run the spaCy pipeline on the text at position 18 and echo the parsed text.
sample_text = df_json_training_fr['text'].iloc[18]
doc = nlp(sample_text)
print(doc.text)

In [None]:
# Keep only the tokens of `doc` that are not flagged as stop words.
filtered_sent = [word for word in doc if not word.is_stop]
print(filtered_sent)