In [None]:
!wget https://raw.githubusercontent.com/JoseCaliz/dotfiles/main/css/custom_css.css 2>/dev/null 1>&2
    
from IPython.core.display import HTML
with open('./custom_css.css', 'r') as file:
    custom_css = file.read()

HTML(custom_css)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cmap
import matplotlib.colors as mpl_colors

def hex_to_rgb(h):
    """Convert a hex colour string into an (r, g, b) tuple of floats in [0, 1].

    Leading '#' characters are stripped, then the first three two-character
    groups are parsed as base-16 integers and divided by 255.
    """
    digits = h.lstrip('#')
    channels = []
    for start in range(0, 6, 2):
        channels.append(int(digits[start:start + 2], 16) / 255)
    return tuple(channels)

# Notebook-wide colour palette, kept both as hex strings and as RGB float tuples.
cluster_colors_hex = [
    '#b4d2b1', '#568f8b', '#1d4a60',
    '#cd7e59', '#ddb247', '#d15252',
]
cluster_colors_rgb = list(map(hex_to_rgb, cluster_colors_hex))

# NOTE: this rebinds `cmap`, which until here was the `matplotlib.cm` alias
# created by the import at the top of this cell.
cmap = mpl_colors.ListedColormap(cluster_colors_rgb)
colors = cmap.colors
bg_color = '#fdfcf6'

# Matplotlib rc overrides handed to seaborn together with the theme.
custom_params = {
    'figure.figsize': (16, 6),
    'figure.facecolor': bg_color,
    'axes.facecolor': bg_color,
    'axes.titlesize': 'Large',
    'axes.labelsize': 'Large',
    "axes.spines.right": False,
    "axes.spines.top": False,
    'grid.alpha': 0.3,
}

theme_palette = sns.color_palette(cluster_colors_hex)
sns.set_theme(style='whitegrid', palette=theme_palette, rc=custom_params)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# E-commerce text classification

<img src='https://storage.googleapis.com/kaggle-datasets-images/2093102/3477380/44b0c6a2ff03a89726b1baa56fa36af6/dataset-cover.jpg?t=2022-04-17-12-35-05' style='width:50%; margin-left:auto; margin-right:auto'/>

<span id='toc'/>

This style is taken from Jose Caliz's notebook from [here](https://www.kaggle.com/code/jcaliz/tps-sep22-eda-baseline-you-were-looking-for).

# Table of Contents
1. [Table of Contents](#Table-of-Contents)
1. [EDA](#eda)
1. [Preprocessing](#prepro)
1. [Modelling](#mdl)
1. [Hyperparameter tuning](#hpt)

# EDA

<span id='eda'/>

In [None]:
# Load the dataset. The CSV has no header row, so the two columns are named here.
df_input = pd.read_csv(
    '../input/ecommerce-text-classification/ecommerceDataset.csv',
    header=None,
    names=['Target', 'Text'],
)
df_input.head()

In [None]:
# List the distinct class labels found in the 'Target' column
df_input['Target'].unique()

In [None]:
# Summarise the frame with DataFrame.info() to check column dtypes and non-null counts
df_input.info()

In [None]:
# Only a single null was found (in the 'Text' column), so dropping the affected row is safe.
# Rebinding the name instead of mutating in place keeps the cell easy to re-run.
df_input = df_input.dropna()

In [None]:
# Length, in characters, of the longest document in the 'Text' column
df_input['Text'].str.len().max()

In [None]:
# Histogram of per-document character counts, drawn through the returned Axes
doc_lengths = df_input['Text'].str.len()
ax = doc_lengths.hist(bins=100)
ax.set_title('Distribution of document lengths (in characters)')
ax.set_xlabel('Number of characters')
ax.set_ylabel('Number of documents')
plt.show()

In [None]:
# Check whether the target labels are imbalanced.
target_val_counts = df_input['Target'].value_counts()
sns.barplot(x=target_val_counts.index, y=target_val_counts)
plt.title('Distribution of target labels across all the documents')
# The x axis carries the class names and the y axis the per-class document
# counts; the labels now say so (they were previously copied from the histogram).
plt.xlabel('Target label')
plt.ylabel('Number of documents')
plt.show()

In [None]:
# The same class distribution, shown as percentage shares in a pie chart
fig, ax = plt.subplots()
ax.pie(
    target_val_counts,
    labels=target_val_counts.index,
    colors=colors,
    autopct='%.0f%%',
)
ax.set_title('Distribution of target labels across all the documents')
plt.show()

It looks like the data is almost balanced, except for the **household** class, whose share is slightly higher than the 25% each class would have in a perfectly balanced dataset. Ideally, we would oversample the minority classes or undersample the majority class if we wanted to eliminate this imbalance completely.

In [None]:
# Word cloud over the whole corpus, with the wordcloud package's built-in stop words removed
from wordcloud import WordCloud, STOPWORDS

corpus_text = ' '.join(df_input['Text'])
cloud_builder = WordCloud(stopwords=STOPWORDS)
desc_cloud = cloud_builder.generate(corpus_text)

plt.imshow(desc_cloud)
plt.axis("off")
plt.show()

Although most of the data has household items description, we can't see any keywords that suggest this. We could try updating the stopwords and then re-generating this wordcloud.

Also, it is interesting to see words like "Stainless Steel", which might suggest that the items belong to household products. The word "India" also appears, which suggests that the dataset was collected from an Indian e-commerce website.

# Preprocessing

<span id='prepro'/>

## Remove punctuations

In [None]:
# Remove punctuation from every document
import string

# Build the deletion table once instead of once per row: it maps every
# character in string.punctuation to None, which str.translate drops.
punctuation_table = str.maketrans('', '', string.punctuation)
df_input['Text'] = df_input['Text'].str.translate(punctuation_table)

# Verify that it actually worked
df_input['Text'][0]

## Remove URLs (if any)

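It looks like there aren't any URLs, at least according to the regex pattern used below. Note, however, that punctuation was already stripped in the previous step, so any `https://` prefix has lost its `://` and can no longer match the pattern; to detect URLs reliably, this check would have to run before punctuation removal.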

In [None]:
import re

# An http(s) URL, with optional leading whitespace, followed by whitespace or end of text
URL_PATTERN = r'\s*https?://\S+(\s+|$)'
url_regex = re.compile(URL_PATTERN)

# Number of pattern matches per document, then the distinct counts observed
df_input['urlcount'] = df_input['Text'].map(lambda text: len(url_regex.findall(text)))
df_input['urlcount'].unique()

## Convert to lowercase

In [None]:
# Lower-case every document with the vectorised string accessor
df_input['Text'] = df_input['Text'].str.lower()

# Spot-check the document labelled 0
df_input['Text'][0]

## Tokenization

In [None]:
from nltk.tokenize import word_tokenize

# Split each document into a list of tokens; the tokenizer is passed directly, no lambda needed
df_input['tokenized_text'] = df_input['Text'].map(word_tokenize)
df_input['tokenized_text'].head()

## Remove stopwords

In [None]:
# Import the NLP library
import nltk

# English stop words shipped with NLTK (kept as a list under the original name)
stopwords = nltk.corpus.stopwords.words('english')
# Set copy for constant-time membership tests; scanning the list for every
# token of every document is needlessly slow.
stopword_set = set(stopwords)

# Filter the token list of every document in the 'tokenized_text' column
df_input['tokenized_text'] = df_input['tokenized_text'].apply(
    lambda tokens: [word for word in tokens if word not in stopword_set]
)
df_input['tokenized_text'].head()

## Stemming

The process of stemming converts a word to its root form. 
For example:- "programmer", "programming" ==> "program"

But, it might also convert to meaningless words in some cases.
Ex:- "Goose" ==> "Goos"

In [None]:
# Demonstrate NLTK's Snowball stemmer on a single document
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')

# Stem every token of the document labelled 0
first_doc_tokens = df_input['tokenized_text'][0]
print(list(map(stemmer.stem, first_doc_tokens)))

As you can see, it has converted a lot of words into something meaningless. This would certainly reduce the size of the final vocabulary, but we lose information this way, so it is not recommended.

## Lemmatization

You could say this is an improved version of stemming... without losing the meaning of the underlying words.

In [None]:
# Download NLTK's 'omw-1.4' resource, which the lemmatizer used below needs
nltk.download('omw-1.4')

In [None]:
# Set up the WordNet lemmatizer and try it on a single document
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize every token of the document labelled 0
sample_tokens = df_input['tokenized_text'][0]
print(list(map(lemmatizer.lemmatize, sample_tokens)))

In [None]:
# Lemmatize the tokens of every document
df_input['tokenized_text'] = df_input['tokenized_text'].map(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)
df_input['tokenized_text'].head()

# Modelling

<span id='mdl'/>

We use one of the simplest feature extraction techniques in text analytics, **TfidfVectorizer**, which computes the Term Frequency and Inverse Document Frequency (TF-IDF) of each term in order to represent every text document in the corpus as a numeric vector.

Other ways one could do modelling is to use deep learning based models like LSTM/RNNs or Transformer based models like BERT to encode the input text into a latent feature space (embeddings) and then use these as features to classify into the target labels.

In [None]:
# Use Tf-Idf vectorizer on the already preprocessed, tokenized text
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy_fun(doc):
    """Identity function: hand back `doc` exactly as it was received."""
    unchanged = doc
    return unchanged

# The documents are already token lists, so both the preprocessor and the
# tokenizer are identity pass-throughs.
tfidf_options = dict(
    analyzer='word',
    tokenizer=dummy_fun,  # This can't be None, hence using a dummy function
    preprocessor=dummy_fun,
    token_pattern=None,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# NOTE(review): the vectorizer is fitted on every document in df_input; if a
# train/test split is made from X afterwards, the IDF weights will have seen
# the test documents as well. Confirm whether that is acceptable here.
X = vectorizer.fit_transform(df_input['tokenized_text'].values)
X.shape

In [None]:
# Turn the string labels into integer codes, then carve out a hold-out set.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_input['Target'].values)

# 10% of the rows go to the test split; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
print(
    'Shapes of X_train, X_test, y_train, y_test',
    X_train.shape, X_test.shape, y_train.shape, y_test.shape,
)

In [None]:
# Inspect the vocabulary (feature names) held by the fitted vectorizer
vectorizer.get_feature_names_out()

In [None]:
# Define a common evaluation function that gives classification report and cross-validation score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

def evaluate_model(sk_model, X_train_in, y_train_in, folds=5, X_test_in=None, y_test_in=None):
    """Fit a classifier, print its hold-out report and its mean cross-validation score.

    Parameters
    ----------
    sk_model : estimator
        Object exposing ``fit`` and ``predict``.
    X_train_in, y_train_in
        Training features and labels, used both for the fit and for
        cross-validation.
    folds : int, default 5
        Number of cross-validation folds.
    X_test_in, y_test_in : optional
        Hold-out features and labels for the classification report. When
        omitted, the notebook-level ``X_test`` / ``y_test`` are used, so
        existing three-argument calls keep working.

    Returns
    -------
    None
        All results are printed.
    """
    X_eval = X_test if X_test_in is None else X_test_in
    y_eval = y_test if y_test_in is None else y_test_in

    # Train on the data that was passed in, not on the notebook globals.
    sk_model.fit(X_train_in, y_train_in)
    y_pred = sk_model.predict(X_eval)
    print(classification_report(y_eval, y_pred))

    # Honour `folds`, so the printed fold count matches what was actually run.
    cv_score = cross_val_score(sk_model, X_train_in, y_train_in, cv=folds)
    print('Model:', sk_model.__class__.__name__)
    print('{}-fold mean CV score:'.format(folds), round(cv_score.mean(), 4))

In [None]:
# Baseline: multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

baseline_model = MultinomialNB()
evaluate_model(baseline_model, X_train, y_train)

In [None]:
# SGD-trained linear classifier; with the 'hinge' loss this is a linear SVM
from sklearn.linear_model import SGDClassifier

sgd_settings = dict(loss='hinge', penalty='l2', alpha=1e-3, max_iter=10, random_state=42)
clf = SGDClassifier(**sgd_settings)
evaluate_model(clf, X_train, y_train)

In [None]:
# Multinomial logistic regression model
from sklearn.linear_model import LogisticRegression

# The explicit `multi_class='multinomial'` argument is dropped: it is deprecated
# in recent scikit-learn releases, and with the default solver a multiclass
# target is already fitted as a multinomial model.
# NOTE(review): confirm against the installed scikit-learn version.
clf = LogisticRegression(max_iter=200)
evaluate_model(clf, X_train, y_train)

# Hyperparameter optimization

<span id='hpt'/>

Try optimizing the hyperparameters to get more accuracy hopefully without overfitting. GridSearchCV from sklearn package can be used.

In [None]:
from sklearn.model_selection import GridSearchCV

# `eta0` is left out of the grid on purpose: SGDClassifier ignores it under its
# default learning-rate schedule ('optimal'), so searching over it only
# multiplied the number of fits without changing any model.
hyper_parameters = {
    'loss': ['hinge', 'modified_huber'],
    'alpha': [1e-3, 0.01, 0.1],
    'max_iter': [10, 50, 100],
}

# Seed the estimator so the search gives the same answer on every run.
clf = GridSearchCV(SGDClassifier(random_state=42), hyper_parameters)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)