In [3]:
# System and utilities
import os
import re
import time
# Data manipulation and analysis
import numpy as np
import pandas as pd

# Natural Language Processing (NLP)
import re
from langdetect import detect
import ast

# Machine Learning & Modeling
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

## importing helper functions:
import sys
sys.path.append('../src/functions')
#from text_helpers import get_word2vec_features
#from text_helpers import Word2VecEmbedder


# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Image processing and display
from PIL import Image
from IPython.display import display
from IPython.core.display import HTML

# Progress bars
from tqdm import tqdm

In [4]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [None]:
### Loading the data and defining the paths

In [5]:
DATA_PATH = '../data/'
PROC_DATA_PATH = '../processed_data/'
df = pd.read_csv(PROC_DATA_PATH  + 'X_train_with_labels_ext.csv')
all_stopwords = np.load(PROC_DATA_PATH + 'all_stopwords.npy', allow_pickle=True).tolist() ## vecotrizer need a list
print(all_stopwords)

['notre', 'nicht', 'qualité', 'kein', 'style', 'ancien', 'fossi', 'starai', 'ont', 'nach', 'blanche', 'ebbi', 'son', 'eusse', 'aurai', 'nouvelle', 'fois', 'ob', 'au', 'col', 'five', 'nostre', 'delle', 'nei', 'sarò', 'soyons', 'gt', 'six', 'durch', 'se', 'aveva', 'fossimo', 'noir', 'war', 'werde', 'sta', 'autre', 'sua', 'ed', 'wir', 'bis', 'stette', 'dove', 'avez', 'furono', 'on', 'avremo', 'einen', 'lei', 'faceva', 'hatte', 'facendo', 'avessimo', 'ancienne', 'solches', 'tue', 'class', 'this', 'oui', 'différente', 'contre', 'gegen', 'violette', 'étais', 'warst', 'is', 'agl', 'vostre', 'serai', 'fait', 'einigem', 'mm', 'utilisation', 'en', 'donc', 'von', 'stia', 'fusse', 'vor', 'quanti', 'egrave', 'alles', 'sono', 'verte', 'édition', 'savoir', 'li', 'du', 'dass', 'sto', 'siano', 'derselbe', 'meines', 'allo', 'seront', 'avemmo', 'rsquo', 'br', 'weil', 'avuti', 'stemmo', 'by', 'wollen', 'lo', 'können', 'meine', 'avrete', 'toi', 'usage', 'mia', 'es', 'agrave', 'jener', 'stava', 'feci', 'pho

In [None]:
df.head(1)

### Checking the number of unique tokens in the dataset:
- this is useful to understand the vocabulary size and complexity of the dataset

In [None]:
full = pd.concat([pd.Series(row) for row in df['comb_tokens']])
print(len(full))
vocabulary = set(full)
print(f"Unique tokens in 'comb_tokens': {len(vocabulary)}")

### setting up a few constants

In [None]:
# Random seed for reproducibility
random_state = np.random.seed(66)
VECTOR_SIZE = 15000 ### WE have 73011 unique tokens in the text data, so 
                    #we can use this as a reference point for tfidf vector size 20-30% would be a good size

In [1]:
## Creating a TF-IDF vectorizer object

In [None]:
vectorizer = TfidfVectorizer(
        max_features=VECTOR_SIZE,  # Limit to N number of features
        ngram_range=(1,2),  # Unigrams and bigrams
        min_df=5, max_df=0.8,  # Ignore very rare and very common terms
        lowercase=True,
        stop_words=all_stopwords,  # Keep product-specific terms
        sublinear_tf=True,  # Better for large corpora
        norm='l2'
    )

### VARIANT 1: using combined text:

In [None]:
X=df['combined']
y=df['prdtypecode']

print(y.size)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state, stratify=y)
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

In [None]:
X_train_tfidf = vectorizer.fit_transform(X_train)  # fit + transform
X_test_tfidf = vectorizer.transform(X_test)        # transform only
print(f'TF-IDF matrix shape: {X_train_tfidf.shape}')

## Models Dictionary: 
##### - Add more models as needed comment out the ones you don't want to use.
##### - We might wanna read up on the parameters and to specify which ones need adjusting
##### -  We might wanna consider cross-validation for better model evaluation as a next step.

In [None]:
models = {
    'Logistic Regression': LogisticRegression(
        max_iter=1000,
        C=1.0,  
        class_weight='balanced',  # Handle class imbalance -- DO NOT USE IF SMOTE IS USED -- the function below handles that though
        solver='liblinear',  # Good for sparse data
        random_state=random_state
    ),

    'SGD Classifier': SGDClassifier(
        loss='log_loss',
        alpha=0.0001,  
        class_weight='balanced',
        max_iter=1000,
        random_state=random_state
    ),
    'Linear SVM': LinearSVC(
        C=1.0,
        class_weight='balanced',
        max_iter=1000,
        random_state=random_state
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=20,  
        class_weight='balanced',
        random_state=random_state,
        n_jobs=-1
    )}
## Uncomment the following lines to include XGBoost in the models dictionary - not sure if it works but worth a try
#,
    # 'XGBoost': xgb.XGBClassifier(
    #     n_estimators=100,
    #     max_depth=6,
    #     learning_rate=0.1,
    #     subsample=0.8,
    #     colsample_bytree=0.8,
    #     random_state=random_state,
    #     n_jobs=-1,
    #     eval_metric='mlogloss'  # For multiclass
    # )}

### This is the main function to train and evaluate models. 
- if you provide different/ partial data it should work as long as dimenstions match

In [None]:
def train_and_eval_basic(X_tr, y_tr, X_tst, y_tst, models, balance_classes=False, verbose=True):
    '''balance_classes: if True, uses SMOTE to balance classes in training data
       verbose: if True, prints additional information about class distribution and model performance
       '''
    output = {}
    label_encoder = LabelEncoder()
    y_tr_enc= label_encoder.fit_transform(y_tr)
    y_tst_enc = label_encoder.transform(y_tst)

    if balance_classes:
        print(' SMOTE for class balancing...')
        if verbose:
            print(f"Class distribution before SMOTE:")
            unique, counts = np.unique(y_tr_enc, return_counts=True)
            for cls, count in zip(unique, counts):
                if cls % 5 == 0: 
                    print(f"  Class {cls}: {count} samples")

        smote = SMOTE(random_state=random_state, k_neighbors=5,sampling_strategy='auto')  
        X_tr, y_tr_enc= smote.fit_resample(X_tr, y_tr_enc)
        if verbose:
            print(f'Class distribution after SMOTE:')
            unique, counts = np.unique(y_tr_enc, return_counts=True)
            for cls, count in zip(unique, counts):
                if cls % 5 == 0: 
                    print(f"  Class {cls}: {count} samples")

    for name, model in models.items():
        print(f'Training {name}')
        
        # Train model on TRAINING data
        if balance_classes: model.set_params(class_weight=None)  

        model.fit(X_tr, y_tr_enc)
        # Predict on TEST data
        y_pred_enc = model.predict(X_tst)
        y_pred = label_encoder.inverse_transform(y_pred_enc)
        y_tst_orig = label_encoder.inverse_transform(y_tst_enc)
        
        # Calculate metrics using original labels
        f1 = f1_score(y_tst_orig, y_pred, average='macro') 
        f1_weighted = f1_score(y_tst_orig, y_pred, average='weighted')
            
        print(f'F1 Score (macro): {f1:.4f}')
        print(f'F1 Score (weighted): {f1_weighted:.4f}')

        output[name] = {
            'model': model,
            'f1_score': f1,           # Fixed: variable name
            'f1_weighted': f1_weighted,
            'predictions': y_pred
        }
        if verbose:
            print(classification_report(y_tst_orig, y_pred))
    return output

#### RUN IT

## LONG RUNTIME WARNING:
#### everytime you train and eval the model, it will take a long time to run

In [None]:
output1 = train_and_eval_basic(X_train_tfidf, y_train, X_test_tfidf, y_test, models, verbose=False)

In [None]:
## use smote for class balancing
output1s = train_and_eval_basic(X_train_tfidf, y_train, X_test_tfidf, y_test, models, balance_classes=True, verbose=True)

### VARIANT 2: using preprocessed tokens instead of raw text:

In [None]:
X=df['comb_tokens'] ## !!!!!! the only difference here
y=df['prdtypecode']

print(y.size)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state, stratify=y)
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

print("Vectorizing text...")
X_train_tfidf_tok = vectorizer.fit_transform(X_train)  # fit + transform
X_test_tfidf_tok = vectorizer.transform(X_test)        # transform only

print(f"TF-IDF matrix shape: {X_train_tfidf_tok.shape}")

In [None]:
output2 = train_and_eval_basic(X_train_tfidf_tok, y_train, X_test_tfidf_tok, y_test,  models, balance_classes=True, verbose=True)

In [None]:
### saving the outputs to local data folder (not shared) 

In [None]:
np.save(DATA_PATH + 'output1.npy', output1)
np.save(DATA_PATH + 'output1s.npy', output1s)
np.save(DATA_PATH + 'output2.npy', output2)

### Variant with word2vec features
- we were advised against using word2vec in favour of keras.embedding layer for cnns
- i left it here for comparison and educational purposes

In [None]:
df['comb_tokens']= df['comb_tokens'].apply(ast.literal_eval) ## !!!

X=pd.DataFrame(df['comb_tokens']) ## !!!!!! the only difference here
y=df['prdtypecode']

print(y.size)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state, stratify=y)
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

X_train.head()

In [None]:
# THE VECTOR SIZE IN Word2VecEmbedder is much smaller than the one in TfidfVectorizer, so we need to adjust it
W2V_VECTOR_SIZE = 400 # Adjusted vector size for Word2VecEmbedder

In [None]:
w2v = Word2VecEmbedder(vector_size=W2V_VECTOR_SIZE)

# Train and transform training data
X_train_w2v = w2v.fit_transform(X_train['comb_tokens'])

# Transform test data with same model
X_test_w2v = w2v.transform(X_test['comb_tokens'])

In [None]:
output3 = train_and_eval_basic(X_train_w2v, y_train, X_test_w2v, y_test,  models, balance_classes=True, verbose=True)

In [6]:
output1 = np.load(DATA_PATH + 'output1.npy', allow_pickle=True).item()
output2 =np.load(DATA_PATH + 'output1s.npy', allow_pickle=True).item()


In [7]:
output1 

{'Logistic Regression': {'model': LogisticRegression(max_iter=1000, solver='liblinear'),
  'f1_score': 0.7734938224667732,
  'f1_weighted': 0.7902879939112404,
  'predictions': array([2705, 1302, 1560, ..., 1560, 1302, 1300], shape=(16984,))},
 'SGD Classifier': {'model': SGDClassifier(loss='log_loss'),
  'f1_score': 0.7342957373976658,
  'f1_weighted': 0.7529368805858434,
  'predictions': array([1281, 1302, 1560, ..., 1560, 2280, 2280], shape=(16984,))},
 'Linear SVM': {'model': LinearSVC(),
  'f1_score': 0.783122845011161,
  'f1_weighted': 0.8044003932739417,
  'predictions': array([1280, 1302, 1560, ..., 1560, 1302, 1300], shape=(16984,))},
 'Random Forest': {'model': RandomForestClassifier(max_depth=20, n_jobs=-1),
  'f1_score': 0.6164494092352807,
  'f1_weighted': 0.6270541941377651,
  'predictions': array([1280, 1302, 1560, ..., 1560,   10,   10], shape=(16984,))}}

In [8]:
output2 

{'Logistic Regression': {'model': LogisticRegression(max_iter=1000, solver='liblinear'),
  'f1_score': 0.7711741423618188,
  'f1_weighted': 0.7918192486793633,
  'predictions': array([2705, 1302, 1560, ..., 1560, 1302, 1300], shape=(16984,))},
 'SGD Classifier': {'model': SGDClassifier(loss='log_loss'),
  'f1_score': 0.7213511310781017,
  'f1_weighted': 0.7440609711593948,
  'predictions': array([1281, 1302, 1560, ..., 1560, 2280, 2280], shape=(16984,))},
 'Linear SVM': {'model': LinearSVC(),
  'f1_score': 0.7696099633981431,
  'f1_weighted': 0.7945755067565161,
  'predictions': array([1280, 1302, 1560, ..., 1560, 1302, 1300], shape=(16984,))},
 'Random Forest': {'model': RandomForestClassifier(max_depth=20, n_jobs=-1),
  'f1_score': 0.6152963304539969,
  'f1_weighted': 0.6317166737903592,
  'predictions': array([1280, 1302, 1560, ..., 1560,   10,   10], shape=(16984,))}}