In [14]:
# System and utilities
import os
import re
import time
# Data manipulation and analysis
import numpy as np
import pandas as pd

# Natural Language Processing (NLP)
import re
#from langdetect import detect

# Machine Learning & Modeling
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

import sys
sys.path.append('../src/functions')
#from text_helpers import get_word2vec_features
#from text_helpers import Word2VecEmbedder

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Image processing and display
from PIL import Image
from IPython.display import display
from IPython.core.display import HTML

# Progress bars
from tqdm import tqdm

In [15]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [16]:
DATA_PATH = '../data/'
PROC_DATA_PATH = '../processed_data/'
df = pd.read_csv(PROC_DATA_PATH  + 'X_train_with_labels.csv')
#df_img_flat = pd.read_csv(DATA_PATH + 'flattened_images_32.csv')

In [8]:
df.head(1)

Unnamed: 0,designation,description,productid,imageid,prdtypecode,cat_name_fr,cat_name_en,combined
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,10,Livres d'occasion,Used Books,Olivia: Personalisiertes Notizbuch / 150 Seite...


In [None]:
all_stopwords = np.load(PROC_DATA_PATH + 'all_stopwords.npy', allow_pickle=True).tolist()
print(all_stopwords)

['notre', 'nicht', 'qualité', 'kein', 'style', 'ancien', 'fossi', 'starai', 'ont', 'nach', 'blanche', 'ebbi', 'son', 'eusse', 'aurai', 'nouvelle', 'fois', 'ob', 'au', 'col', 'five', 'nostre', 'delle', 'nei', 'sarò', 'soyons', 'gt', 'six', 'durch', 'se', 'aveva', 'fossimo', 'noir', 'war', 'werde', 'sta', 'autre', 'sua', 'ed', 'wir', 'bis', 'stette', 'dove', 'avez', 'furono', 'on', 'avremo', 'einen', 'lei', 'faceva', 'hatte', 'facendo', 'avessimo', 'ancienne', 'solches', 'tue', 'class', 'this', 'oui', 'différente', 'contre', 'gegen', 'violette', 'étais', 'warst', 'is', 'agl', 'vostre', 'serai', 'fait', 'einigem', 'mm', 'utilisation', 'en', 'donc', 'von', 'stia', 'fusse', 'vor', 'quanti', 'egrave', 'alles', 'sono', 'verte', 'édition', 'savoir', 'li', 'du', 'dass', 'sto', 'siano', 'derselbe', 'meines', 'allo', 'seront', 'avemmo', 'rsquo', 'br', 'weil', 'avuti', 'stemmo', 'by', 'wollen', 'lo', 'können', 'meine', 'avrete', 'toi', 'usage', 'mia', 'es', 'agrave', 'jener', 'stava', 'feci', 'pho

In [10]:
rs = np.random.random(66)
VECTOR_SIZE = 15000

In [11]:
vectorizer = TfidfVectorizer(
        max_features=VECTOR_SIZE,  # Limit to N number of features
        ngram_range=(1,2),  # Unigrams and bigrams
        min_df=5, max_df=0.8,  # Ignore very rare and very common terms
        lowercase=True,
        stop_words=all_stopwords,  # Keep product-specific terms
        sublinear_tf=True,  # Better for large corpora
        norm='l2'
    )

In [None]:
img_size = 128 #image size
images_df = pd.read_csv(PROC_DATA_PATH + f'flattened_images_{img_size}.csv')  # Replace with your image file path
#images_df = pd.read_csv(DATA_PATH + 'flattened_images_{128}.csv')
# Merge on both productid and imageid
df_merged = df.merge(images_df[['productid', 'imageid', 'pixels']], 
                     on=['productid', 'imageid'], 
                     how='inner')  # Only keep rows with both text and image

print(f"Original text data: {len(df)} rows")
print(f"Merged data with images: {len(df_merged)} rows")

Original text data: 84916 rows
Merged data with images: 84916 rows


In [None]:
## optional non-shared save
df_merged.to_csv(DATA_PATH + f'X_train_labeled_with_flatimgs_{img_size}.csv', index=False)

In [12]:
X_text = df_merged['combined']
X_images = df_merged['pixels']  # Already vectorized images
y = df_merged['prdtypecode'] 

### Runing train.test split before vectorization!!

In [40]:
X_text_train, X_text_test, X_img_train, X_img_test, y_train, y_test = train_test_split(X_text, X_images, y, test_size=0.2, random_state=rs, stratify=y)

In [41]:
X_text_train_tfidf = vectorizer.fit_transform(X_text_train)
X_text_test_tfidf = vectorizer.transform(X_text_test)

### this step may require some attention and arrays checking


In [None]:

# X_img_train_array = np.array(X_img_train.tolist())
# X_img_test_array = np.array(X_img_test.tolist())
def clean_image_vector(img):
    if isinstance(img, str):
        # Remove brackets and split, then filter out '...'
        values = [x for x in img.strip('[]').split() if x != '...']
        return np.array(values, dtype=np.float32)
    else:
        return np.array(img, dtype=np.float32)

X_img_train_array = np.array([clean_image_vector(img) for img in X_img_train])
X_img_test_array = np.array([clean_image_vector(img) for img in X_img_test])

print(X_img_train_array.shape)
                      

### combining text and image features

In [44]:
#Combine text and image features
from scipy.sparse import hstack, csr_matrix

X_train_joint = hstack([X_text_train_tfidf, csr_matrix(X_img_train_array)])
X_test_joint = hstack([X_text_test_tfidf, csr_matrix(X_img_test_array)])

In [45]:
print(f"Combined feature matrix shape: {X_train_joint.shape}")

Combined feature matrix shape: (67932, 10005)


In [None]:
models = {
    'Logistic Regression': LogisticRegression(
        max_iter=1000,
        C=1.0,  
        class_weight='balanced',  # Handle class imbalance
        solver='liblinear',  # Good for sparse data
        random_state=rs
    ),
    # 'SGD Classifier': SGDClassifier(
    #     loss='log_loss',
    #     alpha=0.0001,  
    #     class_weight='balanced',
    #     max_iter=1000,
    #     random_state=rs
    # ),
    'Linear SVM': LinearSVC(
        C=1.0,
        class_weight='balanced',
        max_iter=1000,
        random_state=rs
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=20,  
        class_weight='balanced',
        random_state=rs,
        n_jobs=-1
    )}

#,
    # 'XGBoost': xgb.XGBClassifier(
    #     n_estimators=100,
    #     max_depth=6,
    #     learning_rate=0.1,
    #     subsample=0.8,
    #     colsample_bytree=0.8,
    #     random_state=rs,
    #     n_jobs=-1,
    #     eval_metric='mlogloss'  # For multiclass
    # )}

### This is the main function to train and evaluate models. 
##### -  if you provide different/ partial data it should work as long as dimenstions match

In [None]:
def train_and_eval_basic(X_tr, y_tr, X_tst, y_tst, models, balance_classes=False, verbose=True):
    '''balance_classes: if True, uses SMOTE to balance classes in training data
       verbose: if True, prints additional information about class distribution and model performance
       '''
    output = {}
    label_encoder = LabelEncoder()
    y_tr_enc= label_encoder.fit_transform(y_tr)
    y_tst_enc = label_encoder.transform(y_tst)

    if balance_classes:
        print(' SMOTE for class balancing...')
        if verbose:
            print(f"Class distribution before SMOTE:")
            unique, counts = np.unique(y_tr_enc, return_counts=True)
            for cls, count in zip(unique, counts):
                if cls % 5 == 0: 
                    print(f"  Class {cls}: {count} samples")

        smote = SMOTE(random_state=rs, k_neighbors=5,sampling_strategy='auto')  
        X_tr, y_tr_enc= smote.fit_resample(X_tr, y_tr_enc)
        if verbose:
            print(f'Class distribution after SMOTE:')
            unique, counts = np.unique(y_tr_enc, return_counts=True)
            for cls, count in zip(unique, counts):
                if cls % 5 == 0: 
                    print(f"  Class {cls}: {count} samples")

    for name, model in models.items():
        print(f'Training {name}')
        
        # Train model on TRAINING data
        if balance_classes: model.set_params(class_weight=None)  

        model.fit(X_tr, y_tr_enc)
        # Predict on TEST data
        y_pred_enc = model.predict(X_tst)
        y_pred = label_encoder.inverse_transform(y_pred_enc)
        y_tst_orig = label_encoder.inverse_transform(y_tst_enc)
        
        # Calculate metrics using original labels
        f1 = f1_score(y_tst_orig, y_pred, average='macro') 
        f1_weighted = f1_score(y_tst_orig, y_pred, average='weighted')
            
        print(f'F1 Score (macro): {f1:.4f}')
        print(f'F1 Score (weighted): {f1_weighted:.4f}')

        output[name] = {
            'model': model,
            'f1_score': f1,           # Fixed: variable name
            'f1_weighted': f1_weighted,
            'predictions': y_pred
        }
        if verbose:
            print(classification_report(y_tst_orig, y_pred))
    return output

## RUN IT

In [13]:
## LONG RUNTIME WARNING:
#### everytime you train and eval the model, it will take a long time to run

In [None]:
output = train_and_eval_basic(X_train_joint, y_train, X_test_joint, y_test, 
                                models, 
                                balance_classes=True, 
                                verbose=True
                                )

 SMOTE for class balancing...
Class distribution before SMOTE:
  Class 0: 2493 samples
  Class 5: 3162 samples
  Class 10: 646 samples
  Class 15: 642 samples
  Class 20: 1137 samples
  Class 25: 2209 samples
Class distribution after SMOTE:
  Class 0: 8167 samples
  Class 5: 8167 samples
  Class 10: 8167 samples
  Class 15: 8167 samples
  Class 20: 8167 samples
  Class 25: 8167 samples
Training Logistic Regression


## RUN IT

In [42]:
output1 = train_and_eval_basic(X_train_tfidf, y_train, X_test_tfidf, y_test, models, verbose=False)

Training Logistic Regression
F1 Score (macro): 0.7636
F1 Score (weighted): 0.7828
Training SGD Classifier
F1 Score (macro): 0.7279
F1 Score (weighted): 0.7471
Training Linear SVM
F1 Score (macro): 0.7716
F1 Score (weighted): 0.7961
Training Random Forest
F1 Score (macro): 0.6144
F1 Score (weighted): 0.6267


In [48]:
## use smote for class balancing
output1s = train_and_eval_basic(X_train_tfidf, y_train, X_test_tfidf, y_test, models, balance_classes=True, verbose=True)

 SMOTE for class balancing...
Class distribution before SMOTE:
  Class 0: 2493 samples
  Class 5: 3162 samples
  Class 10: 646 samples
  Class 15: 642 samples
  Class 20: 1137 samples
  Class 25: 2209 samples
Class distribution after SMOTE:
  Class 0: 8167 samples
  Class 5: 8167 samples
  Class 10: 8167 samples
  Class 15: 8167 samples
  Class 20: 8167 samples
  Class 25: 8167 samples
Training Logistic Regression
F1 Score (macro): 0.7622
F1 Score (weighted): 0.7845
              precision    recall  f1-score   support

          10       0.42      0.54      0.47       623
          40       0.65      0.63      0.64       502
          50       0.74      0.82      0.78       336
          60       0.88      0.87      0.88       166
        1140       0.72      0.73      0.73       534
        1160       0.90      0.87      0.89       791
        1180       0.44      0.68      0.53       153
        1280       0.74      0.51      0.61       974
        1281       0.53      0.54      0.5

### VARIANT 2: using preprocessed tokens instead of raw text:

In [49]:
X=df['comb_tokens'] ## !!!!!! the only difference here
y=df['prdtypecode']

print(y.size)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state, stratify=y)
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

print("Vectorizing text...")
X_train_tfidf_tok = vectorizer.fit_transform(X_train)  # fit + transform
X_test_tfidf_tok = vectorizer.transform(X_test)        # transform only

print(f"TF-IDF matrix shape: {X_train_tfidf_tok.shape}")

84916
Training samples: 67932
Test samples: 16984
Vectorizing text...
TF-IDF matrix shape: (67932, 9999)


In [50]:
output2 = train_and_eval_basic(X_train_tfidf_tok, y_train, X_test_tfidf_tok, y_test,  models, balance_classes=True, verbose=True)

 SMOTE for class balancing...
Class distribution before SMOTE:
  Class 0: 2493 samples
  Class 5: 3162 samples
  Class 10: 646 samples
  Class 15: 642 samples
  Class 20: 1137 samples
  Class 25: 2209 samples
Class distribution after SMOTE:
  Class 0: 8167 samples
  Class 5: 8167 samples
  Class 10: 8167 samples
  Class 15: 8167 samples
  Class 20: 8167 samples
  Class 25: 8167 samples
Training Logistic Regression
F1 Score (macro): 0.6727
F1 Score (weighted): 0.7011
              precision    recall  f1-score   support

          10       0.19      0.56      0.28       623
          40       0.54      0.47      0.50       502
          50       0.64      0.65      0.64       336
          60       0.85      0.77      0.81       166
        1140       0.60      0.53      0.56       534
        1160       0.75      0.53      0.62       791
        1180       0.27      0.39      0.32       153
        1280       0.71      0.48      0.57       974
        1281       0.50      0.49      0.4

In [51]:
np.save(DATA_PATH + 'output1.npy', output1)
np.save(DATA_PATH + 'output1s.npy', output1s)
np.save(DATA_PATH + 'output2.npy', output2)