In [1]:
import os
import sys

#Import config file. Update config.py according to your environment
import config

import pandas as pd
import numpy as np

import tensorflow as tf

from Rakuten_preprocessing import Rakuten_img_path

from src.text.classifiers import MLClassifier

from src.utils.batch import fit_save_all


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

2024-03-10 17:11:47.347534: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-10 17:11:47.388230: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-10 17:11:47.388266: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-10 17:11:47.389498: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory

## Loading data

In [2]:
def _read_split(csv_name, is_test):
    """Read one index CSV from config.path_to_data and tag it with a boolean `testset` column."""
    split = pd.read_csv(os.path.join(config.path_to_data, csv_name))
    split['testset'] = is_test
    return split


def _join_text_fields(row):
    """Lower-case every string field of `row` and join them with single spaces.

    Non-string fields (e.g. NaN for a missing description) are skipped.
    """
    return ' '.join(field.lower() for field in row if isinstance(field, str))


# Train and test indexes are stacked into one frame; `testset` tells them apart
data_train = _read_split('df_train_index.csv', False)
data_test = _read_split('df_test_index.csv', True)
data = pd.concat([data_train, data_test], axis=0)

# Text columns merged into the `tokens` column
# (swap in ['designation', 'description'] to use the alternative column pair)
colnames = ['designation_translated', 'description_translated']
data['tokens'] = data[colnames].apply(_join_text_fields, axis=1)

# Path to each product image, stored in the `img_path` column
data['img_path'] = Rakuten_img_path(
    img_folder=config.path_to_images,
    imageid=data['imageid'],
    productid=data['productid'],
    suffix='_resized',
)

In [3]:
# Lookup table of class designations, indexed (and sorted) by encoded class index.
# For each designation the first `prdtypeindex` encountered is kept.
class_labels = (
    data.groupby('prdtypedesignation')['prdtypeindex']
    .first()
    .reset_index()
    .set_index('prdtypeindex')
    .sort_index()
)

## Creating train and test sets

In [4]:
# Boolean row masks for the two splits
in_test = data['testset']
in_train = ~in_test

# Image paths, text tokens and targets of each split
Img_train, Img_test = data.loc[in_train, 'img_path'], data.loc[in_test, 'img_path']
Txt_train, Txt_test = data.loc[in_train, 'tokens'], data.loc[in_test, 'tokens']
y_train, y_test = data.loc[in_train, 'prdtypeindex'], data.loc[in_test, 'prdtypeindex']

# Any of our sklearn classifiers expects X_train and X_test to be
# dataframes with the columns `tokens` and `img_path`
X_train = pd.DataFrame({'tokens': Txt_train, 'img_path': Img_train})
X_test = pd.DataFrame({'tokens': Txt_test, 'img_path': Img_test})

# Train rows followed by test rows, used for cross-validated scores
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

# Number of distinct classes over the whole data set
num_classes = np.unique(data['prdtypeindex']).size

## Dummy classifier

In [None]:
#Dummy classifier on tfidf
# Baseline run: build an MLClassifier with base_name='dummyclassifier', fit it on the
# train split, score it on the test split, cross-validate it on all data, then save it.
# NOTE(review): no vec_method is passed here, so "tfidf" is presumably the
# MLClassifier default vectorization — confirm.
dum_classifier = MLClassifier(base_name='dummyclassifier')
dum_classifier.fit(X_train, y_train);
# Not the last expression of the cell, so the notebook does not display its return value
dum_classifier.classification_score(X_test, y_test)
# X, y are the train and test rows concatenated; cv=10 is presumably the number of folds — confirm
cv_scores = dum_classifier.cross_validate(X, y, cv=10)
# NOTE(review): 'text/dummy' is presumably resolved against a project-level models folder — confirm
dum_classifier.save('text/dummy')

## Bag-of-words based benchmarks

In [None]:
# Summary csv file the batch results are saved to
result_file_name = 'results_benchmark_text.csv'

# Classifier wrapper used for every entry of the batch
class_type = 'MLClassifier'

# Default number of folds for the grid search
nfolds_grid = 0

# Default number of folds for the cross-validated f1-score
nfolds_cv = 0

# Bag-of-words vectorization
vec_method = 'tfidf'


def _tfidf_spec(base_name, param_grid, n_grid=nfolds_grid, n_cv=nfolds_cv):
    """Build one batch entry for `fit_save_all`.

    Only the estimator name, its parameter grid and (optionally) the fold
    counts vary between entries; modality, class and vectorization are shared.
    """
    return {
        'modality': 'text',
        'class': class_type,
        'base_name': base_name,
        'vec_method': vec_method,
        'param_grid': param_grid,
        'nfolds_grid': n_grid,
        'nfolds_cv': n_cv,
    }


# Parameters to batch over, one entry per estimator
params_list = [
    _tfidf_spec('LogisticRegression', {'C': [2], 'penalty': ['l2']}),
    _tfidf_spec('MultinomialNB', {'alpha': [0.02], 'fit_prior': [True]}),
    _tfidf_spec('RandomForestClassifier', {'n_estimators': [200], 'max_depth': [500]}),
    _tfidf_spec('xgboost', {'n_estimators': [200], 'objective': ['multi:softprob'],
                            'max_depth': [6], 'reg_alpha': [0]}),
    # LinearSVC is the only entry with a real grid over C and explicit fold counts
    _tfidf_spec('LinearSVC', {'C': np.arange(0.5, 1.5, 0.1), 'penalty': ['l2']},
                n_grid=5, n_cv=10),
]

results = fit_save_all(
    params_list,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    result_file_name=result_file_name,
)


Fitting:  LinearSVC tfidf
{'C': 0.5, 'penalty': 'l2'}
GridSearch:  {'C': 0.5, 'penalty': 'l2'}
Test set, f1score:  0.8239800861828738
CV f1score:  0.8258831911074498


## Fetch and check the saved result file

In [None]:
# Reload the saved summary file; its first CSV column is used as the index
results_path = os.path.join(config.path_to_results, 'results_benchmark_text.csv')
results = pd.read_csv(results_path, index_col=0)
results.head()

Unnamed: 0,modality,class,vectorization,classifier,tested_params,best_params,score_test,score_cv_test,score_cv_train,fit_cv_time,model_path
0,text,MLClassifier,tfidf,LinearSVC,"{'C': array([0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1...","{'C': 0.5, 'penalty': 'l2'}",0.82398,[0.82403265 0.82531629 0.82968601 0.82028095 0...,[0.96525381 0.96556959 0.96546323 0.96552612 0...,[8.3740375 8.30071878 8.0570724 8.07186079 8...,text/LinearSVC_tfidf


## Word2vec based benchmarks

In [None]:
# Word2vec benchmarks: batch of classifiers fitted on skip-gram embeddings.
params_list = []
class_type = 'MLClassifier'

#grid search number of folds
nfolds_grid = 0

#cross-validation of f1-score
nfolds_cv = 0

#Word2vec parameters
vec_method = 'skipgram'
vector_size = 500

# `num_cores` was used below without ever being defined in the notebook, so this
# cell raised NameError on a fresh kernel. os.cpu_count() may return None, hence
# the fallback to 1.
num_cores = os.cpu_count() or 1
# Leave one core free, but never request fewer than one worker (single-core machines)
w2v_workers = max(1, num_cores - 1)


def _w2v_vec_params():
    """Return a fresh single-candidate `vec_params` grid for the word2vec vectorizer.

    A new list/dict is built on every call so that batch entries do not share
    mutable objects. NOTE(review): the keys are presumably forwarded to the
    word2vec trainer by MLClassifier — confirm.
    """
    return [{'workers': w2v_workers, 'vector_size': vector_size}]


#List of parameters to batch over
params_list.append({'modality': 'text',
                    'class': class_type,
                    'base_name': 'LogisticRegression',
                    'vec_method': vec_method,
                    # 'penalty' must be a list of candidates like every other grid
                    # value (it was the bare string 'l2')
                    'param_grid': {'C': [10], 'penalty': ['l2'],
                                   'vec_params': _w2v_vec_params()},
                    'nfolds_grid': nfolds_grid, 'nfolds_cv': nfolds_cv
                    })
params_list.append({'modality': 'text',
                    'class': class_type,
                    'base_name': 'RandomForestClassifier',
                    'vec_method': vec_method,
                    'param_grid': {'n_estimators': [200], 'max_depth': [500],
                                   'vec_params': _w2v_vec_params()},
                    'nfolds_grid': nfolds_grid, 'nfolds_cv': nfolds_cv
                    })
params_list.append({'modality': 'text',
                    'class': class_type,
                    'base_name': 'xgboost',
                    'vec_method': vec_method,
                    'param_grid': {'n_estimators': [200], 'objective': ['multi:softprob'],
                                   'max_depth': [6], 'reg_alpha': [0],
                                   'vec_params': _w2v_vec_params()},
                    'nfolds_grid': nfolds_grid, 'nfolds_cv': nfolds_cv
                    })
params_list.append({'modality': 'text',
                    'class': class_type,
                    'base_name': 'SVC',
                    'vec_method': vec_method,
                    'param_grid': {'C': [10], 'kernel': ['rbf'],
                                   'vec_params': _w2v_vec_params()},
                    'nfolds_grid': nfolds_grid, 'nfolds_cv': nfolds_cv
                    })

# Results are written to the same summary file as the bag-of-words benchmarks
results = fit_save_all(params_list, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, result_file_name='results_benchmark_text.csv')

## Fetch and check the saved result file

In [None]:
# Reload the summary file after the word2vec batch and preview its first rows
summary_csv = os.path.join(config.path_to_results, 'results_benchmark_text.csv')
results = pd.read_csv(summary_csv, index_col=0)
results.head()

## Example usage of ML classifier

In [None]:
# Example: RBF-kernel SVC on skip-gram word2vec embeddings
# (the previous header said "tfidf", but vec_method is 'skipgram').

# `num_cores` was never defined in the notebook (NameError on a fresh kernel).
# os.cpu_count() may return None, hence the fallback to 1.
num_cores = os.cpu_count() or 1

svc_w2v_classifier = MLClassifier(
    base_name='SVC',
    C=10,
    kernel='rbf',
    vec_method='skipgram',
    # Leave one core free, but never request fewer than one worker
    vec_params={'workers': max(1, num_cores - 1), 'vector_size': 512},
)
svc_w2v_classifier.fit(X_train, y_train)
svc_w2v_classifier.classification_score(X_test, y_test)
# X, y are the train and test rows concatenated; cv=10 is presumably the number of folds — confirm
svc_w2v_classifier.cross_validate(X, y, cv=10)
# NOTE(review): this may be the same location the word2vec benchmark batch saves its
# SVC model to (which uses vector_size=500) — confirm that overwriting is intended
svc_w2v_classifier.save('text/SVC_skipgram')

0.7695062608525401