In [1]:
import os
import sys
# Absolute, machine-specific location of the project checkout; must be edited to run elsewhere.
projectDir = '/mnt/c/Documents and Settings/justj/Documents/GitHub/RakutenTeam'
# Appended before the `src.*` imports below, presumably so that the project package can be resolved.
sys.path.append(projectDir)

import multiprocessing
# CPU count, used further down to size the `workers` option of the vectorizer (num_cores-1).
num_cores = multiprocessing.cpu_count()

import src.config as config
# Override the path attributes of the shared config module for this machine.
# The image and model folders are absolute paths outside the project folder.
config.path_to_project = projectDir
config.path_to_data = os.path.join(projectDir, 'data', 'clean')
config.path_to_results = os.path.join(projectDir, 'results')
config.path_to_images = '/home/jul/DST/Rakuten/Data/images/image_train_resized'
config.path_to_models = '/mnt/c/Documents and Settings/justj/Documents/DST/RakutenProject/models'


import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold

import tensorflow as tf

from Rakuten_preprocessing import Rakuten_img_path

from src.text.classifiers import MLClassifier
from src.utils.plot import classification_results

from sklearn.metrics import f1_score

2024-03-09 18:22:41.598399: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-09 18:22:41.628608: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-09 18:22:41.628637: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-09 18:22:41.629261: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-09 18:22:41.633721: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Read the train and test tables, flag each row with the split it comes from,
# then stack both into a single frame
data_train = pd.read_csv(os.path.join(config.path_to_data, 'df_train_index.csv')).assign(testset=False)
data_test = pd.read_csv(os.path.join(config.path_to_data, 'df_test_index.csv')).assign(testset=True)
data = pd.concat([data_train, data_test], axis=0)

# 'tokens' column: the string cells of the selected text columns, lower-cased
# and joined with a single space (non-string cells such as NaN are skipped)
colnames = ['designation_translated', 'description_translated']  # alternative: ['designation', 'description']
data['tokens'] = data[colnames].apply(
    lambda row: ' '.join(map(str.lower, (s for s in row if isinstance(s, str)))),
    axis=1,
)

# 'img_path' column: image location computed by the project helper
data['img_path'] = Rakuten_img_path(
    img_folder=config.path_to_images,
    imageid=data['imageid'],
    productid=data['productid'],
    suffix='_resized',
)

In [3]:
#labels of encoded classes
# Lookup table indexed by 'prdtypeindex' with the matching 'prdtypedesignation',
# ordered by index value
class_labels = (
    data.groupby('prdtypedesignation')['prdtypeindex']
    .first()
    .reset_index()
    .set_index('prdtypeindex')
    .sort_index()
)

## Creating train and test sets

In [4]:
# Training portion: rows whose 'testset' flag is False
Txt_train = data.loc[~data['testset'], 'tokens']
Img_train = data.loc[~data['testset'], 'img_path']
y_train = data.loc[~data['testset'], 'prdtypeindex']

# Test portion: rows whose 'testset' flag is True
Txt_test = data.loc[data['testset'], 'tokens']
Img_test = data.loc[data['testset'], 'img_path']
y_test = data.loc[data['testset'], 'prdtypeindex']

# The sklearn-style classifiers of this project are fed dataframes
# holding a 'tokens' column and an 'img_path' column
X_train = pd.DataFrame(dict(tokens=Txt_train, img_path=Img_train))
X_test = pd.DataFrame(dict(tokens=Txt_test, img_path=Img_test))

# Train followed by test, used for the cross-validated scores
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

# How many distinct classes appear in the whole data set
num_classes = np.unique(data['prdtypeindex']).size

## Dummy classifier

In [None]:
#Dummy classifier on tfidf
dum_classifier = MLClassifier(base_name='dummyclassifier')
dum_classifier.fit(X_train, y_train);
dum_classifier.classification_score(X_test, y_test)
cv_scores = dum_classifier.cross_validate(X, y, cv=10)
dum_classifier.save('benchmark_txt/dummy')

In [24]:
# Make sure the results folder exists before anything is written into it.
# exist_ok=True replaces the separate existence check and avoids the
# check-then-create race between the two calls.
os.makedirs(config.path_to_results, exist_ok=True)
            
def fit_save_all(params_list, cv_grid = 5, cv = 10, result_file_name = 'results.csv'):
    """Tune, evaluate, save and log a list of MLClassifier configurations.

    For every entry of ``params_list`` the function:
      1. optionally tunes the classifier with GridSearchCV on the training set,
      2. scores the retained classifier on the test set,
      3. cross-validates it on the full data set,
      4. saves the model and appends one row to the results CSV.

    The data are read from the notebook-level variables ``X_train``,
    ``y_train``, ``X_test``, ``y_test``, ``X`` and ``y``, which must exist
    when the function is called.

    Parameters
    ----------
    params_list : list of dict
        Each dict must provide the keys 'modality', 'class', 'base_name',
        'vec_method' and 'param_grid' (a missing key raises KeyError).
    cv_grid : int or None, default 5
        Number of stratified folds used by the grid search. A falsy value
        (None or 0) skips the grid search: the classifier is then fitted
        once on the training set and 'best_params' is logged as NaN.
    cv : int, default 10
        Forwarded as ``cv`` to ``clf.cross_validate``.
    result_file_name : str, default 'results.csv'
        File inside ``config.path_to_results`` receiving one row per
        configuration; it is created with a header when missing.
    """
    results_path = os.path.join(config.path_to_results, result_file_name)

    # Create an empty results file (header only) on first use.
    # 'class' is part of the header because every logged row carries it.
    if not os.path.isfile(results_path):
        df_results = pd.DataFrame(columns=['modality', 'class', 'vectorization', 'classifier', 'tested_params',
                                           'best_params', 'score_test', 'score_cv_test', 'score_cv_train',
                                           'fit_cv_time', 'model_path'])
        df_results.to_csv(results_path)

    # Only build a splitter when a grid search is requested, so that the
    # plain-fit branch below can actually be reached.
    if cv_grid:
        cvsplitter = StratifiedKFold(n_splits=cv_grid, shuffle=True, random_state=123)
    else:
        cvsplitter = None

    for params in params_list:
        # Row to be logged, pre-filled with the configuration itself
        results = {'modality': params['modality'], 'class': params['class'], 'classifier': params['base_name'],
                   'vectorization': params['vec_method'], 'tested_params': params['param_grid']}

        print('Fitting: ', params['base_name'], params['vec_method'])
        clf = MLClassifier(base_name=params['base_name'], vec_method=params['vec_method'])
        param_grid = params['param_grid']

        if cvsplitter is not None:
            # GridSearchCV over param_grid, on the training set only
            gridcv = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='f1_weighted', cv=cvsplitter)
            gridcv.fit(X_train, y_train)
            print('GridSearch: ', gridcv.best_params_)

            results['best_params'] = gridcv.best_params_

            # Continue with the best estimator found by the search
            clf = gridcv.best_estimator_
        else:
            clf.fit(X_train, y_train)
            results['best_params'] = np.nan

        # Score on the held-out test set
        f1score_test = clf.classification_score(X_test, y_test)
        print('Test set, f1score: ', f1score_test)
        results['score_test'] = f1score_test

        # Score by cross-validation on the full data set
        f1score_cv = clf.cross_validate(X, y, cv=cv)
        print('CV f1score: ', f1score_cv)

        # Per-fold test/train scores and fit times, as stored on the classifier
        results['score_cv_test'] = clf.cv_scores['test_score']
        results['score_cv_train'] = clf.cv_scores['train_score']
        results['fit_cv_time'] = clf.cv_scores['fit_time']

        # Save the model and record where it went.
        # NOTE(review): intended to be the model trained on the training set
        # only; this assumes cross_validate does not refit clf in place - confirm.
        model_path = params['modality'] + '/' + params['base_name'] + '_' + params['vec_method']
        clf.save(model_path)
        results['model_path'] = model_path

        # Append the row to the CSV after each configuration so that an
        # interrupted run keeps the rows already computed.
        # pd.concat only accepts pandas objects, so the dict is first
        # wrapped into a one-row DataFrame.
        new_row = pd.DataFrame([results])
        df_results = pd.read_csv(results_path, index_col=0)
        df_results = pd.concat([df_results, new_row], axis=0, ignore_index=True)
        df_results.to_csv(results_path)

In [22]:
# Reload 'results.csv' from the results folder (the default file written by
# fit_save_all) and preview its first rows
results_path = os.path.join(config.path_to_results, 'results.csv')
df_results = pd.read_csv(results_path, index_col=0)
df_results.head()

Unnamed: 0,modality,vectorization,classifier,tested_params,best_params,score_test,score_cv_test,score_cv_train,fit_cv_time,model_path
0,text,tfidf,LinearSVC,"{'C': array([0.9, 1. , 1.1]), 'penalty': ['l2'...","{'C': 0.9, 'dual': 'auto', 'penalty': 'l2'}",0.825805,[0.8211819 0.82424724 0.8245079 0.82306479 0...,[0.98150768 0.98217348 0.98224738 0.98188303 0...,[7.7058816 7.7486496 7.92781711 7.51118326 7...,benchmark_txt/w2v_LinearSVC


## Bag-of-words based benchmarks

In [26]:
# Kept for reference only: fit_save_all has no save_prefix parameter and builds
# the model path itself from 'modality', 'base_name' and 'vec_method'.
save_prefix = 'benchmark_txt/w2v_'
params_list = []
# Earlier tf-idf configuration, left disabled:
# params_list.append({'base_name': 'LinearSVC',
#                     'vec_method': 'tfidf',
#                     'param_grid': {'C': np.arange(0.9, 1.1, 0.1), 'penalty': ['l2'], 'dual': ['auto']}
#                     })
# fit_save_all reads the keys 'modality', 'class', 'base_name', 'vec_method'
# and 'param_grid' from each entry, so 'modality' must be provided.
# NOTE(review): the grid tunes 'kernel', which scikit-learn's LinearSVC does not
# expose; base_name='SVC' may have been intended - confirm before running.
params_list.append({'modality': 'text',
                    'class': 'MLClassifier',
                    'base_name': 'LinearSVC',
                    'vec_method': 'skipgram',
                    'param_grid': {'C': [1, 5, 10], 'kernel': ['rbf'], 'vec_params':[{'workers': num_cores-1, 'vector_size': 256}]}
                    })
fit_save_all(params_list, cv_grid=5, cv=5, result_file_name = 'results.csv')

Fitting:  LinearSVC skipgram


KeyboardInterrupt: 

In [7]:
#Logistic regression on tfidf
lr_bow_classifier = MLClassifier(base_name='LogisticRegression', vec_method = 'tfidf', C=2, penalty='l2', max_iter=1000, dual=False)
lr_bow_classifier.fit(X_train, y_train);
lr_bow_classifier.classification_score(X_test, y_test)
cv_scores = lr_bow_classifier.cross_validate(X, y, cv=10)
lr_bow_classifier.save('benchmark_txt/w2w/logistic')

0.8129309414789236

In [8]:
#Naive Bayes on tfidf
nb_bow_classifier = MLClassifier(base_name='MultinomialNB', vec_method = 'tfidf', alpha=0.02, fit_prior=True)
nb_bow_classifier.fit(X_train, y_train);
nb_bow_classifier.classification_score(X_test, y_test)

0.771265410675905

In [None]:
#Random forest on tfidf
rf_w2v_classifier = MLClassifier(base_name='RandomForestClassifier', vec_method = 'tfidf', n_estimators=100, criterion='gini', max_depth=500)
rf_w2v_classifier.fit(X_train, y_train);
rf_w2v_classifier.classification_score(X_test, y_test)

In [None]:
#xgboost on tfidf
xgb_bow_classifier = MLClassifier(base_name='xgboost', vec_method = 'tfidf', n_estimators=200, objective='multi:softprob',max_depth=6, reg_alpha=0)
xgb_bow_classifier.fit(X_train, y_train);
xgb_bow_classifier.classification_score(X_test, y_test)

0.8193118067053706

In [None]:
#Linear SVC on tfidf
svc_bow_classifier = MLClassifier(base_name='LinearSVC', vec_method = 'tfidf', C=1, penalty='l2', dual='auto')
svc_bow_classifier.fit(X_train, y_train);
svc_bow_classifier.classification_score(X_test, y_test)
# cv_scores = svc_classifier.cross_validate(X, y, cv=10)

0.7154932395658148

## Word2Vec based benchmarks

In [13]:
#SVC on tfidf
svc_w2v_classifier = MLClassifier(base_name='SVC', C=10, kernel='rbf', vec_method = 'skipgram', vec_params={'workers': num_cores-1, 'vector_size': 512})
svc_w2v_classifier.fit(X_train, y_train);
svc_w2v_classifier.classification_score(X_test, y_test)
cv_scores = svc_w2v_classifier.cross_validate(X, y, cv=10)
svc_w2v_classifier.save('benchmark_w2w/svc_skipgram')

0.8132613511229633

In [16]:
# Load from the same path the model was saved to above.
# NOTE(review): the return value is discarded, so load() presumably restores state in place - confirm.
svc_w2v_classifier.load('benchmark_w2w/svc_skipgram')

In [17]:
# Display the f1score attribute held by the classifier object
svc_w2v_classifier.f1score

0.8132613511229633