In [1]:
import os
import sys

# Root of the project checkout. The literal below is the original,
# machine-specific default; set RAKUTEN_PROJECT_DIR to run the notebook on
# another machine without editing this cell.
projectDir = os.environ.get(
    'RAKUTEN_PROJECT_DIR',
    '/mnt/c/Documents and Settings/justj/Documents/GitHub/RakutenTeam',
)

# Expose the project root to the import system. Guarded so that re-running
# this cell does not keep appending the same entry to sys.path.
if projectDir not in sys.path:
    sys.path.append(projectDir)

# Imported only after the sys.path update above
# (presumably `src` lives under projectDir -- confirm).
import src.config as config

# Fill in the shared config module with the locations used by this notebook.
config.path_to_project = projectDir
config.path_to_data = os.path.join(projectDir, 'data', 'clean')
# Image and model folders are independent of the project root; each one can
# be overridden through its own environment variable.
config.path_to_images = os.environ.get(
    'RAKUTEN_IMAGES_DIR',
    '/home/jul/DST/Rakuten/Data/images/image_train_resized',
)
config.path_to_models = os.environ.get(
    'RAKUTEN_MODELS_DIR',
    '/mnt/c/Documents and Settings/justj/Documents/DST/RakutenProject/models',
)


import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV

import tensorflow as tf

from Rakuten_preprocessing import Rakuten_img_path

from src.text.classifiers import MLClassifier
from src.utils.plot import classification_results

from sklearn.metrics import f1_score

2024-03-08 06:00:30.792266: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-08 06:00:30.827341: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-08 06:00:30.827391: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-08 06:00:30.828254: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-08 06:00:30.833625: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Read the train and test index files and tag every row with its origin.
train_csv = os.path.join(config.path_to_data, 'df_train_index.csv')
test_csv = os.path.join(config.path_to_data, 'df_test_index.csv')
data_train = pd.read_csv(train_csv).assign(testset=False)
data_test = pd.read_csv(test_csv).assign(testset=True)

# Stack both partitions in one frame; the 'testset' flag tells them apart.
# Note: each part keeps its own index labels, so labels may repeat.
data = pd.concat([data_train, data_test], axis=0)


def _join_text_fields(row):
    """Lower-case the string entries of `row` and join them with spaces.

    Non-string entries (e.g. missing values) are skipped.
    """
    return ' '.join(field.lower() for field in row if isinstance(field, str))


# Text columns merged into the 'tokens' column
# (alternative column set left by the author: ['designation', 'description']).
colnames = ['designation_translated', 'description_translated']
data['tokens'] = data[colnames].apply(_join_text_fields, axis=1)

# Image file path of every product goes into the 'img_path' column.
data['img_path'] = Rakuten_img_path(
    img_folder=config.path_to_images,
    imageid=data['imageid'],
    productid=data['productid'],
    suffix='_resized',
)

In [3]:
# Lookup table of class names: one row per 'prdtypedesignation', indexed by
# the first 'prdtypeindex' found for it and sorted by that index.
class_labels = (
    data.groupby('prdtypedesignation')['prdtypeindex']
    .first()
    .reset_index()
    .set_index('prdtypeindex')
    .sort_index()
)

## Creating train and test sets

In [4]:
# Boolean row selectors for the two partitions, taken from the 'testset' flag.
is_test = data['testset']
is_train = ~is_test

# Targets (encoded class index).
y_train = data.loc[is_train, 'prdtypeindex']
y_test = data.loc[is_test, 'prdtypeindex']

# Text inputs.
Txt_train = data.loc[is_train, 'tokens']
Txt_test = data.loc[is_test, 'tokens']

# Image inputs (file paths).
Img_train = data.loc[is_train, 'img_path']
Img_test = data.loc[is_test, 'img_path']

# Our sklearn-style classifiers expect X_train / X_test to be dataframes
# with the columns 'tokens' and 'img_path'.
X_train = pd.DataFrame({'tokens': Txt_train, 'img_path': Img_train})
X_test = pd.DataFrame({'tokens': Txt_test, 'img_path': Img_test})

# Train and test stacked together, used for cross-validated scores.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

# Number of distinct classes over the whole dataset.
num_classes = np.unique(data['prdtypeindex']).size

## Bag-of-words based benchmarks

In [5]:
# Baseline: dummy classifier on TF-IDF features.
dum_classifier = MLClassifier(
    base_name='dummyclassifier',
    vec_method='tfidf',
)
dum_classifier.fit(X_train, y_train)
# Last expression of the cell: score on the test rows is displayed.
dum_classifier.classification_score(X_test, y_test)

0.025807979055757255

In [26]:
# Logistic regression on TF-IDF features.
lr_classifier = MLClassifier(
    base_name='LogisticRegression',
    vec_method='tfidf',
    C=2,
    penalty='l2',
    max_iter=1000,
    dual=False,
)
lr_classifier.fit(X_train, y_train)
# Last expression of the cell: score on the test rows is displayed.
lr_classifier.classification_score(X_test, y_test)

0.8129309414789236

In [11]:
# Multinomial naive Bayes on TF-IDF features.
nb_classifier = MLClassifier(
    base_name='MultinomialNB',
    vec_method='tfidf',
    alpha=0.02,
    fit_prior=True,
)
nb_classifier.fit(X_train, y_train)
# Last expression of the cell: score on the test rows is displayed.
nb_classifier.classification_score(X_test, y_test)

0.771265410675905

In [32]:
# Random forest on TF-IDF features.
rf_classifier = MLClassifier(
    base_name='RandomForestClassifier',
    vec_method='tfidf',
    n_estimators=100,
    criterion='gini',
    max_depth=500,
)
rf_classifier.fit(X_train, y_train)
# Last expression of the cell: score on the test rows is displayed.
rf_classifier.classification_score(X_test, y_test)

0.7698481612513959

In [5]:
# XGBoost on TF-IDF features.
# Bound to its own name: this cell previously reused `rf_classifier`, which
# silently replaced the random-forest model fitted in the cell above.
xgb_classifier = MLClassifier(
    base_name='xgboost',
    vec_method='tfidf',
    n_estimators=200,
    objective='multi:softprob',
    max_depth=6,
    reg_alpha=0,
)
xgb_classifier.fit(X_train, y_train)
# Last expression of the cell: score on the test rows is displayed.
xgb_classifier.classification_score(X_test, y_test)

0.8193118067053706

In [5]:
# Linear SVC on TF-IDF features.
svc_classifier = MLClassifier(
    base_name='LinearSVC',
    vec_method='tfidf',
    C=1,
    penalty='l2',
    dual='auto',
)
svc_classifier.fit(X_train, y_train)

# Keep the test-set score: it used to be computed in the middle of the cell
# and discarded, so it was neither stored nor displayed.
svc_test_score = svc_classifier.classification_score(X_test, y_test)

# 10-fold cross-validation over the full dataset (train + test rows).
cv_scores = svc_classifier.cross_validate(X, y, cv=10)

# Last expression of the cell: show the test-set score, like the other
# benchmark cells do.
svc_test_score





















