### 1. Reading the datasets
Installing the libraries first.

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
! pip install -q -U numpy==1.19.5
! pip install -q -U matplotlib==3.3.4
! pip install -q -U mxnet==1.8.0
! pip install -q -U catboost==0.26.1
! pip install -q -U lightgbm==3.2.1

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m
[31mERROR: Could not find a version that satisfies the requirement mxnet==1.8.0 (from versions: 1.6.0, 1.7.0.post1, 1.7.0.post2, 1.8.0.post0)[0m
[31mERROR: No matching distribution found for

In [4]:
import pandas as pd

train_df = pd.read_csv("../final_project/final_project_dataset_training.csv")
test_df = pd.read_csv("../final_project/public_test_features.csv")

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

%matplotlib inline

### 2. Splitting into train and test

__SimpleImputer: Sklearn imputation transformer for completing missing values__

Examples:
* Numerical fields: `SimpleImputer(missing_values=nan, strategy='mean', fill_value=None)` # or strategy='median'
* Categorical fields: `SimpleImputer(missing_values=nan, strategy='most_frequent', fill_value=None)`
* Constant (for both): `SimpleImputer(missing_values=nan, strategy='constant', fill_value=None)` # example: for categorical-> fill_value='missing', for numerical -> fill_value=0

In [6]:
train_df.columns

Index(['ID', 'ASIN', 'marketplace', 'GL', 'category', 'subcategory', 'dept',
       'brand_code', 'brand', 'item_type_keyword', 'website_display_group',
       'product_type_description', 'binding', 'size', 'handling_group',
       'item_count', 'hazmat_classification', 'actual_process_class_ID'],
      dtype='object')

In [7]:
train_df.duplicated().sum() # there is no duplicate

0

In [8]:
columns_to_keep = ['marketplace', 'GL', 'category', 'subcategory', 'brand', 'website_display_group',
       'product_type_description', 'binding','handling_group', 'hazmat_classification']

# dropping columns that have too many missing values such as dept and size, item count
# dropping columns that are too similar to the existing features

In [9]:
test_df.columns

Index(['ID', 'ASIN', 'marketplace', 'GL', 'category', 'subcategory', 'dept',
       'brand_code', 'brand', 'item_type_keyword', 'website_display_group',
       'product_type_description', 'binding', 'size', 'handling_group',
       'item_count', 'hazmat_classification'],
      dtype='object')

In [10]:
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(train_df, test_size=0.1, shuffle=True, random_state=23)

# Print the shapes of the Train - Test Datasets
print('Train - Test Datasets shapes: ', train_data.shape, val_data.shape)

Train - Test Datasets shapes:  (363205, 18) (40357, 18)


In [11]:
train_X=train_data[columns_to_keep]
val_X=val_data[columns_to_keep]
train_Y=train_data['actual_process_class_ID']

In [12]:
train_data.count()/len(train_data)

ID                          1.000000
ASIN                        1.000000
marketplace                 1.000000
GL                          1.000000
category                    1.000000
subcategory                 1.000000
dept                        0.579370
brand_code                  0.851183
brand                       0.969885
item_type_keyword           0.480186
website_display_group       1.000000
product_type_description    1.000000
binding                     0.994774
size                        0.668141
handling_group              0.964067
item_count                  0.000000
hazmat_classification       0.012599
actual_process_class_ID     1.000000
dtype: float64

### 3. Data Preprocessing and Defining classifiers

Pipeline is an easy way to apply preprocessing on multiple fields.

In [25]:
from sklearn.utils import resample
from lightgbm import LGBMClassifier

In [26]:
most_frequent_cols = ['brand', 'binding', 'handling_group']
constant_cols = ['hazmat_classification']

In [27]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# apply different preprocessing and feature extraction pipelines to different subsets of features, 
# using ColumnTransformer.
most_frequent_transformer = Pipeline(steps=[
       ('cat_imputer', SimpleImputer(strategy='most_frequent'))
])

constant_transformer = Pipeline(steps=[
       ('cat_imputer', SimpleImputer(strategy='constant', fill_value='missing'))
])

preprocessor = ColumnTransformer(
   transformers=[
    ('most_frequent', most_frequent_transformer, most_frequent_cols)
   ,('constant', constant_transformer, constant_cols)
]) 

In [28]:
from sklearn.preprocessing import FunctionTransformer

def to_lowercase(x):
    return np.char.lower(x.astype(str))

lowercase_transformer = FunctionTransformer(to_lowercase, validate=False)

In [29]:
train_Y = train_df["actual_process_class_ID"].values
train_X = train_df.drop(columns=["actual_process_class_ID", "ASIN"])

for c in train_X.columns:
    train_X[c] = train_X[c].astype("category")

In [30]:
# encoding target column values into 0 to 27 since Y has 28 unique values

y_label_to_id = dict()
y_id_to_label = dict()
y_mod = list()
idx = 0
for elem in train_Y.tolist():
    if elem not in y_label_to_id:
        y_label_to_id[elem] = idx
        y_id_to_label[idx] = elem
        idx += 1
    y_mod.append(y_label_to_id[elem])

In [20]:
# params1 = {
#     "boosting_type": "gbdt",
#     "objective": "multiclass",  # ‘regression’ for LGBMRegressor, ‘binary’ or ‘multiclass’ for LGBMClassifier
#     "num_class": 28,
#     "metric": "multi_logloss",
#     'n_estimators': 100, # We can change it, by default 100
#     "learning_rate": 0.5,  # Default 0.1
#     "num_iterations": 500,  # Default 100
#     "is_unbalance": True,  # Used to fix the class imbalance in the dataset
#     "verbose": 100,
#     "force_col_wise": True
# }

# lgb_train = lgb.Dataset(train_X, y_mod)

In [32]:
# Preprocess the categorical features and add decision tree to the pipeline
pipeline_light_gbm = Pipeline([('preprocessor', preprocessor),
                                   ('cat_lowercase_transformer', lowercase_transformer),
                                   ('cat_encoder', OneHotEncoder(handle_unknown='ignore')),
                                   ('lightgbm', LGBMClassifier(n_jobs=-1))])

pipeline_light_gbm.fit(train_X,
                       y_mod)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('most_frequent',
                                                  Pipeline(steps=[('cat_imputer',
                                                                   SimpleImputer(strategy='most_frequent'))]),
                                                  ['brand', 'binding',
                                                   'handling_group']),
                                                 ('constant',
                                                  Pipeline(steps=[('cat_imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant'))]),
                                                  ['hazmat_classification'])])),
                ('cat_lowercase_transformer',
                 FunctionTransformer(func=<function to_lowercase at 0x16069fdc0>)),
 

In [34]:
from sklearn import set_config
set_config(display='diagram')
pipeline_light_gbm

In [37]:
val_Y = val_data["actual_process_class_ID"].values
val_X = val_data.drop(columns=["actual_process_class_ID", "ASIN"])

for c in val_X.columns:
    val_X[c] = val_X[c].astype("category")

In [39]:
# encoding target column values into 0 to 27 since Y has 28 unique values

y_label_to_id = dict()
y_id_to_label = dict()
val_y_mod = list()
idx = 0
for elem in val_Y.tolist():
    if elem not in y_label_to_id:
        y_label_to_id[elem] = idx
        y_id_to_label[idx] = elem
        idx += 1
    val_y_mod.append(y_label_to_id[elem])

In [54]:
# Accuracy Report

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

val_predictions = pipeline_light_gbm.predict(val_X)

# generate confusion matrix as a dataframe
unique_labels = sorted(set(val_y_mod))
confusion_matrix = confusion_matrix(val_y_mod, val_predictions)
support = confusion_matrix.sum(axis=1)
confusion_matrix_prob = confusion_matrix / np.sum(confusion_matrix, axis=1, keepdims=True)  # convert them to probability distributions

confusion_matrix_df = pd.DataFrame(confusion_matrix_prob, columns=[str(label) + '_pred' for label in unique_labels], index=[str(label) + '_true' for label in unique_labels])
confusion_matrix_df['support'] = support
confusion_matrix_df.to_csv('confusion_matrix.csv', index=True)

print('Model performance on validation set:')
print(classification_report(val_y_mod, val_predictions))
val_accuracy = accuracy_score(val_y_mod, val_predictions)
print("Validation accuracy:", val_accuracy)

Model performance on validation set:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4799
           1       0.08      0.00      0.00     13452
           2       0.00      0.00      0.00      1563
           3       0.00      0.00      0.00       673
           4       0.00      0.00      0.00      2760
           5       0.00      0.00      0.00      2100
           6       0.00      0.00      0.00      2046
           7       0.00      0.00      0.00       279
           8       0.03      0.00      0.00       902
           9       0.00      0.00      0.00      3410
          10       0.00      0.00      0.00      1900
          11       0.00      0.00      0.00       685
          12       0.00      0.00      0.00       125
          13       0.00      0.00      0.00      2174
          14       0.00      0.00      0.00      1005
          15       0.00      0.00      0.00       649
          16       0.01      1.00      0.01 

### 6. Get test predictions and write to CSV for submission

In [57]:
# call pipeline.predict() on your test dataset
test_predictions_light_gbm=pipeline_light_gbm.predict(val_X)

In [58]:
import pandas as pd

result_df = pd.DataFrame(columns=["ID",
                                  "actual_process_class_ID"])
result_df["ID"] = val_data["ID"].tolist()
result_df["actual_process_class_ID"] = test_predictions_light_gbm

result_df.to_csv("project_result.csv", index=False)