# Table of Content

1. [Prepare to Modelling](#scrollTo=1PtkBaUlUeV8)
2. [Regression Models](#scrollTo=pU-qpHcD47wi)
3. [Classification Models](#scrollTo=POEygRgxIuQ1)
4. [Mixed Model](#scrollTo=YTyaHhIVQ3iK)
5. [Ordinal Regression](#scrollTo=JdLTUUfQeRG0)

# Imports

In [None]:
#!pip install mord
#!pip install session-info

In [3]:
import pandas as pd
import numpy as np
import warnings

# Visualizations
warnings.filterwarnings('ignore')
import seaborn as sns 
import matplotlib.pyplot as plt
pd.set_option('display.float_format', lambda x: '%.5f' % x) # To surpass the scientific notation

# Scikit learn packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler # Scale between 0 and 1
from sklearn.preprocessing import LabelBinarizer
import mord #[https://pythonhosted.org/mord/reference.html#mord.OrdinalRidge]

#import metrics functions
from sklearn import metrics
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error 
from imblearn.metrics import macro_averaged_mean_absolute_error
from imblearn.metrics import macro_averaged_mean_absolute_error #Compute Macro-Averaged MAE for imbalanced ordinal classification.
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, confusion_matrix

# Hypermeter tuning packages
from sklearn.model_selection import RandomizedSearchCV

# See requirements
import session_info

# Uploads

In [4]:
# Main df
df = pd.read_csv('df_after_eda.csv', index_col=0)
# BERT VECTORS
df_desc_bert = pd.read_csv('description_bert_vector.csv', index_col = 0).add_prefix('d_')
df_info_bert = pd.read_csv('information_bert_vector.csv', index_col = 0).add_prefix('i_')
df_ques_bert = pd.read_csv('question_bert_vector.csv', index_col = 0).add_prefix('q_')
df_ans_bert = pd.read_csv('answer_bert_vector.csv', index_col = 0).add_prefix('a_')

In [5]:
print(df.shape)
print(df_desc_bert.shape)
print(df_info_bert.shape)
print(df_ques_bert.shape)
print(df_ans_bert.shape)

(10000, 142)
(10000, 768)
(10000, 768)
(10000, 768)
(10000, 768)


In [6]:
# All vectors
df_with_vec = pd.concat([df_desc_bert, df_info_bert, df_ques_bert ,df_ans_bert], axis=1)
# Description and product Information
df_with_id = pd.concat([df_desc_bert, df_info_bert], axis=1)
# Customer question and answers
df_with_qa = pd.concat([df_ques_bert, df_ans_bert], axis=1)

# 1. Prepare to Modeling

**Note:** In this section, we prepare the data for sklearn models. 

## 1.1 *New Features*

In [7]:
# Normalize the best seller rank -LABEL- 
df['best_seller_rank_0_1'] = df['best_seller_rank'].apply(lambda x: x - np.min(df['best_seller_rank'])) / (np.max(df['best_seller_rank']) - np.min(df['best_seller_rank']))
# Check the values
print(f"Min: {df['best_seller_rank_0_1'].min()}")
print(f"Max: {df['best_seller_rank_0_1'].max()}")

Min: 0.0
Max: 1.0


In [8]:
# Rename Buckets which are in data
df = df.rename(columns={"bsr_bucket": "bucket_30_bsr", # each bucket have 50K BSR
                        "bucket":"bucket_rev_6" #Buckets are created cumulative sum of opportunity
                        })
# Create new buckets 

# 150K BSR
df['bucket_10_bsr']  = df['bucket_30_bsr'].replace({ 1.00000: 1, 2.00000:1, 3.00000: 1, 4.00000:2, 5.00000:2, 6.00000: 2, 7.00000:3, 8.00000:3, 9.00000: 3,
                                                     10.00000:4, 11.00000:4, 12.00000: 4, 13.00000:5, 14.00000:5, 15.00000: 5, 16.00000:6, 17.00000:6, 18.00000: 6,
                                                     19.00000:7, 20.00000:7, 21.00000: 7,  22.00000:8, 23.00000:8, 24.00000: 8,  25.00000:9, 26.00000:9, 27.00000: 9,
                                                     28.00000:10, 29.00000:10, 30.00000: 10, 34.00000:10, 46.00000:10})
# 300K BSR
df['bucket_5_bsr']  = df['bucket_10_bsr'].replace({ 1.00000: 1, 2.00000:1, 3.00000: 2, 4.00000:2, 5.00000:3, 6.00000: 3, 7.00000:4, 8.00000:4, 9.00000: 5,
                                                     10.00000:5})

# Balanced buckets
df['bucket_5_frq'] = pd.qcut(df['best_seller_rank'], 5, labels=["1", "2", "3","4", "5"])
df['bucket_10_frq'] = pd.qcut(df['best_seller_rank'], 10, labels=["1", "2", "3","4", "5", "6", '7', '8', '9', '10'])

In [9]:
# Some tehcnical details includes various values. Such as each product have different dimensions.
# Create 0-1 from technical details from these features. (1: Yes, technical detail, 0: No, technical detail.)
tech_list = ['weight', 'dimension', 'recom_age', 'batt_requi', 'batt_inc', '#_puzzle', '#_players',
             'color', 'material_type', 'ship_weight', 'deli_dest', 'first_del_date']
 
for col in tech_list:
  df[col] = np.where(df[col].isna(), 0, 1)

# Boolean feature
df['batteries'] = df['batteries'].astype(int)
# Assumption: Mission value means there is no number in description.
df['desc_#_numb'] = df['desc_#_numb'].fillna(0)

## 1.2 *Choose related features*

In [10]:
# Choose the column that can be used in modeling
model_features = [# Outputs
                   'best_seller_rank' , 'best_seller_rank_0_1', "bucket_30_bsr", "bucket_10_bsr", "bucket_5_bsr", "bucket_rev_6", 'bucket_5_frq','bucket_10_frq',
                   # Raw features
                  'price', 'number_of_reviews', 'number_of_answered_questions', 'average_review_rating', 'number_in_stock','type_of_stock', 
                  '#_items_bought', '#_items_bought_view', 
                  # Technical details from product information
                  '#_tech_details', 'weight', 'dimension', 'recom_age', 'batt_requi', 'batt_inc', 'batteries', '#_puzzle', '#_players',
                  'color', 'material_type', 'ship_weight', 'deli_dest', 'first_del_date',## GET RID OF
                  # Question text
                  'customer_questions_and_answers',
                  # Textual data related
                  'desc_len','info_len', 'ave_length_question', 'ave_length_answers',
                  'desc_#_numb', 'info_#_numb','que_#_numb', 'ans_#_numb', 
                  # Bert Similarities
                  'desc_info', 'desc_ques', 'desc_ans', 'info_ques', 'info_ans', 'ques_ans',
                  # Comptetitors data
                  'competitors_count', 
                  # Categories
                  'category_eda',
                  # Topics
                  'Size_q_topic', 'Sound_q_topic', 'Outliers_q_topic', 'Track-Train_q_topic','Material_q_topic', 'Shipping_q_topic', 
                  'Color_q_topic', 'Castle_q_topic', 'DIY_q_topic', 'Price_q_topic', 'Age_q_topic','Players_q_topic', 'Cards_q_topic', 
                  'Kinetic-Sand_q_topic', 'Pool_q_topic', 'Door_q_topic', 'Delivery_q_topic', 'Battery_q_topic', 'Baloon_q_topic', 'Kite_q_topic']
print(f'The total number of features in model {len(model_features)}.') 

# Create a new dataframe for modelling
df_model = df[model_features]

The total number of features in model 67.


## 1.3 *Missing Values*

In [11]:
# See the features which have missing values
df_model.isna().sum().sort_values()

ave_length_question                  0
desc_#_numb                          0
info_#_numb                          0
que_#_numb                           0
ans_#_numb                           0
                                  ... 
bucket_10_bsr                      134
bucket_30_bsr                      134
best_seller_rank_0_1               134
best_seller_rank                   134
customer_questions_and_answers    9086
Length: 67, dtype: int64

## 1.4 *Encoding*

The category and stock type feature are categorical. Let's encode them.

In [12]:
# Encode categorical variables 
# Get one hot encoding of columns 'category_eda' & 'type_of_stock'
one_hot = pd.get_dummies(data=df_model['category_eda'])
one_hot_2 = pd.get_dummies(data=df_model['type_of_stock'])
# Drop column as it is now encoded
df_model = df_model.drop(['type_of_stock','category_eda'],axis = 1)
# Join the encoded df
df_model = df_model.join(one_hot).join(one_hot_2)
# Check
df_model.shape

(10000, 90)

## 1.5 *Drop Missing Best Seller Rank*

In [13]:
# Drop the rows that have missing output

# Keep the original data with 10K rows
df_model_2 = df_model.copy()

# Drop the rows that have missing label
df_model = df_model.dropna(subset=['best_seller_rank'])
df_model.shape

(9866, 90)

# 2. Regression Models - XGBoost Regressor

In [14]:
# Generate a dataframe to store the metrics
results = pd.DataFrame(columns=['Model','Label','#_Row', '#_Features' , 'MAE', 'MSE', 'R2', 'Note'])

### 2.1 Predict Best Sellers Rank

In [17]:
# define a columns list which must be removed from X
drop_list = ['best_seller_rank', 'best_seller_rank_0_1', "bucket_30_bsr", "bucket_10_bsr", "bucket_5_bsr", "bucket_rev_6", 'bucket_5_frq','bucket_10_frq','customer_questions_and_answers']
# define X and y
y = df_model['best_seller_rank']
X = df_model.drop(columns = drop_list)
print(f"y shape: {y.shape},X shape: {X.shape} ")
print()
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Check the shapes of all features
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train:{y_train.shape}, y_test:, {y_test.shape}')

y shape: (9866,),X shape: (9866, 81) 

X_train: (7892, 81), X_test: (1974, 81), y_train:(7892,), y_test:, (1974,)


In [18]:
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)
pred = model_xgb.predict(X_test)

  # Append metrics to dataframe
results.loc[len(results.index)] = ['XGBoost Regressor', 'best_seller_rank', X_train.shape[0]+X_test.shape[0],  X_train.shape[1],
                                    metrics.mean_absolute_error(y_test, pred), 
                                    metrics.mean_squared_error(y_test, pred), 
                                    metrics.r2_score(y_test, pred),
                                    'All features and rows included'] 
results                               



Unnamed: 0,Model,Label,#_Row,#_Features,MAE,MSE,R2,Note
0,XGBoost Regressor,best_seller_rank,9866,81,103113.70628,22887464114.92366,0.65554,All features and rows included


### 2.2 Predict Best Sellers Rank Normalized

In [19]:
# define X and y
y = df_model['best_seller_rank_0_1']
X = df_model.drop(columns = drop_list)
print(f"y shape: {y.shape},X shape: {X.shape} ")
print()
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Check the shapes of all features
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train:{y_train.shape}, y_test:, {y_test.shape}')

y shape: (9866,),X shape: (9866, 81) 

X_train: (7892, 81), X_test: (1974, 81), y_train:(7892,), y_test:, (1974,)


In [20]:
model_xgb = XGBRegressor()
# Use function with scaled data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Fit and predict
model_xgb.fit(X_train, y_train)
pred = model_xgb.predict(X_test)

  # Append metrics to dataframe
results.loc[len(results.index)] = ['XGBoost Regressor', 'best_seller_rank_0_1', X_train.shape[0]+X_test.shape[0],  X_train.shape[1],
                                    metrics.mean_absolute_error(y_test, pred), 
                                    metrics.mean_squared_error(y_test, pred), 
                                    metrics.r2_score(y_test, pred),
                                    'All features and rows included'] 
results                                   



Unnamed: 0,Model,Label,#_Row,#_Features,MAE,MSE,R2,Note
0,XGBoost Regressor,best_seller_rank,9866,81,103113.70628,22887464114.92366,0.65554,All features and rows included
1,XGBoost Regressor,best_seller_rank_0_1,9866,81,0.04581,0.0045,0.6551,All features and rows included


### 2.3 Predict Best Sellers Rank - 1653 Products



**Note:** One of the limitation of data is about 90% of 'customer_questions_and_answers' text is missing. This can effect the model. Let's inititate the model with filtered data.

In [21]:
# Choose rows whhich have question text or zero answered questions
df_model_1653 = df_model[(df_model['customer_questions_and_answers'].isna() == False) | (df_model['number_of_answered_questions'] == 0)]
df_model_1653.shape

(1653, 90)

In [22]:
# define X and y
y = df_model_1653['best_seller_rank']
X = df_model_1653.drop(columns = drop_list)
print(f"y shape: {y.shape},X shape: {X.shape} ")
print()
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Check the shapes of all features
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train:{y_train.shape}, y_test:, {y_test.shape}')

y shape: (1653,),X shape: (1653, 81) 

X_train: (1322, 81), X_test: (331, 81), y_train:(1322,), y_test:, (331,)


In [26]:
model_xgb = XGBRegressor()
# Use function with scaled data
'''scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)'''
# Fit and predict
model_xgb.fit(X_train, y_train)
pred = model_xgb.predict(X_test)

  # Append metrics to dataframe
results.loc[len(results.index)] = ['XGBoost Regressor', 'best_seller_rank_0_1', X_train.shape[0]+X_test.shape[0],  X_train.shape[1],
                                    metrics.mean_absolute_error(y_test, pred), 
                                    metrics.mean_squared_error(y_test, pred), 
                                    metrics.r2_score(y_test, pred),
                                    'Q&A text or #_questions is 0. 1653 row.']   
results                                     



Unnamed: 0,Model,Label,#_Row,#_Features,MAE,MSE,R2,Note
0,XGBoost Regressor,best_seller_rank,9866,81,103113.70628,22887464114.92366,0.65554,All features and rows included
1,XGBoost Regressor,best_seller_rank_0_1,9866,81,0.04581,0.0045,0.6551,All features and rows included
2,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
3,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.


### 2.4 Predict Best Sellers Rank Normalized - 1653 Products

In [27]:
# define X and y
y = df_model_1653['best_seller_rank_0_1']
X = df_model_1653.drop(columns = drop_list)
print(f"y shape: {y.shape},X shape: {X.shape} ")
print()
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Check the shapes of all features
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train:{y_train.shape}, y_test:, {y_test.shape}')

y shape: (1653,),X shape: (1653, 81) 

X_train: (1322, 81), X_test: (331, 81), y_train:(1322,), y_test:, (331,)


In [28]:
model_xgb = XGBRegressor()
# Use function with scaled data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Fit and predict
model_xgb.fit(X_train, y_train)
pred = model_xgb.predict(X_test)

  # Append metrics to dataframe
results.loc[len(results.index)] = ['XGBoost Regressor', 'best_seller_rank_0_1', X_train.shape[0]+X_test.shape[0],  X_train.shape[1],
                                    metrics.mean_absolute_error(y_test, pred), 
                                    metrics.mean_squared_error(y_test, pred), 
                                    metrics.r2_score(y_test, pred),
                                    'Q&A text or #_questions is 0. 1653 row']      



### 2.5 Predict Best Seller Rank - Question related features - 1653 Products

In [30]:
# Some features are directly related to customer questions. Split features as:
# a. Question Related
# b. Non-Question Related
non_question_features = ['price', 'number_of_reviews', 'average_review_rating',
                        'number_in_stock', '#_items_bought', '#_items_bought_view', 'Advent Calendars', 'Arts & Crafts', 'Baby & Toddler Toys',
                        'Building & Construction Toys', 'Characters & Brands',
                        'Die-Cast & Toy Vehicles', 'Dolls & Accessories', 'Fancy Dress',
                        'Figures & Playsets', 'Games', 'Hobbies', 'Jigsaws & Puzzles',
                        'Musical Toy Instruments', 'Novelty & Special Use', 'Other',
                        'Party Supplies', 'Pretend Play', 'Puppets & Puppet Theatres',
                        'Soft Toys', 'Sports Toys & Outdoor', 'Collectible', 'New', 'No stock',
                        'Refurbished', 'Used']
print(len(non_question_features))

31


In [31]:
y = df_model_1653['best_seller_rank_0_1']
X = df_model_1653.drop(columns = drop_list).drop(columns = non_question_features)
print(f"y shape: {y.shape},X shape: {X.shape} ")
print()
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Check the shapes of all features
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train:{y_train.shape}, y_test:, {y_test.shape}')

y shape: (1653,),X shape: (1653, 50) 

X_train: (1322, 50), X_test: (331, 50), y_train:(1322,), y_test:, (331,)


In [32]:
# create an xgboost regression model
model_xgb = XGBRegressor()

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model_xgb.fit(X_train_scaled, y_train)
pred = model_xgb.predict(X_test_scaled)

# Append metrics to dataframe
results.loc[len(results.index)] = ['XGBoost Regressor',' best_seller_rank_0_1', X_train.shape[0]+X_test.shape[0],  X_train.shape[1],
                                   metrics.mean_absolute_error(y_test, pred), 
                                   metrics.mean_squared_error(y_test, pred), 
                                   metrics.r2_score(y_test, pred),
                                   'Question related features.1653 rows'] 

results



Unnamed: 0,Model,Label,#_Row,#_Features,MAE,MSE,R2,Note
0,XGBoost Regressor,best_seller_rank,9866,81,103113.70628,22887464114.92366,0.65554,All features and rows included
1,XGBoost Regressor,best_seller_rank_0_1,9866,81,0.04581,0.0045,0.6551,All features and rows included
2,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
3,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
4,XGBoost Regressor,best_seller_rank_0_1,1653,81,0.0341,0.00297,0.70954,Q&A text or #_questions is 0. 1653 row
5,XGBoost Regressor,best_seller_rank_0_1,1653,50,0.05287,0.00674,0.3407,Question related features.1653 rows


###  2.6 Predict Best Seller Rank - Question non-related features - 1653 Products

In [33]:
y = df_model_1653['best_seller_rank_0_1']
X = df_model_1653[non_question_features]
print(f"y shape: {y.shape},X shape: {X.shape} ")
print()
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Check the shapes of all features
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train:{y_train.shape}, y_test:, {y_test.shape}')

y shape: (1653,),X shape: (1653, 31) 

X_train: (1322, 31), X_test: (331, 31), y_train:(1322,), y_test:, (331,)


In [34]:
# create an xgboost regression model
model_xgb = XGBRegressor()

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model_xgb.fit(X_train_scaled, y_train)
pred = model_xgb.predict(X_test_scaled)

# Append metrics to dataframe
results.loc[len(results.index)] = ['XGBoost Regressor',' best_seller_rank_0_1', X_train.shape[0]+X_test.shape[0],  X_train.shape[1],
                                   metrics.mean_absolute_error(y_test, pred), 
                                   metrics.mean_squared_error(y_test, pred), 
                                   metrics.r2_score(y_test, pred),
                                   'Non-Question related features-1653 row'] 

results



Unnamed: 0,Model,Label,#_Row,#_Features,MAE,MSE,R2,Note
0,XGBoost Regressor,best_seller_rank,9866,81,103113.70628,22887464114.92366,0.65554,All features and rows included
1,XGBoost Regressor,best_seller_rank_0_1,9866,81,0.04581,0.0045,0.6551,All features and rows included
2,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
3,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
4,XGBoost Regressor,best_seller_rank_0_1,1653,81,0.0341,0.00297,0.70954,Q&A text or #_questions is 0. 1653 row
5,XGBoost Regressor,best_seller_rank_0_1,1653,50,0.05287,0.00674,0.3407,Question related features.1653 rows
6,XGBoost Regressor,best_seller_rank_0_1,1653,31,0.03522,0.00309,0.69769,Non-Question related features-1653 row


### 2.7 Predict Best Sellers Rank - Question Related Features

In [35]:
y = df_model['best_seller_rank_0_1']
X = df_model.drop(columns = drop_list).drop(columns = non_question_features)
print(f"y shape: {y.shape},X shape: {X.shape} ")
print()
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Check the shapes of all features
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train:{y_train.shape}, y_test:, {y_test.shape}')

y shape: (9866,),X shape: (9866, 50) 

X_train: (7892, 50), X_test: (1974, 50), y_train:(7892,), y_test:, (1974,)


In [36]:
# create an xgboost regression model
model_xgb = XGBRegressor()

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model_xgb.fit(X_train_scaled, y_train)
pred = model_xgb.predict(X_test_scaled)

# Append metrics to dataframe
results.loc[len(results.index)] = ['XGBoost Regressor',' best_seller_rank_0_1', X_train.shape[0]+X_test.shape[0],  X_train.shape[1],
                                   metrics.mean_absolute_error(y_test, pred), 
                                   metrics.mean_squared_error(y_test, pred), 
                                   metrics.r2_score(y_test, pred),
                                   'Question related features-All Rows'] 

results



Unnamed: 0,Model,Label,#_Row,#_Features,MAE,MSE,R2,Note
0,XGBoost Regressor,best_seller_rank,9866,81,103113.70628,22887464114.92366,0.65554,All features and rows included
1,XGBoost Regressor,best_seller_rank_0_1,9866,81,0.04581,0.0045,0.6551,All features and rows included
2,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
3,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
4,XGBoost Regressor,best_seller_rank_0_1,1653,81,0.0341,0.00297,0.70954,Q&A text or #_questions is 0. 1653 row
5,XGBoost Regressor,best_seller_rank_0_1,1653,50,0.05287,0.00674,0.3407,Question related features.1653 rows
6,XGBoost Regressor,best_seller_rank_0_1,1653,31,0.03522,0.00309,0.69769,Non-Question related features-1653 row
7,XGBoost Regressor,best_seller_rank_0_1,9866,50,0.076,0.01049,0.19515,Question related features-All Rows


### 2.8 Predict Best Sellers Rank - Non-Question Related Features

In [37]:
y = df_model['best_seller_rank_0_1']
X = df_model[non_question_features]
print(f"y shape: {y.shape},X shape: {X.shape} ")
print()
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Check the shapes of all features
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train:{y_train.shape}, y_test:, {y_test.shape}')

y shape: (9866,),X shape: (9866, 31) 

X_train: (7892, 31), X_test: (1974, 31), y_train:(7892,), y_test:, (1974,)


In [38]:
# create an xgboost regression model
model_xgb = XGBRegressor()

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model_xgb.fit(X_train_scaled, y_train)
pred = model_xgb.predict(X_test_scaled)

# Append metrics to dataframe
results.loc[len(results.index)] = ['XGBoost Regressor',' best_seller_rank_0_1', X_train.shape[0]+X_test.shape[0],  X_train.shape[1],
                                   metrics.mean_absolute_error(y_test, pred), 
                                   metrics.mean_squared_error(y_test, pred), 
                                   metrics.r2_score(y_test, pred),
                                   'Non-Question related features-All Rows'] 
results                 



Unnamed: 0,Model,Label,#_Row,#_Features,MAE,MSE,R2,Note
0,XGBoost Regressor,best_seller_rank,9866,81,103113.70628,22887464114.92366,0.65554,All features and rows included
1,XGBoost Regressor,best_seller_rank_0_1,9866,81,0.04581,0.0045,0.6551,All features and rows included
2,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
3,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
4,XGBoost Regressor,best_seller_rank_0_1,1653,81,0.0341,0.00297,0.70954,Q&A text or #_questions is 0. 1653 row
5,XGBoost Regressor,best_seller_rank_0_1,1653,50,0.05287,0.00674,0.3407,Question related features.1653 rows
6,XGBoost Regressor,best_seller_rank_0_1,1653,31,0.03522,0.00309,0.69769,Non-Question related features-1653 row
7,XGBoost Regressor,best_seller_rank_0_1,9866,50,0.076,0.01049,0.19515,Question related features-All Rows
8,XGBoost Regressor,best_seller_rank_0_1,9866,31,0.04652,0.00468,0.64085,Non-Question related features-All Rows


### 2.9 Predict Best Seller Normalized - All Rows - Without Outliers

In [39]:
# Two products' BSR are more than 1.5M. Let's exclude them.
df_model_2 = df_model[df_model['best_seller_rank']<1500001.00000]

In [40]:
y = df_model_2['best_seller_rank_0_1']
X = df_model_2.drop(columns = drop_list)
print(f"y shape: {y.shape},X shape: {X.shape} ")
print()
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Check the shapes of all features
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train:{y_train.shape}, y_test:, {y_test.shape}')

y shape: (9864,),X shape: (9864, 81) 

X_train: (7891, 81), X_test: (1973, 81), y_train:(7891,), y_test:, (1973,)


In [41]:
# create an xgboost regression model
model_xgb = XGBRegressor()

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model_xgb.fit(X_train_scaled, y_train)
pred = model_xgb.predict(X_test_scaled)

# Append metrics to dataframe
results.loc[len(results.index)] = ['XGBoost Regressor',' best_seller_rank_0_1', X_train.shape[0]+X_test.shape[0],  X_train.shape[1],
                                   metrics.mean_absolute_error(y_test, pred), 
                                   metrics.mean_squared_error(y_test, pred), 
                                   metrics.r2_score(y_test, pred),
                                   'Outlier removed. All features.'] 

results



Unnamed: 0,Model,Label,#_Row,#_Features,MAE,MSE,R2,Note
0,XGBoost Regressor,best_seller_rank,9866,81,103113.70628,22887464114.92366,0.65554,All features and rows included
1,XGBoost Regressor,best_seller_rank_0_1,9866,81,0.04581,0.0045,0.6551,All features and rows included
2,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
3,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
4,XGBoost Regressor,best_seller_rank_0_1,1653,81,0.0341,0.00297,0.70954,Q&A text or #_questions is 0. 1653 row
5,XGBoost Regressor,best_seller_rank_0_1,1653,50,0.05287,0.00674,0.3407,Question related features.1653 rows
6,XGBoost Regressor,best_seller_rank_0_1,1653,31,0.03522,0.00309,0.69769,Non-Question related features-1653 row
7,XGBoost Regressor,best_seller_rank_0_1,9866,50,0.076,0.01049,0.19515,Question related features-All Rows
8,XGBoost Regressor,best_seller_rank_0_1,9866,31,0.04652,0.00468,0.64085,Non-Question related features-All Rows
9,XGBoost Regressor,best_seller_rank_0_1,9864,81,0.04512,0.00435,0.66462,Outlier removed. All features.


### 2.10 XGBoost Regressor - With BERT VECTORS

In [42]:
y = df_model_1653['best_seller_rank']
X = pd.merge(df_model_1653, df_with_vec, left_index=True, right_index=True).drop(columns = drop_list)
print(f"y shape: {y.shape},X shape: {X.shape} ")
print()
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Check the shapes of all features
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train:{y_train.shape}, y_test:, {y_test.shape}')

y shape: (1653,),X shape: (1653, 3153) 

X_train: (1322, 3153), X_test: (331, 3153), y_train:(1322,), y_test:, (331,)


In [43]:
# create an xgboost regression model
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)
pred = model_xgb.predict(X_test)

# Append metrics to dataframe
results.loc[len(results.index)] = ['XGBoost Regressor','best_seller_rank', X_train.shape[0]+X_test.shape[0],  X_train.shape[1],
                                   metrics.mean_absolute_error(y_test, pred), 
                                   metrics.mean_squared_error(y_test, pred), 
                                   metrics.r2_score(y_test, pred),
                                   'Vectors are added. 1653 rows.'] 

results



Unnamed: 0,Model,Label,#_Row,#_Features,MAE,MSE,R2,Note
0,XGBoost Regressor,best_seller_rank,9866,81,103113.70628,22887464114.92366,0.65554,All features and rows included
1,XGBoost Regressor,best_seller_rank_0_1,9866,81,0.04581,0.0045,0.6551,All features and rows included
2,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
3,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
4,XGBoost Regressor,best_seller_rank_0_1,1653,81,0.0341,0.00297,0.70954,Q&A text or #_questions is 0. 1653 row
5,XGBoost Regressor,best_seller_rank_0_1,1653,50,0.05287,0.00674,0.3407,Question related features.1653 rows
6,XGBoost Regressor,best_seller_rank_0_1,1653,31,0.03522,0.00309,0.69769,Non-Question related features-1653 row
7,XGBoost Regressor,best_seller_rank_0_1,9866,50,0.076,0.01049,0.19515,Question related features-All Rows
8,XGBoost Regressor,best_seller_rank_0_1,9866,31,0.04652,0.00468,0.64085,Non-Question related features-All Rows
9,XGBoost Regressor,best_seller_rank_0_1,9864,81,0.04512,0.00435,0.66462,Outlier removed. All features.


### 2.11 XGBoost Regressor - With BERT VECTORS - Normalized

In [44]:
y = df_model_1653['best_seller_rank_0_1']
X = pd.merge(df_model_1653, df_with_vec, left_index=True, right_index=True).drop(columns = drop_list)
print(f"y shape: {y.shape},X shape: {X.shape} ")
print()
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Check the shapes of all features
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train:{y_train.shape}, y_test:, {y_test.shape}')

y shape: (1653,),X shape: (1653, 3153) 

X_train: (1322, 3153), X_test: (331, 3153), y_train:(1322,), y_test:, (331,)


In [45]:
# create an xgboost regression model
model_xgb = XGBRegressor()

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model_xgb.fit(X_train_scaled, y_train)
pred = model_xgb.predict(X_test_scaled)

# Append metrics to dataframe
results.loc[len(results.index)] = ['XGBoost Regressor',' best_seller_rank_0_1', X_train.shape[0]+X_test.shape[0],  X_train.shape[1],
                                   metrics.mean_absolute_error(y_test, pred), 
                                   metrics.mean_squared_error(y_test, pred), 
                                   metrics.r2_score(y_test, pred),
                                    'Vectors are added. 1653 rows.Normalized'] 

results



Unnamed: 0,Model,Label,#_Row,#_Features,MAE,MSE,R2,Note
0,XGBoost Regressor,best_seller_rank,9866,81,103113.70628,22887464114.92366,0.65554,All features and rows included
1,XGBoost Regressor,best_seller_rank_0_1,9866,81,0.04581,0.0045,0.6551,All features and rows included
2,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
3,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
4,XGBoost Regressor,best_seller_rank_0_1,1653,81,0.0341,0.00297,0.70954,Q&A text or #_questions is 0. 1653 row
5,XGBoost Regressor,best_seller_rank_0_1,1653,50,0.05287,0.00674,0.3407,Question related features.1653 rows
6,XGBoost Regressor,best_seller_rank_0_1,1653,31,0.03522,0.00309,0.69769,Non-Question related features-1653 row
7,XGBoost Regressor,best_seller_rank_0_1,9866,50,0.076,0.01049,0.19515,Question related features-All Rows
8,XGBoost Regressor,best_seller_rank_0_1,9866,31,0.04652,0.00468,0.64085,Non-Question related features-All Rows
9,XGBoost Regressor,best_seller_rank_0_1,9864,81,0.04512,0.00435,0.66462,Outlier removed. All features.


### 2.12 Just BERT VECTORS (Description and Information Embeddings)

In [46]:
y = df_model['best_seller_rank_0_1']
X = pd.merge(df_model, df_with_id, left_index=True, right_index=True).drop(columns = drop_list)
print(f"y shape: {y.shape},X shape: {X.shape} ")
print()
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Check the shapes of all features
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train:{y_train.shape}, y_test:, {y_test.shape}')

y shape: (9866,),X shape: (9866, 1617) 

X_train: (7892, 1617), X_test: (1974, 1617), y_train:(7892,), y_test:, (1974,)


In [47]:
# create an xgboost regression model
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)
pred = model_xgb.predict(X_test)

# Append metrics to dataframe
results.loc[len(results.index)] = ['XGBoost Regressor','best_seller_rank', X_train.shape[0]+X_test.shape[0],  X_train.shape[1],
                                   metrics.mean_absolute_error(y_test, pred), 
                                   metrics.mean_squared_error(y_test, pred), 
                                   metrics.r2_score(y_test, pred),
                                   'All features +Vector ID, All rows'] 

results



Unnamed: 0,Model,Label,#_Row,#_Features,MAE,MSE,R2,Note
0,XGBoost Regressor,best_seller_rank,9866,81,103113.70628,22887464114.92366,0.65554,All features and rows included
1,XGBoost Regressor,best_seller_rank_0_1,9866,81,0.04581,0.0045,0.6551,All features and rows included
2,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
3,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
4,XGBoost Regressor,best_seller_rank_0_1,1653,81,0.0341,0.00297,0.70954,Q&A text or #_questions is 0. 1653 row
5,XGBoost Regressor,best_seller_rank_0_1,1653,50,0.05287,0.00674,0.3407,Question related features.1653 rows
6,XGBoost Regressor,best_seller_rank_0_1,1653,31,0.03522,0.00309,0.69769,Non-Question related features-1653 row
7,XGBoost Regressor,best_seller_rank_0_1,9866,50,0.076,0.01049,0.19515,Question related features-All Rows
8,XGBoost Regressor,best_seller_rank_0_1,9866,31,0.04652,0.00468,0.64085,Non-Question related features-All Rows
9,XGBoost Regressor,best_seller_rank_0_1,9864,81,0.04512,0.00435,0.66462,Outlier removed. All features.


### 2.13 XGBoost Regressor- With Vectors - 1653 Products

In [48]:
y = df_model_1653['best_seller_rank_0_1']
X = df_with_vec.loc[df_model_1653.index]
print(f"y shape: {y.shape},X shape: {X.shape} ")
print()
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Check the shapes of all features
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train:{y_train.shape}, y_test:, {y_test.shape}')

y shape: (1653,),X shape: (1653, 3072) 

X_train: (1322, 3072), X_test: (331, 3072), y_train:(1322,), y_test:, (331,)


In [49]:
# create an xgboost regression model
model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)
pred = model_xgb.predict(X_test)

# Append metrics to dataframe
results.loc[len(results.index)] = ['XGBoost Regressor','best_seller_rank', X_train.shape[0]+X_test.shape[0],  X_train.shape[1],
                                   metrics.mean_absolute_error(y_test, pred), 
                                   metrics.mean_squared_error(y_test, pred), 
                                   metrics.r2_score(y_test, pred),
                                   'All vectors.1653 rows'] 

results



Unnamed: 0,Model,Label,#_Row,#_Features,MAE,MSE,R2,Note
0,XGBoost Regressor,best_seller_rank,9866,81,103113.70628,22887464114.92366,0.65554,All features and rows included
1,XGBoost Regressor,best_seller_rank_0_1,9866,81,0.04581,0.0045,0.6551,All features and rows included
2,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
3,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
4,XGBoost Regressor,best_seller_rank_0_1,1653,81,0.0341,0.00297,0.70954,Q&A text or #_questions is 0. 1653 row
5,XGBoost Regressor,best_seller_rank_0_1,1653,50,0.05287,0.00674,0.3407,Question related features.1653 rows
6,XGBoost Regressor,best_seller_rank_0_1,1653,31,0.03522,0.00309,0.69769,Non-Question related features-1653 row
7,XGBoost Regressor,best_seller_rank_0_1,9866,50,0.076,0.01049,0.19515,Question related features-All Rows
8,XGBoost Regressor,best_seller_rank_0_1,9866,31,0.04652,0.00468,0.64085,Non-Question related features-All Rows
9,XGBoost Regressor,best_seller_rank_0_1,9864,81,0.04512,0.00435,0.66462,Outlier removed. All features.


# 3. Classification Models - XGBoost Classifier

In [50]:
# Define a function for auc score

def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
  lb = LabelBinarizer()
  lb.fit(y_test)
  y_test = lb.transform(y_test)
  y_pred = lb.transform(y_pred)
  return roc_auc_score(y_test, y_pred, average=average)

In [51]:
# Generate a dataframe to store the metrics
results_cls = pd.DataFrame(columns=['Model','Label','#_Row', '#_Features' , 'AUC', 'Accuracy','F1','Precision','Recall', 'Note'])

**Note:** We have six different buckets. We use them as an output with loop for all models.

### 3.1 Predict with all features and rows

In [52]:
for bucket in ['bucket_30_bsr', 'bucket_10_bsr', 'bucket_5_bsr', 'bucket_rev_6', 'bucket_5_frq', 'bucket_10_frq']:
  y = df_model[bucket]
  X = df_model.drop(columns = drop_list)
  # Split train and test set
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
  # Initialize the Model
  model_xgb = xgb.XGBClassifier(seed=0)
  scaler = MinMaxScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)
  model_xgb.fit(X_train_scaled, y_train)
  pred = model_xgb.predict(X_test_scaled)

  # Append metrics to dataframe
  results_cls.loc[len(results_cls.index)] = ['XGBoost Classifier',bucket, X_train.shape[0]+X_test.shape[0], 
                                    X_train.shape[1],
                                    multiclass_roc_auc_score(y_test, pred, average="macro"),
                                    accuracy_score(y_test,pred),
                                    f1_score(y_test,pred, average='macro'),
                                    precision_score(y_test,pred, average='macro'),
                                    recall_score(y_test,pred, average='macro'),
                                    'All features, rows.'] 

### 3.2 Predict with Question Related Features

In [53]:
for bucket in ['bucket_30_bsr', 'bucket_10_bsr', 'bucket_5_bsr', 'bucket_rev_6', 'bucket_5_frq', 'bucket_10_frq']:
  y = df_model[bucket]
  X = df_model.drop(columns = drop_list).drop(columns=non_question_features)
  # Split train and test set
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
  model_xgb = xgb.XGBClassifier(seed=0)
  # Initialize the Model
  scaler = MinMaxScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)
  model_xgb.fit(X_train_scaled, y_train)
  pred = model_xgb.predict(X_test_scaled)

  # Append metrics to dataframe
  results_cls.loc[len(results_cls.index)] = ['XGBoost Classifier',bucket, X_train.shape[0]+X_test.shape[0], 
                                    X_train.shape[1],
                                    multiclass_roc_auc_score(y_test, pred, average="macro"),
                                    accuracy_score(y_test,pred),
                                    f1_score(y_test,pred, average='macro'),
                                    precision_score(y_test,pred, average='macro'),
                                    recall_score(y_test,pred, average='macro'),
                                    'Question related features, rows.'] 

### 3.3 Predict with Non-Question Related Features

In [54]:
for bucket in ['bucket_30_bsr', 'bucket_10_bsr', 'bucket_5_bsr', 'bucket_rev_6', 'bucket_5_frq', 'bucket_10_frq']:
  y = df_model[bucket]
  X = df_model[non_question_features]
  # Split train and test set
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
  model_xgb = xgb.XGBClassifier(seed=0)
  # Initialize the Model
  scaler = MinMaxScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)
  model_xgb.fit(X_train_scaled, y_train)
  pred = model_xgb.predict(X_test_scaled)

  # Append metrics to dataframe
  results_cls.loc[len(results_cls.index)] = ['XGBoost Classifier',bucket, X_train.shape[0]+X_test.shape[0], 
                                    X_train.shape[1],
                                    multiclass_roc_auc_score(y_test, pred, average="macro"),
                                    accuracy_score(y_test,pred),
                                    f1_score(y_test,pred, average='macro'),
                                    precision_score(y_test,pred, average='macro'),
                                    recall_score(y_test,pred, average='macro'),
                                    'Non-Question related features, rows.'] 

### 3.5 Predict with all features and 1653 rows

In [55]:
for bucket in ['bucket_30_bsr', 'bucket_10_bsr', 'bucket_5_bsr', 'bucket_rev_6', 'bucket_5_frq', 'bucket_10_frq']:
  y = df_model_1653[bucket]
  X = df_model_1653.drop(columns = drop_list)
  # Split train and test set
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
  model_xgb = xgb.XGBClassifier(seed=0)
  # Initialize the Model
  scaler = MinMaxScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)
  model_xgb.fit(X_train_scaled, y_train)
  pred = model_xgb.predict(X_test_scaled)

  # Append metrics to dataframe
  results_cls.loc[len(results_cls.index)] = ['XGBoost Classifier',bucket, X_train.shape[0]+X_test.shape[0], 
                                    X_train.shape[1],
                                    multiclass_roc_auc_score(y_test, pred, average="macro"),
                                    accuracy_score(y_test,pred),
                                    f1_score(y_test,pred, average='macro'),
                                    precision_score(y_test,pred, average='macro'),
                                    recall_score(y_test,pred, average='macro'),
                                    'All features, 1653 rows.'] 

### 3.6 Predict with Question Related Features - 1653 Rows

In [56]:
for bucket in ['bucket_30_bsr', 'bucket_10_bsr', 'bucket_5_bsr', 'bucket_rev_6', 'bucket_5_frq', 'bucket_10_frq']:
  y = df_model_1653[bucket]
  X = df_model_1653.drop(columns = drop_list).drop(columns=non_question_features)
  # Split train and test set
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
  model_xgb = xgb.XGBClassifier(seed=0)
  # Initialize the Model
  scaler = MinMaxScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)
  model_xgb.fit(X_train_scaled, y_train)
  pred = model_xgb.predict(X_test_scaled)

  # Append metrics to dataframe
  results_cls.loc[len(results_cls.index)] = ['XGBoost Classifier',bucket, X_train.shape[0]+X_test.shape[0], 
                                    X_train.shape[1],
                                    multiclass_roc_auc_score(y_test, pred, average="macro"),
                                    accuracy_score(y_test,pred),
                                    f1_score(y_test,pred, average='macro'),
                                    precision_score(y_test,pred, average='macro'),
                                    recall_score(y_test,pred, average='macro'),
                                    'Question related features, 1653 rows.'] 

### 3.7 Predict with Non-Question Related Features

In [57]:
for bucket in ['bucket_30_bsr', 'bucket_10_bsr', 'bucket_5_bsr', 'bucket_rev_6', 'bucket_5_frq', 'bucket_10_frq']:
  y = df_model_1653[bucket]
  X = df_model_1653[non_question_features]
  # Split train and test set
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
  model_xgb = xgb.XGBClassifier(seed=0)
  # Initialize the Model
  scaler = MinMaxScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)
  model_xgb.fit(X_train_scaled, y_train)
  pred = model_xgb.predict(X_test_scaled)

  # Append metrics to dataframe
  results_cls.loc[len(results_cls.index)] = ['XGBoost Classifier',bucket, X_train.shape[0]+X_test.shape[0], 
                                    X_train.shape[1],
                                    multiclass_roc_auc_score(y_test, pred, average="macro"),
                                    accuracy_score(y_test,pred),
                                    f1_score(y_test,pred, average='macro'),
                                    precision_score(y_test,pred, average='macro'),
                                    recall_score(y_test,pred, average='macro'),
                                    'Non-Question related features, 1653 rows.'] 

# 4. Mixed Models

**NOTE:** In this section, we will predict with regression then assign the bucket. We don't use all regresson models. We use the classification metrics for assess the performance. We only use the best model with XGBoost Regressor. 

The model: XGB Boost Regressor, 1653 rows and all features.Output Normalized BSR rank.  

In [58]:
# define X and y
y = df_model_1653['best_seller_rank']
X = df_model_1653.drop(columns = drop_list)
print(f"y shape: {y.shape},X shape: {X.shape} ")
print()
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Check the shapes of all features
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train:{y_train.shape}, y_test:, {y_test.shape}')

y shape: (1653,),X shape: (1653, 81) 

X_train: (1322, 81), X_test: (331, 81), y_train:(1322,), y_test:, (331,)


In [59]:
model_xgb = XGBRegressor()
# Use function with scaled data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Fit and predict
model_xgb.fit(X_train, y_train)
pred = model_xgb.predict(X_test)   



In [60]:
df_mixed = pd.DataFrame(columns = ['y_test', 'pred'])
df_mixed['y_test'] = y_test
df_mixed['pred'] = pred
df_mixed.head()

Unnamed: 0,y_test,pred
1203,79461.0,145372.89062
6758,339516.0,187188.70312
9401,45282.0,156735.45312
3199,55468.0,2589.13647
4751,104446.0,67513.04688


In [61]:
df_mixed['y_test_30'] = df_mixed['y_test'] // 50000 + 1
df_mixed['pred_30'] = df_mixed['pred'] // 50000 + 1

df_mixed['y_test_10'] = df_mixed['y_test'] // 150000 + 1
df_mixed['pred_10'] = df_mixed['pred'] // 150000 + 1

In [62]:
results_cls.loc[len(results_cls.index)] = ['XGBoost Mixed','bsr_bucket', X_train.shape[0]+X_test.shape[0], 
                                   X_train.shape[1],
                                   multiclass_roc_auc_score(df_mixed['y_test_30'], df_mixed['pred_30'], average="macro"),
                                   accuracy_score(df_mixed['y_test_30'], df_mixed['pred_30']),
                                   f1_score(df_mixed['y_test_30'], df_mixed['pred_30'], average='macro'),
                                   precision_score(df_mixed['y_test_30'], df_mixed['pred_30'], average='macro'),
                                   recall_score(df_mixed['y_test_30'], df_mixed['pred_30'], average='macro'),
                                   '30 Bucket, All rows, features. Non-normalized'] 

results_cls.loc[len(results_cls.index)] = ['XGBoost Mixed','bsr_bucket_10', X_train.shape[0]+X_test.shape[0], 
                                   X_train.shape[1],
                                   multiclass_roc_auc_score(df_mixed['y_test_10'], df_mixed['pred_10'], average="macro"),
                                   accuracy_score(df_mixed['y_test_10'], df_mixed['pred_10']),
                                   f1_score(df_mixed['y_test_10'], df_mixed['pred_10'], average='macro'),
                                   precision_score(df_mixed['y_test_10'], df_mixed['pred_10'], average='macro'),
                                   recall_score(df_mixed['y_test_10'], df_mixed['pred_10'], average='macro'),
                                   '10 Bucket. All rows, features. Non-normalized']                                  

# 5. Ordinal Regression

In [63]:
# Generate a dataframe to store the metrics
results_ord_reg = pd.DataFrame(columns=['Model','Label','#_Row', '#_Fea' , 'AUC', 'Accuracy','F1','Precision','Recall', 'MAE','Note'])

In [64]:
drop_list

['best_seller_rank',
 'best_seller_rank_0_1',
 'bucket_30_bsr',
 'bucket_10_bsr',
 'bucket_5_bsr',
 'bucket_rev_6',
 'bucket_5_frq',
 'bucket_10_frq',
 'customer_questions_and_answers']

In [65]:
for bucket in ['bucket_30_bsr', 'bucket_10_bsr','bucket_5_bsr', 'bucket_rev_6', 'bucket_5_frq', 'bucket_10_frq']:
  y = df_model[bucket].astype(int)
  X = df_model.drop(columns = drop_list)
  # Split train and test set
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
  #Initialize model  
  ord_rid = mord.OrdinalRidge()
  ord_rid.fit(X_train,y_train)
  pred = ord_rid.predict(X_test)
  # Append metrics to dataframe
  results_ord_reg.loc[len(results_ord_reg.index)] = ['Ordinal Regression', bucket, X_train.shape[0]+X_test.shape[0],  X_train.shape[1],
                                                      multiclass_roc_auc_score(y_test, pred, average="macro"),
                                                      accuracy_score(y_test,pred),
                                                      f1_score(y_test,pred, average='macro'),
                                                      precision_score(y_test,pred, average='macro'),
                                                      recall_score(y_test,pred, average='macro'),
                                                      metrics.mean_absolute_error(y_test, pred),
                                                      'All features and rows included'] 

In [66]:
y = df_model['bucket_5_bsr']
X = df_model.drop(columns = drop_list)
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
#Initialize model  
ord_rid = mord.OrdinalRidge()
ord_rid.fit(X_train,y_train)
pred = ord_rid.predict(X_test)
print(macro_averaged_mean_absolute_error(y_test, pred))

0.9980794325625786


# 6. Assessing Metrics for All Models

In [68]:
results

Unnamed: 0,Model,Label,#_Row,#_Features,MAE,MSE,R2,Note
0,XGBoost Regressor,best_seller_rank,9866,81,103113.70628,22887464114.92366,0.65554,All features and rows included
1,XGBoost Regressor,best_seller_rank_0_1,9866,81,0.04581,0.0045,0.6551,All features and rows included
2,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
3,XGBoost Regressor,best_seller_rank_0_1,1653,81,75791.24194,14593610071.63353,0.71968,Q&A text or #_questions is 0. 1653 row.
4,XGBoost Regressor,best_seller_rank_0_1,1653,81,0.0341,0.00297,0.70954,Q&A text or #_questions is 0. 1653 row
5,XGBoost Regressor,best_seller_rank_0_1,1653,50,0.05287,0.00674,0.3407,Question related features.1653 rows
6,XGBoost Regressor,best_seller_rank_0_1,1653,31,0.03522,0.00309,0.69769,Non-Question related features-1653 row
7,XGBoost Regressor,best_seller_rank_0_1,9866,50,0.076,0.01049,0.19515,Question related features-All Rows
8,XGBoost Regressor,best_seller_rank_0_1,9866,31,0.04652,0.00468,0.64085,Non-Question related features-All Rows
9,XGBoost Regressor,best_seller_rank_0_1,9864,81,0.04512,0.00435,0.66462,Outlier removed. All features.


In [69]:
results_cls

Unnamed: 0,Model,Label,#_Row,#_Features,AUC,Accuracy,F1,Precision,Recall,Note
0,XGBoost Classifier,bucket_30_bsr,9866,81,0.53323,0.24924,0.09493,0.11125,0.09305,"All features, rows."
1,XGBoost Classifier,bucket_10_bsr,9866,81,0.58341,0.54103,0.24483,0.36223,0.23004,"All features, rows."
2,XGBoost Classifier,bucket_5_bsr,9866,81,0.63846,0.76596,0.38522,0.43248,0.37036,"All features, rows."
3,XGBoost Classifier,bucket_rev_6,9866,81,0.55334,0.91996,0.27379,0.4108,0.24399,"All features, rows."
4,XGBoost Classifier,bucket_5_frq,9866,81,0.69023,0.50811,0.49861,0.50235,0.503,"All features, rows."
5,XGBoost Classifier,bucket_10_frq,9866,81,0.61917,0.31763,0.29839,0.29637,0.31415,"All features, rows."
6,XGBoost Classifier,bucket_30_bsr,9866,50,0.50549,0.20466,0.03382,0.06279,0.04485,"Question related features, rows."
7,XGBoost Classifier,bucket_10_bsr,9866,50,0.50881,0.45643,0.08948,0.15747,0.11086,"Question related features, rows."
8,XGBoost Classifier,bucket_5_bsr,9866,50,0.50408,0.67629,0.17172,0.20147,0.20361,"Question related features, rows."
9,XGBoost Classifier,bucket_rev_6,9866,50,0.50322,0.9154,0.16584,0.23644,0.16963,"Question related features, rows."


In [70]:
results_ord_reg

Unnamed: 0,Model,Label,#_Row,#_Fea,AUC,Accuracy,F1,Precision,Recall,MAE,Note
0,Ordinal Regression,bucket_30_bsr,9866,81,0.52763,0.15805,0.07603,0.09471,0.08461,2.41895,All features and rows included
1,Ordinal Regression,bucket_10_bsr,9866,81,0.57036,0.42806,0.1778,0.19486,0.19431,0.77761,All features and rows included
2,Ordinal Regression,bucket_5_bsr,9866,81,0.64438,0.74316,0.36957,0.35586,0.38798,0.2847,All features and rows included
3,Ordinal Regression,bucket_rev_6,9866,81,0.55804,0.87335,0.2017,0.21388,0.21871,0.18338,All features and rows included
4,Ordinal Regression,bucket_5_frq,9866,81,0.59971,0.35968,0.28782,0.43016,0.29841,0.81054,All features and rows included
5,Ordinal Regression,bucket_10_frq,9866,81,0.5509,0.192,0.16932,0.28267,0.17369,1.63982,All features and rows included


# Conclusion

**Note:** We will continue with XGBoost Regressor with filtered data.

In [71]:
session_info.show()