In [26]:
import seaborn as sns
import matplotlib as plt
%matplotlib inline

In [1]:
import pandas as pd

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [2]:
file_path = 't-shirts.csv'
df = pd.read_csv(file_path)

In [3]:
df.describe()

Unnamed: 0,size,material,color,sleeves,demand
count,20000,20000,20000,20000,20000
unique,7,5,10,2,3
top,L,nylon,white,long,high
freq,4408,5652,3286,10117,8965


In [5]:

def one_hot_encode_column(df, column):
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')
    ohe_transformed = ohe.fit_transform(df[[column]])
    df = df.drop(columns=[column])
    df = pd.concat([df, ohe_transformed], axis=1)
    return df

In [10]:
def ordinal_encode_column(df, column, categories):
    encoder = OrdinalEncoder(categories=[categories])
    df[column] = encoder.fit_transform(df[[column]])
    return df

In [11]:
# material, color - one hot encoding
df_transformed = one_hot_encode_column(df, 'material')
df_transformed = one_hot_encode_column(df_transformed, 'color')

# size, sleeves, demand - ordinal
sizes = ['XS', 'S', 'M', 'L', 'XL', 'XXL', '3XL']
sleeves = ['short', 'long']
demand = ['low', 'medium', 'high']

df_transformed = ordinal_encode_column(df_transformed, 'size', sizes)
df_transformed = ordinal_encode_column(df_transformed, 'sleeves', sleeves)
df_transformed = ordinal_encode_column(df_transformed, 'demand', demand)

In [12]:
df_transformed.describe()

Unnamed: 0,size,sleeves,demand,material_cotton,material_linen,material_nylon,material_polyester,material_silk,color_black,color_blue,color_cream,color_green,color_navy,color_orange,color_red,color_violet,color_white,color_yellow
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,3.00185,0.50585,1.33195,0.2167,0.1668,0.2826,0.27775,0.05615,0.1559,0.0979,0.1149,0.05175,0.11445,0.066,0.1,0.06475,0.1643,0.07005
std,1.720231,0.499978,0.674079,0.412007,0.372807,0.450275,0.447901,0.230217,0.36277,0.297187,0.318909,0.221527,0.318365,0.248288,0.300008,0.24609,0.370557,0.255238
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.0,1.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,6.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [30]:
def standardize_set(df):
    scaler = StandardScaler()
    sc_set = scaler.fit_transform(df)
    return pd.DataFrame(sc_set, columns=df.columns)

def normalize_set(df):
    scaler = MinMaxScaler()
    sc_set = scaler.fit_transform(df)
    return pd.DataFrame(sc_set, columns=df.columns)

def naive_bayes(X_train, y_train, X_valid, hyperparameters=None):
    if hyperparameters:
        gnb = GaussianNB(**hyperparameters)
    else:
        gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_valid, y_pred, average='weighted')
    cm = confusion_matrix(y_valid, y_pred)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("Confusion Matrix:")
    print(cm)
    return y_pred

def decision_tree(X_train, y_train, X_valid, hyperparameters=None):
    if hyperparameters:
        dtc = DecisionTreeClassifier(**hyperparameters)
    else:
        dtc = DecisionTreeClassifier()
    dtc.fit(X_train, y_train)
    y_pred = dtc.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_valid, y_pred, average='weighted')
    cm = confusion_matrix(y_valid, y_pred)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("Confusion Matrix:")
    print(cm)
    return y_pred

In [31]:
X = df_transformed.drop(columns=['demand'])  # Features
y = df_transformed['demand']  # Target variable

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# standardized set
X_train_st = standardize_set(X_train)
X_valid_st = standardize_set(X_valid)

#normalized set
X_train_n = normalize_set(X_train)
X_valid_n = normalize_set(X_valid)

In [41]:
X_train.describe()

Unnamed: 0,size,sleeves,material_cotton,material_linen,material_nylon,material_polyester,material_silk,color_black,color_blue,color_cream,color_green,color_navy,color_orange,color_red,color_violet,color_white,color_yellow
count,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0
mean,3.009063,0.50725,0.215375,0.168125,0.283438,0.276938,0.056125,0.155562,0.096312,0.114687,0.052062,0.11575,0.064875,0.099625,0.064,0.166,0.071125
std,1.724722,0.499963,0.411095,0.373989,0.450681,0.447499,0.23017,0.362451,0.295029,0.318654,0.22216,0.319935,0.246313,0.299509,0.244761,0.372092,0.257042
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [40]:
X_valid.describe()

Unnamed: 0,size,sleeves,material_cotton,material_linen,material_nylon,material_polyester,material_silk,color_black,color_blue,color_cream,color_green,color_navy,color_orange,color_red,color_violet,color_white,color_yellow
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,2.973,0.50025,0.222,0.1615,0.27925,0.281,0.05625,0.15725,0.10425,0.11575,0.0505,0.10925,0.0705,0.1015,0.06775,0.1575,0.06575
std,1.702056,0.500062,0.415643,0.368038,0.448687,0.449544,0.230433,0.364082,0.305623,0.319965,0.219002,0.311992,0.25602,0.302027,0.251348,0.364317,0.247876
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [42]:
y_train.describe()

count    16000.000000
mean         1.330812
std          0.674558
min          0.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          2.000000
Name: demand, dtype: float64

In [43]:
y_valid.describe()

count    4000.000000
mean        1.336500
std         0.672221
min         0.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         2.000000
Name: demand, dtype: float64

In [45]:
X_valid_st.describe().round(5)

Unnamed: 0,size,sleeves,material_cotton,material_linen,material_nylon,material_polyester,material_silk,color_black,color_blue,color_cream,color_green,color_navy,color_orange,color_red,color_violet,color_white,color_yellow
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0
std,1.00013,1.00013,1.00013,1.00013,1.00013,1.00013,1.00013,1.00013,1.00013,1.00013,1.00013,1.00013,1.00013,1.00013,1.00013,1.00013,1.00013
min,-1.74693,-1.0005,-0.53418,-0.43887,-0.62245,-0.62516,-0.24414,-0.43196,-0.34115,-0.3618,-0.23062,-0.35021,-0.2754,-0.3361,-0.26958,-0.43237,-0.26529
25%,-0.57173,-1.0005,-0.53418,-0.43887,-0.62245,-0.62516,-0.24414,-0.43196,-0.34115,-0.3618,-0.23062,-0.35021,-0.2754,-0.3361,-0.26958,-0.43237,-0.26529
50%,0.01587,0.9995,-0.53418,-0.43887,-0.62245,-0.62516,-0.24414,-0.43196,-0.34115,-0.3618,-0.23062,-0.35021,-0.2754,-0.3361,-0.26958,-0.43237,-0.26529
75%,0.60346,0.9995,-0.53418,-0.43887,1.60656,1.5996,-0.24414,-0.43196,-0.34115,-0.3618,-0.23062,-0.35021,-0.2754,-0.3361,-0.26958,-0.43237,-0.26529
max,1.77866,0.9995,1.87203,2.27859,1.60656,1.5996,4.09607,2.31502,2.93127,2.76393,4.33613,2.8554,3.63103,2.97527,3.70947,2.31284,3.7695


In [46]:
X_train_st.describe().round(5)

Unnamed: 0,size,sleeves,material_cotton,material_linen,material_nylon,material_polyester,material_silk,color_black,color_blue,color_cream,color_green,color_navy,color_orange,color_red,color_violet,color_white,color_yellow
count,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0
mean,-0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0
std,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003
min,-1.74472,-1.01461,-0.52392,-0.44956,-0.62893,-0.61888,-0.24385,-0.42921,-0.32646,-0.35992,-0.23435,-0.3618,-0.26339,-0.33264,-0.26149,-0.44614,-0.27671
25%,-0.58508,-1.01461,-0.52392,-0.44956,-0.62893,-0.61888,-0.24385,-0.42921,-0.32646,-0.35992,-0.23435,-0.3618,-0.26339,-0.33264,-0.26149,-0.44614,-0.27671
50%,-0.00525,0.9856,-0.52392,-0.44956,-0.62893,-0.61888,-0.24385,-0.42921,-0.32646,-0.35992,-0.23435,-0.3618,-0.26339,-0.33264,-0.26149,-0.44614,-0.27671
75%,0.57457,0.9856,-0.52392,-0.44956,1.59,1.61584,-0.24385,-0.42921,-0.32646,-0.35992,-0.23435,-0.3618,-0.26339,-0.33264,-0.26149,-0.44614,-0.27671
max,1.73421,0.9856,1.90868,2.2244,1.59,1.61584,4.1009,2.32987,3.06315,2.77837,4.26705,2.76393,3.79661,3.00627,3.82426,2.24145,3.61383


In [47]:
X_valid_n.describe().round(5)

Unnamed: 0,size,sleeves,material_cotton,material_linen,material_nylon,material_polyester,material_silk,color_black,color_blue,color_cream,color_green,color_navy,color_orange,color_red,color_violet,color_white,color_yellow
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,0.4955,0.50025,0.222,0.1615,0.27925,0.281,0.05625,0.15725,0.10425,0.11575,0.0505,0.10925,0.0705,0.1015,0.06775,0.1575,0.06575
std,0.28368,0.50006,0.41564,0.36804,0.44869,0.44954,0.23043,0.36408,0.30562,0.31996,0.219,0.31199,0.25602,0.30203,0.25135,0.36432,0.24788
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.33333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.66667,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [48]:
X_train_n.describe().round(5)

Unnamed: 0,size,sleeves,material_cotton,material_linen,material_nylon,material_polyester,material_silk,color_black,color_blue,color_cream,color_green,color_navy,color_orange,color_red,color_violet,color_white,color_yellow
count,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0
mean,0.50151,0.50725,0.21538,0.16812,0.28344,0.27694,0.05612,0.15556,0.09631,0.11469,0.05206,0.11575,0.06488,0.09962,0.064,0.166,0.07112
std,0.28745,0.49996,0.4111,0.37399,0.45068,0.4475,0.23017,0.36245,0.29503,0.31865,0.22216,0.31993,0.24631,0.29951,0.24476,0.37209,0.25704
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.33333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.66667,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [49]:
hyperparameters = [
    {'var_smoothing': 0.5},
    {'var_smoothing': 1e-7},
    {'var_smoothing': 1e-11}
]
data_sets = [
    (X_train, X_valid, "Unchanged"),
    (X_train_st, X_valid_st, "Standardized"),
    (X_train_n, X_valid_n, "Normalized")
]

In [50]:
print("Naive Bayes - Unchanged set: ")
naive_bayes(X_train, y_train, X_valid)
print("Naive Bayes - Standardized set: ")
naive_bayes(X_train_st, y_train, X_valid_st)
print("Naive Bayes - Normalized set: ")
naive_bayes(X_train_n, y_train, X_valid_n)
# hyperparameters
for set_train, set_valid, description in data_sets:
    print(f"Evaluating on {description} set:")
    for i, params in enumerate(hyperparameters, 1):
        print(f"  Training model {i} with hyperparameters: {params}")
        naive_bayes(set_train, y_train, set_valid, params)
    print('------' * 10)

Naive Bayes - Unchanged set: 
Accuracy: 0.62
Precision: 0.7378040897034
Recall: 0.62
Confusion Matrix:
[[ 186   16  255]
 [  39  530 1171]
 [   0   39 1764]]
Naive Bayes - Standardized set: 
Accuracy: 0.48175
Precision: 0.7154058517780463
Recall: 0.48175
Confusion Matrix:
[[ 369   88    0]
 [ 182 1558    0]
 [ 208 1595    0]]
Naive Bayes - Normalized set: 
Accuracy: 0.62
Precision: 0.7378040897034
Recall: 0.62
Confusion Matrix:
[[ 186   16  255]
 [  39  530 1171]
 [   0   39 1764]]
Evaluating on Unchanged set:
  Training model 1 with hyperparameters: {'var_smoothing': 0.5}
Accuracy: 0.642
Precision: 0.6817758415123183
Recall: 0.642
Confusion Matrix:
[[   0  324  133]
 [   0 1141  599]
 [   0  376 1427]]
  Training model 2 with hyperparameters: {'var_smoothing': 1e-07}
Accuracy: 0.62
Precision: 0.7378040897034
Recall: 0.62
Confusion Matrix:
[[ 186   16  255]
 [  39  530 1171]
 [   0   39 1764]]
  Training model 3 with hyperparameters: {'var_smoothing': 1e-11}
Accuracy: 0.62
Precision: 0

In [51]:
hyperparameters = [
    {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'ccp_alpha': 0.1},
    {'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'ccp_alpha': 0.01},
    {'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'ccp_alpha': 0.001}
]

In [52]:
# no hyperparameters
print("Decision Tree - Unchanged set: ")
decision_tree(X_train, y_train, X_valid)
print("Decision Tree - Standardized set: ")
decision_tree(X_train_st, y_train, X_valid_st)
print("Decision Tree - Normalized set: ")
decision_tree(X_train_n, y_train, X_valid_n)
# hyperparameters
for set_train, set_valid, description in data_sets:
    print(f"Evaluating on {description} set:")
    for i, params in enumerate(hyperparameters, 1):
        print(f"\nTraining model {i} with hyperparameters: {params}")
        decision_tree(set_train, y_train, set_valid, params)
    print('------'*10)

Decision Tree - Unchanged set: 
Accuracy: 0.971
Precision: 0.9710169240594123
Recall: 0.971
Confusion Matrix:
[[ 421   36    0]
 [  23 1693   24]
 [   0   33 1770]]
Decision Tree - Standardized set: 
Accuracy: 0.971
Precision: 0.9710169240594123
Recall: 0.971
Confusion Matrix:
[[ 421   36    0]
 [  23 1693   24]
 [   0   33 1770]]
Decision Tree - Normalized set: 
Accuracy: 0.971
Precision: 0.9710169240594123
Recall: 0.971
Confusion Matrix:
[[ 421   36    0]
 [  23 1693   24]
 [   0   33 1770]]
Evaluating on Unchanged set:

Training model 1 with hyperparameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'ccp_alpha': 0.1}
Accuracy: 0.45075
Precision: 0.7524255624999999
Recall: 0.45075
Confusion Matrix:
[[   0    0  457]
 [   0    0 1740]
 [   0    0 1803]]

Training model 2 with hyperparameters: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'ccp_alpha': 0.0