In [2]:
import pandas as pd
# Load the data into a DataFrame 
df = pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,Rating,Review Count,Age,"Total Sizes_M, L, XL","Total Sizes_S, L, XL","Total Sizes_S, M, L",Available Sizes_L,Available Sizes_M,Available Sizes_S,Available Sizes_XL,...,Season_Fall/Winter,Season_Spring,Season_Spring/Summer,Season_Summer,Season_Winter,Customer Reviews_Mixed,Customer Reviews_Negative,Customer Reviews_Neutral,Customer Reviews_Positive,Customer Reviews_Unknown
0,0.527132,492,24,1,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
1,0.047096,57,61,1,0,0,0,0,0,1,...,0,0,0,0,1,0,1,0,0,0
2,3.708884,197,27,0,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
3,2.305824,473,50,0,0,1,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0.229053,55,23,1,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

# Features Engeenering:
df['Demand_M_L_XL'] = df['Total Sizes_M, L, XL'] - (df['Available Sizes_M'] + df['Available Sizes_L'] + df['Available Sizes_XL'])
df['Demand_S_L_XL'] = df['Total Sizes_S, L, XL'] - (df['Available Sizes_S'] + df['Available Sizes_L'] + df['Available Sizes_XL'])
df['Demand_S_M_L'] = df['Total Sizes_S, M, L'] - (df['Available Sizes_S'] + df['Available Sizes_M'] + df['Available Sizes_L'])

# Creating combined features for category and season
for category in ['Accessories', 'Activewear', 'Bottoms', 'Dresses', 'Footwear', 'Jewelry', 'Lingerie', 'Outerwear', 'Swimwear', 'Tops']:
    for season in ['Fall', 'Fall/Winter', 'Spring', 'Spring/Summer', 'Summer', 'Winter']:
        df[f'Category_{category}_Season_{season}'] = df[f'Category_{category}'] & df[f'Season_{season}']

# Target variables
target_columns = ['Total Sizes_M, L, XL', 'Total Sizes_S, L, XL', 'Total Sizes_S, M, L', 'Available Sizes_L', 
                  'Available Sizes_M', 'Available Sizes_S', 'Available Sizes_XL'] 

# Excluding direct size-related columns, but including category-season combinations
feature_columns = df.columns.drop(target_columns)

X = df[feature_columns]
y = df[target_columns]  # Use a DataFrame with all target columns

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the multi-output classifier with RandomForest
multi_output_rf = MultiOutputClassifier(RandomForestClassifier(random_state=42), n_jobs=-1)

# Train the model
multi_output_rf.fit(X_train, y_train)

# Make predictions
y_pred = multi_output_rf.predict(X_test)

# Output the results. Since we have multiple target columns, we can generate a report for each one
for i, target_name in enumerate(target_columns):
    print(f"Classification Report for {target_name}:")
    print(classification_report(y_test[target_name], y_pred[:, i]))


Classification Report for Total Sizes_M, L, XL:
              precision    recall  f1-score   support

           0       0.87      0.88      0.88    133547
           1       0.76      0.74      0.75     66453

    accuracy                           0.83    200000
   macro avg       0.81      0.81      0.81    200000
weighted avg       0.83      0.83      0.83    200000

Classification Report for Total Sizes_S, L, XL:
              precision    recall  f1-score   support

           0       0.87      0.88      0.87    133232
           1       0.75      0.74      0.75     66768

    accuracy                           0.83    200000
   macro avg       0.81      0.81      0.81    200000
weighted avg       0.83      0.83      0.83    200000

Classification Report for Total Sizes_S, M, L:
              precision    recall  f1-score   support

           0       0.87      0.88      0.88    133221
           1       0.76      0.74      0.75     66779

    accuracy                           

# OPTIMIZATION

In [17]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

multi_output_rf = MultiOutputClassifier(RandomForestClassifier(random_state=42), n_jobs=-1)

# Define the parameter grid
param_dist = {
    'estimator__n_estimators': randint(10, 200),
    'estimator__max_depth': randint(2, 5),
    'estimator__min_samples_leaf': randint(1, 4),
    'estimator__max_features': [ 'sqrt', None],
    'estimator__bootstrap': [True, False]
}

# Initialize the RandomizedSearchCV object
random_search = RandomizedSearchCV(
    estimator=multi_output_rf,
    param_distributions=param_dist,
    n_iter=2,
    cv=2,
    random_state=42,
    n_jobs=-1
)

# Fit to the training data
random_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found: ", random_search.best_params_)

# Use the best estimator to make predictions
y_pred = random_search.best_estimator_.predict(X_test)

# Output the results. 
for i, target_name in enumerate(target_columns):
    print(f"Classification Report for {target_name}:")
    print(classification_report(y_test[target_name], y_pred[:, i]))


Best parameters found:  {'estimator__bootstrap': True, 'estimator__max_depth': 2, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 3, 'estimator__n_estimators': 81}
Classification Report for Total Sizes_M, L, XL:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.67      1.00      0.80    133547
           1       0.00      0.00      0.00     66453

    accuracy                           0.67    200000
   macro avg       0.33      0.50      0.40    200000
weighted avg       0.45      0.67      0.53    200000

Classification Report for Total Sizes_S, L, XL:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.67      1.00      0.80    133232
           1       0.00      0.00      0.00     66768

    accuracy                           0.67    200000
   macro avg       0.33      0.50      0.40    200000
weighted avg       0.44      0.67      0.53    200000

Classification Report for Total Sizes_S, M, L:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.67      1.00      0.80    133221
           1       0.00      0.00      0.00     66779

    accuracy                           0.67    200000
   macro avg       0.33      0.50      0.40    200000
weighted avg       0.44      0.67      0.53    200000

Classification Report for Available Sizes_L:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.75      1.00      0.86    149994
           1       0.00      0.00      0.00     50006

    accuracy                           0.75    200000
   macro avg       0.37      0.50      0.43    200000
weighted avg       0.56      0.75      0.64    200000

Classification Report for Available Sizes_M:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.75      1.00      0.86    149787
           1       0.00      0.00      0.00     50213

    accuracy                           0.75    200000
   macro avg       0.37      0.50      0.43    200000
weighted avg       0.56      0.75      0.64    200000

Classification Report for Available Sizes_S:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.75      1.00      0.86    150215
           1       0.00      0.00      0.00     49785

    accuracy                           0.75    200000
   macro avg       0.38      0.50      0.43    200000
weighted avg       0.56      0.75      0.64    200000

Classification Report for Available Sizes_XL:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.75      1.00      0.86    150004
           1       0.00      0.00      0.00     49996

    accuracy                           0.75    200000
   macro avg       0.38      0.50      0.43    200000
weighted avg       0.56      0.75      0.64    200000



  _warn_prf(average, modifier, msg_start, len(result))
