# Random Forest Classifier for Retail_Store



### Dependencies

Add these packages:
- Numpy
- Pandas
- imbalanced-learn
- scikit-learn
- streamlit
- snowflake ml
- matplotlib

In [None]:
import numpy as np
import pandas as pd
import time

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
from sklearn.metrics import classification_report

from snowflake.ml.registry import Registry
from snowflake.snowpark.context import get_active_session

### Data Preparation

Split the data into X and y for both the train and test sets, and drop the ID columns.

In [None]:
session = get_active_session()

session.use_database("ML")
session.use_schema("RETAIL_STORE")

df_model_data = session.table('model_data') # importing data

start_time = time.time()

# The ID columns only identify rows, so they are removed before modelling.
df_model_data = df_model_data.drop("CUSTOMER_ID", "OFFER_PRODUCT_ID") # dropping id columns

LABEL_COLS = ["REPEATER_INT"]
FEATURE_COLS = list(df_model_data.drop("REPEATER_INT").columns)

print(f"Feature Columns: {FEATURE_COLS}")

# Fetch features and label with a SINGLE query and split them in pandas.
# Calling to_pandas() separately on a feature frame and a label frame runs two
# independent queries with no ORDER BY, so the row order of the two results is
# not guaranteed to match and labels could end up paired with the wrong rows.
model_pdf = df_model_data.to_pandas()

X = model_pdf.drop(columns=LABEL_COLS)
y = model_pdf[LABEL_COLS[0]].to_numpy()  # 1-D label array

# 80/20 train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

end_time = time.time()
final_time = end_time - start_time

print(f"\nData Preparation time: {final_time}")

### Training the Random Forest Classifier

##### Using a Randomized search to find optimal parameters

There are two main reasons why I chose to use a randomized search instead of a GridSearch or manually tuning it.
1. I don't necessarily have great reasons for choosing the various values in a param_grid without doing some manual testing first. 
2. Time and cost. Improving the model by 0.1% will have very little effect on the outcome and it would cost a lot more compute and time to do so with more thorough optimisation methods. 

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report

start_time = time.time()

# Base estimator with default settings; the search overrides the tuned ones.
model = RandomForestClassifier()

# Candidate values the randomized search samples from.
param_dist = dict(
    n_estimators=[50, 100, 200, 500],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
    max_features=['sqrt', 'log2'],  # 'auto' is left out because it raises an error
)

# 5 sampled candidates, each scored with F1 over 5 cross-validation folds.
search_settings = {
    'n_iter': 5,
    'scoring': 'f1',
    'cv': 5,
    'verbose': 2,
    'random_state': 42,
}
random_search = RandomizedSearchCV(model, param_dist, **search_settings)
random_search.fit(X_train, y_train)

end_time = time.time()
training_time = end_time - start_time
print("Training time: ", training_time)

parameters = random_search.best_params_
print("Best Parameters:", parameters)
print("Best Score:", random_search.best_score_)

##### Best Parameters

**Accuracy**

Training time: 612.6654381752014
Best Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 20}  

**F1 Score**

Training time: 662.3674252033234 

Best Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': None}  

**Precision**
Training time:  617.7530901432037  

Best Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 20}  

Best Score: 0.4705840774492579

**Recall**

It took 666.42 seconds to train or about 11 minutes


Best Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': None}  


Best Score: 0.30048660720508213

### Data Preparation for different methods to deal with class imbalance
- Oversampling
- Undersampling
- SMOTE
- class_weight = "balanced"

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Same seed as the train/test split so the resampled sets are reproducible.
RESAMPLING_SEED = 42


def _print_class_counts(label, y_values):
    """Print the size of y_values and how many entries equal 1 and 0.

    label is the name shown in the output so each line identifies the label
    array it describes.
    """
    print(f"Total num in {label}: {len(y_values)}. Number of 1's: {np.sum(y_values == 1)}. Number of 0's: {np.sum(y_values == 0)}")


# Class balance of the untouched training labels, for comparison.
_print_class_counts("y_train", y_train)

# Random oversampling.
ros = RandomOverSampler(random_state=RESAMPLING_SEED)
X_oversampled, y_oversampled = ros.fit_resample(X_train, y_train)
_print_class_counts("y_oversampled", y_oversampled)

# Random undersampling.
rus = RandomUnderSampler(random_state=RESAMPLING_SEED)
X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)
_print_class_counts("y_undersampled", y_undersampled)

# SMOTE.
smote = SMOTE(random_state=RESAMPLING_SEED)
X_smoted, y_smoted = smote.fit_resample(X_train, y_train)
_print_class_counts("y_smoted", y_smoted)

### Training and evaluating model

Using the optimal hyperparameters, but also a version that is all default.


In [None]:
start_time = time.time()

# Hyperparameters picked by the randomized search when scoring on F1.
# The accuracy- and precision-scored searches both picked instead:
#   n_estimators=100, min_samples_split=2, min_samples_leaf=2,
#   max_features='sqrt', max_depth=20
# Add 'class_weight': 'balanced' here to try class weighting.
params_f1 = dict(
    n_estimators=200,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='log2',
    max_depth=None,
)

model = RandomForestClassifier(**params_f1)

# Currently trained on the SMOTE-resampled data. Swap in (X_train, y_train),
# (X_undersampled, y_undersampled) or (X_oversampled, y_oversampled) to
# compare the other strategies.
model.fit(X_smoted, y_smoted)

end_time = time.time()
training_time = end_time - start_time

#### Feature Importance and Estimators

In [None]:
# Pull the fitted model's settings and internals; `parameters` is reused later
# when the run is recorded.
parameters = model.get_params()
estimators = model.estimators_
importances = model.feature_importances_

print("Feature importance:", importances)
# Only the first fitted tree is shown as a sample.
print("Estimators:", estimators[0])

In [None]:
# Predict on the held-out test set and time the call.
start_time = time.time()
predictions = model.predict(X_test)
end_time = time.time()
prediction_time = end_time - start_time

# One line per item: timing, raw predictions, then the per-class report.
print(
    f"Prediction time: {prediction_time}",
    f"Predictions: {predictions}",
    classification_report(y_test, predictions),
    sep="\n",
)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, confusion_matrix

# Score the test-set predictions.
accuracy = accuracy_score(y_test, predictions)
recall = recall_score(y_test, predictions)
precision = precision_score(y_test, predictions)
# NOTE: this rebinds the imported f1_score function to its float result; the
# import at the top of this cell restores the function on every re-run.
f1_score = f1_score(y_test, predictions)

metric_rows = (
    ('Accuracy', accuracy),
    ('Recall', recall),
    ('Precision', precision),
    ('F1 Score', f1_score),
)
for metric_label, metric_value in metric_rows:
    print(f'{metric_label}: {metric_value:.4f}')

conf_matrix = confusion_matrix(y_test, predictions)
print('Confusion Matrix:')
print(conf_matrix)

# For storing in db: row 0 holds (TN, FP) and row 1 holds (FN, TP).
(true_negative, false_positive), (false_negative, true_positive) = conf_matrix

**Record Performance function**

In [None]:
import json

def record_performance(true_positive, true_negative, false_positive, false_negative, model_name, accuracy, recall, training_time, prediction_time, 
         parameters, coefficients, intercept, notes):
    """Store one model run in the model_results_schema tables.

    Inserts the four confusion-matrix counts into
    model_results_schema.confusion_matrix, reads back the id of the newest
    row, then inserts the run's metrics into
    model_results_schema.model_performance referencing that id.

    All values are sent as bind parameters, so quotes inside model_name,
    notes or the serialized parameters cannot break or alter the statement.

    Args:
        true_positive, true_negative, false_positive, false_negative:
            Confusion-matrix counts; converted with int().
        model_name: Label stored for the run.
        accuracy, recall: Metric values; converted with float().
        training_time, prediction_time: Durations; converted with float().
        parameters: Model parameters; serialized with json.dumps.
        coefficients: Iterable of coefficients joined into a comma-separated
            string, or an already formatted string (stored unchanged).
        intercept: Model intercept; converted with float().
        notes: Free-text notes.

    Returns:
        The string "success".
    """
    # Uses the notebook-level Snowpark `session`.
    session.sql(
        """
        insert into model_results_schema.confusion_matrix
        (true_positive, true_negative, false_positive, false_negative)
        values
        (?, ?, ?, ?)
        """,
        # Plain Python ints: the counts usually arrive as numpy integers.
        params=[int(true_positive), int(true_negative), int(false_positive), int(false_negative)],
    ).collect()

    # NOTE(review): "newest row" is used as a stand-in for "the row just
    # inserted"; a concurrent writer could make this pick up another run's row.
    # The ordering column name `create_at` is taken as-is — confirm it matches
    # the table definition.
    last_id_sql = """
        select id
        from model_results_schema.confusion_matrix
        order by create_at desc
        limit 1
    """
    newest_rows = session.sql(last_id_sql).collect()
    confusion_matrix_id = newest_rows[0]['ID']

    # These two need to be on a string format.
    # Strings (including "") are stored unchanged; anything else is treated as
    # an iterable of coefficients. The isinstance check avoids comparing an
    # array with "" and avoids splitting a string into single characters.
    if not isinstance(coefficients, str):
        coefficients = ', '.join(map(str, coefficients))
    parameters = json.dumps(parameters)

    # Insert data into the model_performance table
    session.sql(
        """
        insert into model_results_schema.model_performance
            (model_name, accuracy, recall, confusion_matrix_id,
            training_time, prediction_time, parameters, coefficients,
            intercept, notes)
        values
            (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        params=[
            model_name,
            float(accuracy),
            float(recall),
            confusion_matrix_id,
            float(training_time),
            float(prediction_time),
            parameters,
            coefficients,
            float(intercept),
            notes,
        ],
    ).collect()

    return "success"



In [None]:

model_name = "RF-f1_score-smote"
# Precision and F1 are stored in the notes text rather than passed as their
# own arguments.
notes = f"precision:{precision!s}| f1_score: {f1_score!s}"

# [0] and 0.0 are placeholder values for coefficients and intercept.
result = record_performance(
    true_positive=true_positive,
    true_negative=true_negative,
    false_positive=false_positive,
    false_negative=false_negative,
    model_name=model_name,
    accuracy=accuracy,
    recall=recall,
    training_time=training_time,
    prediction_time=prediction_time,
    parameters=parameters,
    coefficients=[0],
    intercept=0.0,
    notes=notes,
)
print(result)

## Review Results

In [None]:
-- Switch to the database and schema that hold the recorded results.
use database ml;
use schema model_results_schema;

-- Every recorded model run, highest accuracy first.
select * from model_performance
order by accuracy desc;

In [None]:
-- Every stored confusion matrix.
select * from confusion_matrix;

In [None]:
# Prints the hard-coded count 8343 as a percentage of the test-set size.
# NOTE(review): 8343 looks like a figure copied from an earlier output —
# confirm its source and consider deriving it from a variable.
print(8343 / len(y_test)*100)

## Probability thresholds


In [None]:
-- Switch back to the schema that holds the retail data.
use schema retail_store;
-- Number of distinct customers present in the transactions table.
select count(distinct customer_id) from transactions;


In [None]:
# using the best model

model = RandomForestClassifier(
    n_estimators = 200,
    min_samples_split = 5,
    min_samples_leaf = 1,
    max_features = 'log2',
    max_depth = None
)

# Time the fit and the prediction and refresh `parameters`, so the values
# recorded for this model describe this model and not an earlier one.
start_time = time.time()
model.fit(X_undersampled, y_undersampled)
end_time = time.time()
training_time = end_time - start_time

parameters = model.get_params()

start_time = time.time()
predictions = model.predict(X_test)
end_time = time.time()
prediction_time = end_time - start_time

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import precision_recall_curve

# Probability of the positive class (column 1) for every test row.
probabilities = model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, probabilities)

# thresholds is one element shorter than recall, so zip pairs each threshold
# with its recall and leaves out the final curve point.
for recall_value, threshold_value in zip(recall, thresholds):
    print(f"Recall: {recall_value}, threshold: {threshold_value}")

# Drop the last curve point so each series lines up with thresholds.
for curve_values, curve_label in ((precision, 'Precision'), (recall, 'Recall')):
    plt.plot(thresholds, curve_values[:-1], label=curve_label)

plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Precision and Recall vs. Threshold')
plt.legend()
plt.show()

In [None]:
# Decision threshold roughly where the precision and recall curves meet.
# An earlier choice, 0.028952380952380955, targeted 95% recall.
threshold = 0.75

# Label a row positive when its predicted probability reaches the threshold.
is_positive = probabilities >= threshold
y_pred = is_positive.astype(int)


In [None]:
# Score the thresholded predictions.
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Recall: {recall:.4f}')
print('Confusion Matrix:')
print(conf_matrix)

# For storing in db: row 0 holds (TN, FP) and row 1 holds (FN, TP).
(true_negative, false_positive), (false_negative, true_positive) = conf_matrix

In [None]:

notes = ""
# Build the name from the threshold actually applied so the stored label
# cannot drift from the value used for y_pred.
model_name = f"RF-optimal-params-undersampled-threshold={threshold}"

# [0] and 0.0 are placeholder values for coefficients and intercept.
result = record_performance(true_positive, true_negative, false_positive, false_negative, model_name, accuracy, recall, training_time, prediction_time, 
         parameters, [0], 0.0, notes)
print(result)