# Using the Cortex ML built-in binary classification function

- Gradient boosting machine
- binary: AUC loss function


#### Preparing training data

Doing an 80/20 split on the `model_data` table to get two separate datasets for training and testing. 
Each row is assigned a random row number; the 80% of rows with the lowest row numbers are used for training and the remaining 20% for testing. 


Then selecting the appropriate columns. 

In [None]:
-- Materialise the shuffled row numbers ONCE in a table.
-- As a view, ORDER BY RANDOM() would be re-evaluated every time sample_80 or
-- sample_20 is queried, so the two samples could overlap (train/test leakage)
-- and their contents would change between queries.
-- NOTE: if temp_table still exists as a view from an earlier run, drop that
-- view once (drop view temp_table;) before running this cell.
create or replace table temp_table as
select *, ROW_NUMBER() OVER (ORDER BY RANDOM()) AS row_num
from model_data;

-- Create the 80% sample view (training set): rows up to and including the 80% mark
create or replace view sample_80 as
select chain_cat_1, chain_cat_2, chain_cat_3, offer_value_1, offer_value_2, offer_value_3, offer_value_4, offer_value_5, offer_value_6, previous_purchase_category_int, previous_purchase_int, repeater_int
from temp_table
where row_num <= (SELECT COUNT(*) * 0.8 FROM temp_table);

select * from sample_80 limit 2;

-- Create the 20% sample view (test set): strictly after the 80% mark, so the
-- boundary row cannot end up in both samples
create or replace view sample_20 as
select chain_cat_1, chain_cat_2, chain_cat_3, offer_value_1, offer_value_2, offer_value_3, offer_value_4, offer_value_5, offer_value_6, previous_purchase_category_int, previous_purchase_int, repeater_int
from temp_table
where row_num > (SELECT COUNT(*) * 0.8 FROM temp_table);

select * from sample_20 limit 2;

In [None]:
-- Row count of the 20% test view (the same query is repeated further down,
-- next to the training-set count).
select count(*) from sample_20;

Checking the count of sample 80 and 20. 

In [None]:
-- Row count of the 80% training view.
select count(*) from sample_80;

In [None]:
-- Row count of the 20% test view.
select count(*) from sample_20;

### Creating model

Creating and training the model on sample_80 (the training set) with the label=repeater_int

In [None]:
-- Train (or retrain) the classification model model_binary on the sample_80
-- view; repeater_int is the target column, so the remaining columns of the
-- view act as input features.
create or replace snowflake.ml.classification model_binary(
    input_data => system$reference('view', 'sample_80'),
    target_colname => 'repeater_int'
);

### Predictions and metrics

Using the PREDICT function to make predictions on the test set and display the results with its corresponding input features. 

In [None]:
-- Store every sample_20 row together with the model output for that row.
-- "or replace" lets this cell be re-run (e.g. after retraining) without
-- failing because my_predictions already exists.
create or replace table my_predictions as
    select *, model_binary!PREDICT(
    INPUT_DATA => {*})
    as predictions from sample_20;

In [None]:
-- Inspect the stored predictions alongside their input columns.
select * from my_predictions;

Various evaluation metrics. The results are also copied into markdown so that they are saved. 

In [None]:
-- Evaluation metrics reported by the trained model itself
-- (takes no input, so it is not computed from my_predictions).
CALL model_binary!SHOW_EVALUATION_METRICS();

In [None]:
-- Overall (global) evaluation metrics reported by the trained model.
CALL model_binary!SHOW_GLOBAL_EVALUATION_METRICS();

In [None]:
-- Threshold metrics reported by the trained model.
CALL model_binary!SHOW_THRESHOLD_METRICS();

In [None]:
-- Confusion matrix reported by the trained model.
CALL model_binary!SHOW_CONFUSION_MATRIX();

In [None]:
-- Feature importance reported by the trained model.
CALL model_binary!SHOW_FEATURE_IMPORTANCE();

## Recording results


In [None]:
import json

def _sql_string_literal(value):
    """Return *value* rendered as a single-quoted SQL string literal.

    Backslashes and single quotes are escaped so the text can neither end the
    literal early nor be altered by backslash escape-sequence handling.
    """
    escaped = str(value).replace("\\", "\\\\").replace("'", "''")
    return f"'{escaped}'"


def record_performance(true_positive, true_negative, false_positive, false_negative, model_name, accuracy, recall, training_time, prediction_time, 
         parameters, coefficients, intercept, notes):
    """Store one model evaluation in the ``model_results_schema`` tables.

    A row is first inserted into ``model_results_schema.confusion_matrix``;
    its ``id`` is then read back and used as ``confusion_matrix_id`` for a new
    row in ``model_results_schema.model_performance``.

    Uses the module-level ``session`` object to run the SQL.

    Args:
        true_positive, true_negative, false_positive, false_negative:
            Confusion-matrix counts, inserted as numeric values.
        model_name: Name stored with the result (text).
        accuracy, recall: Metric values, inserted as numeric values.
        training_time, prediction_time: Timings, inserted as numeric values.
        parameters: JSON-serialisable object; stored as its JSON text.
        coefficients: Iterable of values, stored as a ``", "``-joined string.
            A string is stored as given and ``None`` is stored as an empty
            string.
        intercept: Inserted as a numeric value.
        notes: Free text stored with the result.

    Returns:
        The string ``"success"``.
    """
    confusion_matrix_insert_sql = f"""
        insert into model_results_schema.confusion_matrix
        (true_positive, true_negative, false_positive, false_negative)
        values
        ({true_positive}, {true_negative}, {false_positive}, {false_negative});
    """

    session.sql(confusion_matrix_insert_sql).collect()

    # NOTE(review): the new row is identified as "most recent by create_at";
    # another writer inserting at the same time could be picked up instead.
    last_id_sql = """
        select id
        from model_results_schema.confusion_matrix
        order by create_at desc
        limit 1;
    """

    confusion_matrix_id = session.sql(last_id_sql).collect()[0]['ID']

    # Both values are stored as text.
    if coefficients is None:
        coefficients = ""
    elif not isinstance(coefficients, str):
        # Works for lists, tuples and arrays alike (no element-wise "!=" test).
        coefficients = ', '.join(map(str, coefficients))
    parameters = json.dumps(parameters)

    # Escape every text value: quotes or backslashes (json.dumps emits
    # backslashes) would otherwise break or silently change the statement.
    model_name_sql = _sql_string_literal(model_name)
    parameters_sql = _sql_string_literal(parameters)
    coefficients_sql = _sql_string_literal(coefficients)
    notes_sql = _sql_string_literal(notes)

    # Insert data into the model_performance table
    session.sql(f"""
        insert into model_results_schema.model_performance
            (model_name, accuracy, recall, confusion_matrix_id,
            training_time, prediction_time, parameters, coefficients,
            intercept, notes)
        values
            ({model_name_sql}, {accuracy}, {recall}, {confusion_matrix_id}, {training_time}, {prediction_time}, {parameters_sql}, {coefficients_sql}, {intercept}, {notes_sql});
    """).collect()

    return "success"


In [None]:
# Cortex does not seem able to compute these metrics on the stored
# predictions, so they are pulled into pandas and scored locally.

session = get_active_session()
session.use_database("ML")
session.use_schema("RETAIL_STORE")

# Keep only the model output and the true label, then convert to pandas.
df_predictions = (
    session.table('my_predictions')
    .select('PREDICTIONS', 'REPEATER_INT')
    .to_pandas()
)

df_predictions.head()

In [None]:
import ast
import json


def _predicted_class(raw_prediction):
    """Return the 'class' entry of one prediction result as an int.

    The value is parsed as JSON first, because JSON literals such as null,
    true and false cannot be read by ast.literal_eval; text that is not valid
    JSON falls back to ast.literal_eval. Values that are not strings are
    assumed to be already parsed.
    """
    if isinstance(raw_prediction, str):
        try:
            parsed = json.loads(raw_prediction)
        except ValueError:
            parsed = ast.literal_eval(raw_prediction)
    else:
        parsed = raw_prediction
    return int(parsed['class'])


# Convert whole columns at once instead of mutating the frame row by row.
df_predictions['PREDICTIONS'] = df_predictions['PREDICTIONS'].map(_predicted_class)
df_predictions['REPEATER_INT'] = df_predictions['REPEATER_INT'].astype(int)

In [None]:
# Cast both columns to int so the true labels and the predictions reach
# scikit-learn with the same kind of dtype (an object column would not).
y_test = df_predictions['REPEATER_INT'].astype(int).to_numpy()
predictions = df_predictions['PREDICTIONS'].astype(int).to_numpy()
print(y_test.dtype)
print(predictions.dtype)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, confusion_matrix

accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.4f}')

recall = recall_score(y_test, predictions)
print(f'Recall: {recall:.4f}')

precision = precision_score(y_test, predictions)
print(f'Precision: {precision:.4f}')

# Stored under a different name than the imported f1_score function;
# rebinding f1_score would make this cell fail when it is run a second time.
f1 = f1_score(y_test, predictions)
print(f'F1 Score: {f1:.4f}')


# labels=[0, 1] keeps the matrix 2x2 even if one class is missing from the
# data, so the unpacking below cannot fail on a smaller matrix.
conf_matrix = confusion_matrix(y_test, predictions, labels=[0, 1])
print('Confusion Matrix:')
print(conf_matrix)

# For storing in db: rows are true labels, columns are predicted labels,
# so the flattened order is [0][0], [0][1], [1][0], [1][1].
true_negative, false_positive, false_negative, true_positive = (
    int(count) for count in conf_matrix.ravel()
)

In [None]:
# Hand-entered timings (units are not recorded here; presumably seconds, confirm).
training_time = 21
prediction_time = 10

# Descriptive fields for this run; Cortex exposes no parameters, coefficients
# or intercept here, so placeholder values are stored.
model_name = "CORTEX"
parameters = "no parameters"
coefficients = [0]
intercept = 0.0
notes = ""

record_performance(
    true_positive=true_positive,
    true_negative=true_negative,
    false_positive=false_positive,
    false_negative=false_negative,
    model_name=model_name,
    accuracy=accuracy,
    recall=recall,
    training_time=training_time,
    prediction_time=prediction_time,
    parameters=parameters,
    coefficients=coefficients,
    intercept=intercept,
    notes=notes,
)


