In [None]:
## General data processing and visualisation use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re
import os
import glob
import plotly.express as px

## For webscraping
import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

## Machine learning / Deep learning classification models
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import tree
from sklearn.metrics import ConfusionMatrixDisplay

## XGBoost as extra
import xgboost as xgb

## Set the pandas display option set to max_columns
pd.set_option('display.max_columns', None)

## Natural language processing
from collections import Counter
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import spacy


Load the dataset into a dataframe

In [None]:
## Load the dataset from a CSV file (path is relative to the notebook's working directory)
df_n_fresh = pd.read_csv('df_n_fresh.csv')

In [None]:
def check_model_type(model_type):
    """Validate that ``model_type`` names one of the supported classifiers.

    Returns ``None`` when the name is supported.

    Raises:
        ValueError: if ``model_type`` is not one of the six supported names.
    """
    supported = ("DecisionTree", "RandomForest", "LogisticRegression", "XGBoost", "ExtraTrees", "SVM")
    if model_type in supported:
        return
    raise ValueError("Invalid model type - TRY AGAIN\n - TRY -> [DecisionTree, RandomForest, LogisticRegression, XGBoost, ExtraTrees, SVM]")

In [None]:
# NOTE: for a real project the fitted model should be persisted (e.g. pickled)
# so it does not have to be retrained and passed around on every run.
def train_model(X_train, y_train, model_type):
    """Grid-search a classifier of the requested type and return the best fit.

    A 10-fold ``GridSearchCV`` is run over a fixed hyper-parameter grid for the
    chosen model. The best parameters, a model-specific summary and the
    cross-validated macro-F1 (mean and std) of the selected estimator are
    printed.

    Args:
        X_train: Feature matrix; must expose ``.shape``.
        y_train: Target with the same number of rows as ``X_train``
            (DataFrame, Series or array; it is flattened to 1-D).
        model_type: One of the names accepted by ``check_model_type``.

    Returns:
        ``(best_estimator, importance)`` where ``importance`` is
        ``feature_importances_`` for the tree-based models and XGBoost,
        the (signed) coefficient vector ``coef_[0]`` for LogisticRegression,
        and ``None`` for SVM.

    Raises:
        ValueError: if the row counts differ or ``model_type`` is unsupported.
    """
    # Check input data and target shapes
    if X_train.shape[0] != y_train.shape[0]:
        raise ValueError("Shapes do not match! - TRY AGAIN")

    check_model_type(model_type)

    # scikit-learn expects a 1-D target; flatten once and reuse everywhere.
    y_flat = np.asarray(y_train).ravel()

    if model_type == 'DecisionTree':
        model = DecisionTreeClassifier(random_state=10)
        param_grid = {'max_depth': list(range(2, 21))}

    elif model_type == 'ExtraTrees':
        # Seeded like the other models so repeated runs are reproducible.
        model = ExtraTreesClassifier(random_state=10)
        param_grid = {
            'n_estimators': [5, 10, 20, 40, 60, 80, 100, 120, 150, 175, 200],
            'max_depth': list(range(1, 17))}

    elif model_type == 'LogisticRegression':
        model = LogisticRegression(random_state=10, max_iter=300)
        param_grid = {'C': [0.1, 0.5, 1, 5, 10, 25, 50, 75, 100]}

    elif model_type == 'RandomForest':
        model = RandomForestClassifier(random_state=10)
        param_grid = {'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 100, 125, 150, 200],
                      'max_depth': list(range(2, 17)) + [20, 32, 64]}

    elif model_type == 'XGBoost':
        model = xgb.XGBClassifier(random_state=10)
        param_grid = {'n_estimators': [25, 50, 100, 150, 200],
                      'max_depth': [2, 4, 8, 16, 32],
                      'learning_rate': [0.01, 0.05, 0.07, 0.1, 0.02, 0.03, 0.06, 0.08, 0.15]}

    elif model_type == 'SVM':
        # probability=True is required downstream because the evaluation code
        # thresholds predict_proba output.
        model = SVC(random_state=10)
        param_grid = {
            'probability': [True],
            'C': [0.01, 0.1, 0.5, 1, 5, 10, 12, 16, 20],
            'gamma': [50, 40, 30, 20, 10, 5, 2, 1, 0.1, 0.01, 0.001],
            'kernel': ['sigmoid']
        }
    else:
        # Defensive only: check_model_type above should already have raised.
        raise ValueError("Invalid model type - please try again with the following options: \n - LogisticRegression\n - DecisionTree\n - RandomForest\n - ExtraTrees\n - XGBoost\n - SVM")

    ## Initialise and declare GridsearchCV and fit accordingly
    grid_search = GridSearchCV(model, param_grid, cv=10)
    grid_search.fit(X_train, y_flat)
    print(f'BEST PARAMS {grid_search.best_params_}\n')

    ## Assign to model_output
    model_output = grid_search.best_estimator_

    if model_type == 'LogisticRegression':
        # LogisticRegression has no feature_importances_; its coefficients are
        # the closest equivalent.
        feature_importance = model_output.coef_[0]
        print(f'feature importance: \n {feature_importance}')
    elif model_type == 'SVM':
        feature_importance = None
        print('b = ', model_output.intercept_)
        print('Indices of support vectors = ', model_output.support_)
        print('Support vectors = ', model_output.support_vectors_)
        print('Number of support vectors for each class = ', model_output.n_support_)
        print('Coefficients of the support vector in the decision function = ', np.abs(model_output.dual_coef_))
    else:
        feature_importance = model_output.feature_importances_
        print(f'feature importance: \n {feature_importance}')

    # Score the *selected* estimator (cross_val_score clones it, so the fitted
    # state is not reused) and run the 10 folds once for both statistics.
    cv_f1 = cross_val_score(model_output, X_train, y_flat, cv=10, scoring='f1_macro')
    print(f"mean : {np.mean(cv_f1)}")
    print(f"std : {np.std(cv_f1)}")

    return model_output, feature_importance

In [None]:
def _threshold_metrics(y, prediction, model_type):
    """Return (accuracy, precision, recall, f1, confusion) for one set of predictions."""
    # SVM scores are weighted averages over the labels that were actually
    # predicted; every other model uses the default binary scoring.
    score_kwargs = {}
    if model_type == 'SVM':
        score_kwargs = {'average': 'weighted', 'labels': np.unique(prediction)}
    return (
        accuracy_score(y, prediction),
        precision_score(y, prediction, **score_kwargs),
        recall_score(y, prediction, **score_kwargs),
        f1_score(y, prediction, **score_kwargs),
        confusion_matrix(y, prediction),
    )


def test_model(X, y, model, model_type, train, threshold_input=None):
    """Evaluate a fitted classifier at a probability threshold.

    Despite the name this runs for both splits; ``train`` selects the mode.

    * ``train=True``: sweep thresholds 0.05, 0.10, ..., 1.00 and keep the one
      that raises recall (while staying below 0.85) and accuracy over the best
      seen so far.
    * ``train=False``: score the predictions at ``threshold_input``.

    Args:
        X: Features passed to ``model.predict_proba``.
        y: True labels.
        model: Fitted estimator exposing ``predict_proba``.
        model_type: Model name; ``'SVM'`` switches to weighted scoring.
        train: ``True`` for the threshold sweep, ``False`` for a single threshold.
        threshold_input: Threshold used when ``train`` is ``False``.

    Returns:
        train=True:  ``(accuracy, precision, recall, f1, confusion, proba,
        prediction, threshold, data_arr)`` for the selected threshold, where
        ``data_arr`` holds ``[threshold, accuracy, precision, recall, f1]`` for
        every threshold tried. If no threshold qualifies, the metrics stay 0,
        ``confusion`` is ``None`` and ``threshold`` is 0.
        train=False: ``(accuracy, precision, recall, f1, confusion, proba,
        prediction)``.

    Raises:
        ValueError: if ``train`` is ``False`` and ``threshold_input`` is ``None``.
    """
    # The probabilities do not depend on the threshold: compute them once.
    proba = model.predict_proba(X)
    positive_proba = proba[:, 1]

    if not train:
        if threshold_input is None:
            raise ValueError("threshold_input is required when train=False")
        prediction = (positive_proba >= threshold_input).astype(bool)
        accuracy, precision, recall, f1, confusion = _threshold_metrics(y, prediction, model_type)
        return accuracy, precision, recall, f1, confusion, proba, prediction

    threshold_return = 0
    accuracy_output = 0
    precision_output = 0
    recall_output = 0
    f1_output = 0
    confusion_output = None
    prediction_output = None
    prediction = 0
    data_arr = []

    for step in range(1, 21):
        # round() removes float noise such as 0.15000000000000002.
        threshold = round(step * 0.05, 2)
        prediction = (positive_proba >= threshold).astype(bool)
        accuracy, precision, recall, f1, confusion = _threshold_metrics(y, prediction, model_type)
        data_arr.append([threshold, accuracy, precision, recall, f1])

        ## Check recall as thats the focus
        ## Cant have recall or precision to be 1 as then either completely guessing 1 or 0.
        if recall_output < recall < 0.85 and accuracy > accuracy_output:
            recall_output = recall
            threshold_return = threshold
            accuracy_output = accuracy
            precision_output = precision
            f1_output = f1
            confusion_output = confusion
            # Keep the predictions that belong to the selected threshold.
            prediction_output = prediction

    if prediction_output is None:
        # Nothing qualified: fall back to the last threshold's predictions.
        prediction_output = prediction

    return accuracy_output, precision_output, recall_output, f1_output, confusion_output, proba, prediction_output, threshold_return, data_arr

In [None]:
def evaluate_note_performance(X, y, model_type, training_data_type):
    """Train one model, pick its threshold on the train split and score the test split.

    Works for XGBoost, SVM, Extra Trees, Random Forest, Decision Tree and
    Logistic Regression. ``training_data_type`` is accepted but not used by
    the current body.

    Returns:
        ``(data_arr_train, confusion_train, confusion_test, model, importance)``
        where ``importance`` is ``None`` for SVM.
    """
    # Fail fast on an unsupported model name.
    check_model_type(model_type)

    # 70/30 split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    print(f"-- TRAINING AND TESTING {model_type} --")
    model, fe_imp = train_model(X_train, y_train, model_type)

    # Train pass: sweeps the thresholds and reports the selected one.
    train_results = test_model(X_train, y_train, model, model_type, train=True)
    (train_acc, train_prec, train_rec, train_f1, train_cm,
     _train_proba, _train_pred, chosen_threshold, threshold_sweep) = train_results
    print(f'THRESHOLD FOR {model_type} IS {chosen_threshold}')

    # Test pass: reuse the threshold chosen on the train split.
    test_results = test_model(X_test, y_test, model, model_type, train=False, threshold_input=chosen_threshold)
    test_acc, test_prec, test_rec, test_f1, test_cm = test_results[:5]

    print(f'Accuracy train: {train_acc} | Accuracy test: {test_acc}')
    print(f'Precision train: {train_prec} | Precision test: {test_prec}')
    print(f'Recall train: {train_rec} | Recall test: {test_rec}')
    print(f'F1 train: {train_f1} | F1 test: {test_f1}')
    print(f'Confusion train: {train_cm} \n Confusion test: {test_cm}')

    # SVM reports no feature importance.
    importance = None if model_type == 'SVM' else fe_imp
    return threshold_sweep, train_cm, test_cm, model, importance


In [None]:
def show_eval_performance(data, fe=None, feature_names=None):
    """Plot accuracy, precision, recall and F1 against threshold, plus optional feature importance.

    Args:
        data: Iterable of ``[threshold, accuracy, precision, recall, f1]`` rows.
        fe: Optional sequence of per-feature importances. When given, a sorted
            bar chart is drawn as well.
        feature_names: Optional names matching ``fe`` one-to-one. When omitted,
            the notebook-level ``features_n_svm1`` list is used if its length
            matches ``fe``; otherwise generic ``feature_<i>`` names are used
            so the chart still renders.

    Returns:
        DataFrame with columns ``id`` (threshold), ``acc``, ``prec``,
        ``recall`` and ``f1``.

    Raises:
        ValueError: if ``feature_names`` is given but its length differs from ``fe``.
    """
    data_vis_df = pd.DataFrame(
        [list(row[:5]) for row in data],
        columns=['id', 'acc', 'prec', 'recall', 'f1'],
    )

    # One line per metric over the threshold values.
    fig, ax = plt.subplots(figsize=(15, 10))
    for column, label in (('acc', "accuracy"), ('prec', "precision"),
                          ('recall', "recall"), ('f1', "f1 values")):
        ax.plot(data_vis_df['id'], data_vis_df[column], label=label)
    ax.set_xlabel('Threshold')
    ax.set_ylabel('Metrics')
    ax.legend()
    plt.show()

    if fe is not None:
        importances = list(fe)

        if feature_names is None:
            # Previous behaviour relied on this global; only use it when it
            # actually lines up with the importances that were passed in.
            default_names = globals().get('features_n_svm1')
            if default_names is not None and len(default_names) == len(importances):
                feature_names = default_names
            else:
                feature_names = [f'feature_{i}' for i in range(len(importances))]
        elif len(feature_names) != len(importances):
            raise ValueError(
                f"feature_names has {len(feature_names)} entries but fe has {len(importances)}"
            )

        small_df = pd.DataFrame({
            'features': list(feature_names),
            'importance': importances
        }).sort_values('importance', ascending=False)

        fig, ax = plt.subplots(figsize=(8, 5))
        sns.barplot(data=small_df, x='features', y='importance', ax=ax)
        ax.set_xlabel('feature')
        ax.set_ylabel('importance')
        plt.setp(
            ax.get_xticklabels(),
            rotation=45,
            horizontalalignment='right',
            fontweight='light',
            fontsize='large'
        )
        ax.set_title('feature importance')
        plt.show()

    return data_vis_df

Utilisation of created functions

In [None]:
# Building blocks shared by every feature set below.
_core_features = ['instability', 'Dry', 'Frozen', 'Storm', 'Strong', 'Weak', 'Wet',
                  'East Central', 'East North', 'East South', 'Mt Hood', 'Olympics',
                  'Other', 'Snoqualmie Pass', 'Stevens Pass', 'West Central',
                  'West North', 'West South']
_observer_features = ['NWAC Forecaster', 'NWAC Observer', 'Pro', 'Public']
_year_columns = ['2020', '2021', '2022', '2023']
_month_columns = ['Apr', 'Dec', 'Feb', 'Jan', 'Jul', 'Jun', 'Mar', 'May', 'Nov', 'Oct', 'Sep']

# Without month: a single 'year' column sits between the core and observer columns.
features_n = _core_features + ['year'] + _observer_features

# Same as above but with one column per year instead of the single 'year' column.
features_n_svm1 = _core_features + _observer_features + _year_columns

# With month: features_n followed by the month columns.
features_m = features_n + _month_columns
target = ['avalanche_Y/N']

print(len(features_m), len(target))

In [None]:
# Cast 'year' to an integer dtype, then display every column's dtype to confirm the change.
df_n_fresh['year'] = df_n_fresh['year'].astype('int')
df_n_fresh.dtypes

## DECISION TREE TRAINING AND EVALUATION

In [None]:
# Works better without month
X = df_n_fresh[features_n]
y = df_n_fresh[target]

data_eval_decision_tree, cm_train, cm_test, model, fe_importance = evaluate_note_performance(X, y, model_type='DecisionTree', training_data_type='np')

print("--- DECISION TREE ---")
plt.figure(figsize=(15,10))
# plot_tree expects class_names in ascending order of the class values, the
# same order the confusion matrices below use, so both must read No -> Yes.
# NOTE(review): assumes the target encodes 0 = no avalanche, 1 = avalanche - confirm.
tree.plot_tree(
    model, feature_names=features_n, class_names=['No', 'Yes'], filled=True
)
plt.show()

# cmap is reused by the later model-evaluation cells.
cmap = plt.cm.tab20c
# Not used in this cell; kept unchanged in case other cells reference it.
lab = ['Yes', 'No']

print("--- TRAIN CONFUSION MATRIX ---")
disp_cm = ConfusionMatrixDisplay(confusion_matrix=cm_train, display_labels=['No', 'Yes'])
disp_cm.plot(cmap=cmap)
plt.show()
print(cm_train)

print("--- TEST CONFUSION MATRIX ---")
disp_cm = ConfusionMatrixDisplay(confusion_matrix=cm_test, display_labels=['No', 'Yes'])
disp_cm.plot(cmap=cmap)
plt.show()
print(cm_test)

print("--- OVERALL TRAIN EVALUATION ---")
df_disp = show_eval_performance(data_eval_decision_tree, fe_importance)
df_disp


## RANDOM FOREST TRAIN AND EVALUATION

In [None]:
# Random forest is trained on the feature set that includes the month columns.
X = df_n_fresh[features_m]
y = df_n_fresh[target]

data_eval_random_forest, cm_train, cm_test, model, fe = evaluate_note_performance(
    X, y, model_type='RandomForest', training_data_type='np'
)

# Same report for both splits: banner, plotted confusion matrix, raw counts.
for split_label, split_cm in (("TRAIN", cm_train), ("TEST", cm_test)):
    print(f"--- {split_label} CONFUSION MATRIX ---")
    disp_cm = ConfusionMatrixDisplay(confusion_matrix=split_cm, display_labels=['No', 'Yes'])
    disp_cm.plot(cmap=cmap)
    plt.show()
    print(split_cm)

print("--- OVERALL TRAIN EVALUATION ---")
df_disp = show_eval_performance(data_eval_random_forest, fe)
df_disp

## EXTRA TREES TRAIN AND EVAL

In [None]:
# Extra trees is trained on the feature set that includes the month columns.
X = df_n_fresh[features_m]
y = df_n_fresh[target]

data_eval_extratrees, cm_train, cm_test, model, fe = evaluate_note_performance(
    X, y, model_type='ExtraTrees', training_data_type='np'
)

# Same report for both splits: banner, plotted confusion matrix, raw counts.
for split_label, split_cm in (("TRAIN", cm_train), ("TEST", cm_test)):
    print(f"--- {split_label} CONFUSION MATRIX ---")
    disp_cm = ConfusionMatrixDisplay(confusion_matrix=split_cm, display_labels=['No', 'Yes'])
    disp_cm.plot(cmap=cmap)
    plt.show()
    print(split_cm)

print("--- OVERALL TRAIN EVALUATION ---")
df_disp = show_eval_performance(data_eval_extratrees, fe)
df_disp

## XGBOOST TRAIN AND EVAL

In [None]:
# XGBoost is trained on the feature set with one column per year.
X = df_n_fresh[features_n_svm1]
y = df_n_fresh[target]

data_eval_xgboost, cm_train, cm_test, model, fe = evaluate_note_performance(
    X, y, model_type='XGBoost', training_data_type='np'
)

# Same report for both splits: banner, plotted confusion matrix, raw counts.
for split_label, split_cm in (("TRAIN", cm_train), ("TEST", cm_test)):
    print(f"--- {split_label} CONFUSION MATRIX ---")
    disp_cm = ConfusionMatrixDisplay(confusion_matrix=split_cm, display_labels=['No', 'Yes'])
    disp_cm.plot(cmap=cmap)
    plt.show()
    print(split_cm)

print("--- OVERALL TRAIN EVALUATION ---")
df_disp = show_eval_performance(data_eval_xgboost, fe)
df_disp

## SVM TRAIN AND EVAL

In [None]:
# Correlation heatmap of the dataset, as a quick look at how the columns relate before fitting the SVM.
# Only numeric/boolean columns are correlated: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, where older versions silently dropped them.
sns.heatmap(df_n_fresh.select_dtypes(include=['number', 'bool']).corr())

In [None]:
# SVM is trained on the feature set with one column per year.
X = df_n_fresh[features_n_svm1]
y = df_n_fresh[target]

data_eval_svm, cm_train, cm_test, model, fe = evaluate_note_performance(
    X, y, model_type='SVM', training_data_type='np'
)

# Same report for both splits: banner, plotted confusion matrix (default colormap), raw counts.
for split_label, split_cm in (("TRAIN", cm_train), ("TEST", cm_test)):
    print(f"--- {split_label} CONFUSION MATRIX ---")
    disp_cm = ConfusionMatrixDisplay(confusion_matrix=split_cm, display_labels=['No', 'Yes'])
    disp_cm.plot()
    plt.show()
    print(split_cm)

# No feature importance is passed for the SVM, so only the metric curves are drawn.
print("--- OVERALL TRAIN EVALUATION ---")
df_disp = show_eval_performance(data_eval_svm)
df_disp

# Performance measure with temperature data.

I think the best way to show a proof of concept is to get the main weather data for each given area. The areas are listed below; restricting the data to them will limit the size of my data set:
- Mt Hood
- Olympics
- Stevens Pass
- Snoqualmie Pass
I will also need to test it on the original subset of the dataset with no temperature data, to get a clear idea of whether there is an increase in performance or not.

The data is very unbalanced, so I need to try specific areas and see whether that improves things.
- Further cuts are needed
