In [1]:
import pandas as pd
import numpy as np

# Data Sourcing and Merging

In [2]:
from data import Data

In [9]:
# Load the project's datasets through the local `Data` helper (imported from the `data` module).
# NOTE(review): `Data()` is instantiated twice; presumably a single instance could serve
# both calls — confirm against the `Data` implementation before merging them.
# `data` is not referenced again in the visible cells of this notebook.
data = Data().get_data()
# `df` is the frame used by the preprocessing and modelling cells below.
df = Data().get_match_table()

## Data Preprocessing

In [10]:
# Bucket the columns of `df` by dtype so each group can be routed to its own
# preprocessing step later on.
col_num = [name for name in df if df[name].dtype in ("float64", "int64")]
col_bool = [name for name in df if df[name].dtype == 'bool']
col_object = [name for name in df if df[name].dtype == 'object']

# The target column is boolean as well; it must not be treated as a feature.
col_bool.remove('summit_success')

In [11]:
# Back-fill missing values in boolean columns.
# `fillna`/`bfill` return a new Series, so the result has to be written back to
# `df`; calling it without assignment (as before) left the frame untouched.
# `is_bool_dtype` also matches pandas' nullable "boolean" dtype, which is the
# boolean dtype that can actually hold missing values.
for col in df:
    if pd.api.types.is_bool_dtype(df[col]):
        df[col] = df[col].bfill()

In [12]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer, KNNImputer
from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Numeric features: impute gaps from the 2 nearest neighbours, then min-max scale.
numeric_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=2)),
    ('scaler', MinMaxScaler()),
])

# Categorical (object) features: impute with the most frequent value, then
# one-hot encode, dropping the first level of every feature.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='error')),
])

# NOTE(review): the boolean columns collected in `col_bool` are not listed in the
# ColumnTransformer below (a most-frequent imputer for them was previously
# sketched here but left disabled) — confirm that leaving them out is intended.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, col_num),
    ('cat', categorical_transformer, col_object),
])

# Preprocessing-only pipeline: no estimator is attached here; the models further
# down are fitted on the transformed matrix instead.
clf = Pipeline(steps=[('preprocessor', preprocessor)])

# Separate the target from the features.
y = df['summit_success']
X = df.drop(columns='summit_success')

# NOTE(review): the transformers are fitted on the full data set, before the
# train/test split performed later in the notebook.
X_trans = clf.fit_transform(X)

## Models

In [13]:
from sklearn.metrics import classification_report

### Split the data

In [16]:
# Keep 30% of the rows aside for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_trans,
    y,
    test_size=0.3,
    random_state=42,
)

### Baseline Models

In [17]:
## Baseline
# Constant baseline: predict False (no summit) for every test row.
# The array is boolean so its dtype matches the boolean target labels.
y_base = np.zeros(len(y_test), dtype=bool)
# The positive class is never predicted, so its precision is undefined;
# zero_division=0 reports it as 0 explicitly instead of emitting a warning.
print(classification_report(y_test, y_base, zero_division=0))

              precision    recall  f1-score   support

       False       0.57      1.00      0.72      2790
        True       0.00      0.00      0.00      2125

    accuracy                           0.57      4915
   macro avg       0.28      0.50      0.36      4915
weighted avg       0.32      0.57      0.41      4915



  _warn_prf(average, modifier, msg_start, len(result))


### KNN Classifier

In [18]:
from sklearn.neighbors import KNeighborsClassifier

#### Simple KNN Classifier Model

In [19]:
%%time

# Simplest KNN variant: a single neighbour. `fit` returns the estimator itself,
# so construction and training can be chained.
model = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

# Mean accuracy on the test split.
test_accuracy = model.score(X_test, y_test)
print("model score: %.3f" % test_accuracy)

model score: 0.808
CPU times: user 3.52 s, sys: 343 ms, total: 3.87 s
Wall time: 3.87 s


In [20]:
# Per-class precision / recall / F1 of the single-neighbour model on the test split.
y_pred = model.predict(X_test)
knn_report = classification_report(y_test, y_pred)
print(knn_report)

              precision    recall  f1-score   support

       False       0.84      0.81      0.83      2790
        True       0.77      0.80      0.78      2125

    accuracy                           0.81      4915
   macro avg       0.80      0.81      0.80      4915
weighted avg       0.81      0.81      0.81      4915



#### Find the best KNN Classifier Model

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
%%time

# Candidate values per hyperparameter. `leaf_size` and `n_neighbors` currently
# hold the single value 1, so only `p` actually varies in this search.
leaf_size = [*range(1, 2)]
n_neighbors = [*range(1, 2)]
p = [1, 2]

# Parameter grid keyed by KNeighborsClassifier argument name.
hyperparameters = {
    'leaf_size': leaf_size,
    'n_neighbors': n_neighbors,
    'p': p,
}

# Fresh, unconfigured KNN estimator for the search to clone and tune.
knn_2 = KNeighborsClassifier()

# 10-fold cross-validated grid search using all available cores.
# NOTE(review): this rebinds `clf`, which named the preprocessing pipeline earlier
# in the notebook.
clf = GridSearchCV(estimator=knn_2, param_grid=hyperparameters, cv=10, n_jobs=-1)
best_model = clf.fit(X_train, y_train)

# Report the winning combination.
best_params = best_model.best_estimator_.get_params()
print('Best leaf_size:', best_params['leaf_size'])
print('Best p:', best_params['p'])
print('Best n_neighbors:', best_params['n_neighbors'])

Best leaf_size: 1
Best p: 1
Best n_neighbors: 1
CPU times: user 139 ms, sys: 147 ms, total: 287 ms
Wall time: 6.71 s


In [28]:
%%time
# Score the estimator selected by the grid search on the test split.
best_knn = best_model.best_estimator_
y_pred = best_knn.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.85      0.82      0.84      2790
        True       0.78      0.81      0.79      2125

    accuracy                           0.82      4915
   macro avg       0.81      0.82      0.82      4915
weighted avg       0.82      0.82      0.82      4915

CPU times: user 15.6 s, sys: 314 ms, total: 15.9 s
Wall time: 1.19 s


#### Best KNN Model

__F1 score (macro and weighted average) = 0.82__

__leaf_size = 1__

__p = 1__

__n_neighbors = 1__

__Prediction wall time = 1.19 seconds (grid-search fit: 6.71 seconds)__