In [None]:
import data_prep
import pandas as pd
import numpy as np

# Load the cleaned dataset from the project-local data_prep module.
# `df` stays in the kernel namespace; later cells in this notebook read it as well.
df = data_prep.get_cleaned_dataset()

# Show the per-column dtypes of the loaded object.
print(df.dtypes)

In [None]:
#####################
### FUNCTION DEFS ###
#####################

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

def prep_dataset(df, test_size=0.2, random_state=None):
    """
    One-hot encodes every non-'Class' column of `df` and splits the result into train/test sets.

    :param df: DataFrame holding the feature columns plus a 'Class' label column
    :param test_size: forwarded unchanged to sklearn's train_test_split
    :param random_state: optional seed forwarded to train_test_split; the default (None) keeps the
        previous behaviour of drawing a different random split on every call
    :returns X_train, X_test, y_train, y_test
    NOTE: The encoder is fitted on all rows before the split, so the train and test matrices
          share the same one-hot columns.
    """
    # Separate features & labels
    X = df.drop('Class', axis=1).values     # Features (every column except 'Class')
    y = df['Class'].values                  # Labels (the 'Class' column)

    # Every feature column is passed through the one-hot encoder
    X = OneHotEncoder().fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

from sklearn.metrics import confusion_matrix # TODO confusion matrix

def basic_predict(X_train, X_test, y_train, y_test, classifier):
    """
    Fits `classifier` on the training split and scores it on the test split.
    :param X_train: training features handed to classifier.fit (must expose .shape)
    :param X_test: test features handed to classifier.predict (must expose .shape)
    :param y_train: training labels handed to classifier.fit
    :param y_test: true labels of the test rows
    :param classifier: object exposing fit(X, y) and predict(X)
    :returns double indicating performance (fraction of test rows predicted correctly)
    :raises ZeroDivisionError if y_test is empty
    NOTE: With a random train/test split the performance is not consistent between runs
    """
    classifier.fit(X_train, y_train)

    y_test_predict = classifier.predict(X_test)

    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")

    # Compare position by position on plain arrays. Indexing y_test[ii] directly would be a
    # label lookup on a pandas Series and break when its index is not 0..n-1.
    hits = np.asarray(y_test_predict) == np.asarray(y_test)

    performance = np.count_nonzero(hits)/len(hits)

    return performance

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Dataset
df = data_prep.get_cleaned_dataset()

# NOTE(review): test_size=0.9966 leaves only ~0.34% of the rows for training - confirm this is intended
X_train, X_test, y_train, y_test = prep_dataset(df, test_size=0.9966)

# K-Nearest Neighbours
knn = KNeighborsClassifier(n_neighbors=3)
knn_perf = basic_predict(X_train, X_test, y_train, y_test, knn)

# Decision Tree 
# TODO use something other than decision tree
dt = DecisionTreeClassifier()
dt_perf = basic_predict(X_train, X_test, y_train, y_test, dt)

# Naive Bayes
gnb = GaussianNB()
# GaussianNB does not accept scipy sparse input, which is what OneHotEncoder produces by default,
# so hand it dense copies of the same split.
X_train_dense = X_train.toarray() if hasattr(X_train, 'toarray') else X_train
X_test_dense = X_test.toarray() if hasattr(X_test, 'toarray') else X_test
# Score the Naive Bayes model itself (this call previously re-scored the decision tree)
gnb_perf = basic_predict(X_train_dense, X_test_dense, y_train, y_test, gnb)

print(f"knn performance: {knn_perf}")
print(f"dt performance: {dt_perf}")
print(f"gnb performance: {gnb_perf}")


In [None]:
###########
### OLD ###
###########

# Try some knn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

train_test_ratio = 0.8

# Uses `df` as left in the kernel namespace by an earlier cell.
# Features, labels and the one-hot encoding do not depend on k, so build them once.
X = df.drop('Class', axis=1).values     # Features
y = df['Class'].values                  # Labels

X = OneHotEncoder().fit_transform(X)

# Candidate neighbour counts; each needs at least that many training rows
n_values = range(1, 101)

performance_arr = []

for n in n_values:
    # Use the loop value as k (the sweep previously evaluated k=1 on every pass)
    knn = KNeighborsClassifier(n_neighbors=n)

    # NOTE: a fresh random split is drawn for every k, so scores are noisy across k
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_test_ratio)

    knn.fit(X_train, y_train)

    y_predict = knn.predict(X_test)

    # Fraction of test rows predicted correctly
    performance = np.count_nonzero(y_predict == y_test)/len(y_test)

    performance_arr.append(performance)

# Map the position of the best score back to the k it was measured with
best_idx = performance_arr.index(max(performance_arr))
best_n = n_values[best_idx]
print(best_n)

print(performance_arr[best_idx])

In [18]:
#########################################
## Pipeline 1: K-Fold Cross-Validation ##
#########################################
## Missing: hparams, attribute selection, meta-classifiers ##

import pickle

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

import data_prep

# This cell's own import block does not bring in OneHotEncoder; import it here so the cell
# does not rely on an earlier cell having done so.
from sklearn.preprocessing import OneHotEncoder

# Get dataset (mix of categorical & numeric)
df_train, df_test = data_prep.get_prepped_dataset(bins=10, verbose=False)

full = df_train.to_numpy()

# Define values & labels
X = df_train.drop('Class', axis=1).to_numpy()
y = df_train['Class'].to_numpy()

# Perform one-hot encoding on categorical values
X = OneHotEncoder().fit_transform(X)

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20)

knn = KNeighborsClassifier(n_neighbors=3)

# One score per fold from 5-fold cross-validation of the 3-NN classifier
result = cross_val_score(knn, X, y, cv=5)

# Show the per-fold scores (the recorded output of this cell lists them)
print(result)


[0.70555556 0.72222222 0.70555556 0.68333333 0.72777778]


In [None]:
#############################################################
## Pipeline 2: K-Fold Cross-Validation & Feature Selection ##
#############################################################
## Missing: hparams, attribute selection, meta-classifiers ##

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20) TODO do this later!!!

import pickle

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

import data_prep

# Load the prepped train/test frames (mix of categorical & numeric)
df_train, df_test = data_prep.get_prepped_dataset(bins=10, verbose=False)

full = df_train.to_numpy()

# 'Class' is the label; every other column is a feature
y = df_train['Class'].to_numpy()
X = df_train.drop('Class', axis=1).to_numpy()

knn = KNeighborsClassifier(n_neighbors=3)

# Backward sequential feature selection, scored with the 3-NN estimator under 5-fold CV
feature_selector = SequentialFeatureSelector(knn, direction='backward', n_features_to_select=None, cv=5) # TODO change these params

# Build pipeline: one-hot encode first, then select among the encoded columns
pl = Pipeline(steps=[
    ('onehot', OneHotEncoder()),
    ('sfs', feature_selector),
])

print("Starting pipeline fit...")
pl.fit(X, y)
print("Done!")

# TODO idea: try removing all 2 pairs of features to see what gets the best performance

# Persist the fitted pipeline so a later cell can inspect it
with open('pipeline_2', 'wb') as f:
    pickle.dump(pl, f)

In [None]:
###############
## NEW THING ##
###############



In [34]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import pickle

# messing about with the created object
# Reload the prepped train/test frames (only referenced by the commented-out code below).
df_train, df_test = data_prep.get_prepped_dataset(bins=10, verbose=False)

# Load the pipeline pickled by the Pipeline 2 cell.
# NOTE: pickle.load can execute arbitrary code from the file - only load files this notebook wrote.
# NOTE: the `Pipeline` annotation is evaluated at runtime and is not imported in this cell,
#       so this relies on an earlier cell having imported it.
with open('./pipeline_2', 'rb') as f:
    result:Pipeline = pickle.load(f)

# Pull the feature-selection step out of the pipeline by its step name
# and print the shape of its support mask.
sfs:SequentialFeatureSelector = result.named_steps['sfs']
print(sfs.get_support().shape)

# Disabled experiment: score the loaded pipeline on the training data.
# test = df_train.to_numpy()
# X = df_train.drop('Class', axis=1).to_numpy()
# y = df_train['Class'].to_numpy()

# score = accuracy_score(result.predict(X), y)
# print(score)

(118,)


In [2]:
####################################################################
## Pipeline 3: K-Fold Cross-Validation & SMOTE ##
####################################################################
## Missing: hparams, attribute selection, meta-classifiers ##

import pickle

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold, KFold, cross_val_predict, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from imblearn.over_sampling import SMOTE, SMOTEN
from imblearn.pipeline import Pipeline as imbpipeline

import data_prep


# Get dataset (mix of categorical & numeric)
df_train, df_test = data_prep.get_prepped_dataset(bins=10, verbose=False)

full = df_train.to_numpy()

# Define values & labels
X = df_train.drop('Class', axis=1).to_numpy()
y = df_train['Class'].to_numpy()

# Label-encode the labels  
y = LabelEncoder().fit_transform(y)

# Build imblearn pipeline with nominal SMOTE
# ipl = make_pipeline(
#     SMOTEN(random_state=123, sampling_strategy='not minority'),
#     OneHotEncoder()
# )

# X_r, y_r = ipl.fit_resample(X, y)

# for col in df_train.drop('Class', axis=1).columns:
#     print(list(df_train[col].cat.categories))

# Get list of categories per column
# (passed to the encoder so every fold produces the same one-hot columns)
categories = [list(df_train[col].cat.categories) for col in df_train.drop('Class', axis=1).columns]

# Oversample with SMOTEN -> one-hot encode -> k-NN
pl = imbpipeline(steps= [
        ('smoten', SMOTEN(random_state=123, sampling_strategy='not majority')),
        ('onehot', OneHotEncoder(categories=categories)),
        ('knn', KNeighborsClassifier())
    ]
)

# Fit once on the full training data. GridSearchCV below works on clones of `pl`,
# so the search itself does not depend on this fit.
pl.fit(X, y)

# K-Fold Cross validation generator
# NOTE: Stratified K-Fold used to ensure training classes are balanced (gives better representation of SMOTE-enabled prediction)
kfold = StratifiedKFold(n_splits=5)

# Keys follow the '<step name>__<parameter>' convention of the pipeline above
param_grid = {
    'knn__n_neighbors': list(range(3, 5))
}
# Hand the stratified splitter to the search explicitly (it was built but never used before)
grid_search = GridSearchCV(pl, param_grid, cv=kfold)

print("Starting fit..")
grid_search.fit(X, y)
print("Done!")

# Persist the fitted search so a later cell can inspect it
with open('pipeline_3_graph_search_knn', 'wb') as f:
    pickle.dump(grid_search, f)

# BELOW WORKY
# scores = cross_validate(pl, X, y, cv=kfold, scoring=('accuracy', 'balanced_accuracy', 'f1', 'precision', 'recall'))
# print("SCORES")
# print(scores)

# preds = cross_val_predict(pl, X, y, cv=kfold)

# conf_mat = confusion_matrix(y_pred=preds, y_true=y, normalize='all')
# print("CONFUSION MATRIX")
# print(conf_mat)

# sns.heatmap(conf_mat, annot=True)

# Share of the rows that carries each encoded label
unique, counts = np.unique(y, return_counts=True)
counts = counts / len(y)
label_proportions = dict(zip(unique, counts))
print(label_proportions)


# THIS WORKY ######################################################
# print(f"Pre-SMOTE shape: {X.shape}")

# smoten = SMOTEN(random_state=123, sampling_strategy='not majority')
# X_r, y_r = smoten.fit_resample(X, y)

# print(f"Post-SMOTE shape: {X_r.shape}")

# oh_enc = OneHotEncoder()
# X_r = oh_enc.fit_transform(X_r)

# print(f"Post-Onehot shape: {X_r.shape}")

# l_enc = LabelEncoder()
# y_r = l_enc.fit_transform(y_r)
# /THIS WORKY #####################################################

# print(X_r)
# np.savetxt('./x.txt', X_r, fmt='%s')

# print("Done!")
# with open('pipeline_2', 'wb') as f:
#     pickle.dump(sfs, f)

Starting fit..
Done!
{0: 0.7222222222222222, 1: 0.2777777777777778}


In [45]:
# Messing about with grid search output

from imblearn.pipeline import Pipeline as imbpipeline
import pickle

# The object pickled under this name by the Pipeline 3 cell is the fitted GridSearchCV,
# not a bare pipeline, so annotate it as such.
from sklearn.model_selection import GridSearchCV

# NOTE: pickle.load can execute arbitrary code from the file - only load files this notebook wrote.
with open('./pipeline_3_graph_search_knn', 'rb') as f:
    r:GridSearchCV = pickle.load(f)

# Per-candidate timings and scores recorded by the search
r.cv_results_

{'mean_fit_time': array([0.14563704, 0.14527011, 0.14538217, 0.14670243, 0.14642553,
        0.14640503, 0.14465084, 0.14359536, 0.14569526, 0.14511809,
        0.14548187, 0.14464917, 0.14447303, 0.14507599, 0.14581785,
        0.14637213, 0.14540472, 0.14490886, 0.14604597]),
 'std_fit_time': array([0.00274631, 0.0005022 , 0.00167734, 0.00353577, 0.00351841,
        0.00200359, 0.00232369, 0.00210118, 0.00232087, 0.00210185,
        0.00280988, 0.00168527, 0.00252989, 0.00131327, 0.00189256,
        0.0015224 , 0.0017069 , 0.00189405, 0.00200904]),
 'mean_score_time': array([0.01212091, 0.01228065, 0.01290884, 0.01261086, 0.01274905,
        0.01294932, 0.01286087, 0.01273909, 0.01277547, 0.01304007,
        0.01288052, 0.01284432, 0.01263766, 0.0129756 , 0.01338882,
        0.01301546, 0.01298103, 0.01299376, 0.01298695]),
 'std_score_time': array([3.25947250e-04, 3.77436428e-04, 3.03362271e-04, 3.75344858e-04,
        4.86528536e-04, 3.83040787e-04, 3.86740087e-04, 3.75382996e-04,


In [None]:
###############################
## ATTEMPT AT FULL FUNCTIONS ##
###############################

# TODO clean these up
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold, KFold, cross_val_predict, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from imblearn.over_sampling import SMOTE, SMOTEN
from imblearn.pipeline import Pipeline as ImbPipeline


import data_prep

RANDOM_STATE = 123
# TODO don't be a perfectionist!

# Non Sklearn-adjustable hyperparams go in here!!!
def trial(model:str, drop_attributes:list, sampling:str, bins:int, cat_binning_threshold:int): # TODO pick h-params to adjust
    """
    Builds a sampling -> one-hot -> model pipeline and grid-searches the model's hyperparameters.

    :param model: one of 'knn', 'dt', 'nbayes', 'svm' (only 'knn' and 'dt' are implemented)
    :param drop_attributes: currently unused (TODO)
    :param sampling: one of 'none', 'over', 'under', 'both' (only 'none' and 'over' are implemented)
    :param bins: forwarded to data_prep.get_prepped_dataset; None is not supported yet
    :param cat_binning_threshold: forwarded to data_prep.get_prepped_dataset
    :returns the GridSearchCV after fitting it on the training frame
    :raises AssertionError for an unknown model / sampling name
    :raises NotImplementedError for the options that are still TODO
    """
    # Imported here because this cell's import block does not bring it in
    from sklearn.tree import DecisionTreeClassifier

    # Input validation (done first so a bad argument fails before the dataset is loaded)
    assert sampling in ['none', 'over', 'under', 'both']
    assert model in ['knn', 'dt', 'nbayes', 'svm']

    df_train, df_test = data_prep.get_prepped_dataset(bins=bins, cat_binning_threshold=cat_binning_threshold)

    ## Prepare Dataset

    # Split into labels/not labels
    df_X = df_train.drop('Class', axis=1)
    df_y = df_train['Class']

    X = df_X.to_numpy()
    y = df_y.to_numpy()

    # Encode labels
    y = LabelEncoder().fit_transform(y)

    ## Make data pipeline
    # Each entry must be a single (name, estimator) tuple
    pipe_parts = []

    if bins is None:
        # Things not binned: would need Smote-Mixed sampling and a mixed encoder
        raise NotImplementedError() # TODO

    # Sampling strategy (everything is binned from here on)
    if sampling == 'over':
        pipe_parts.append(('smoten', SMOTEN(random_state=RANDOM_STATE, sampling_strategy='not majority')))
    elif sampling in ('under', 'both'):
        raise NotImplementedError() # TODO
    # sampling == 'none' adds no resampling step

    # One-hot encoding
    # NOTE(review): the Pipeline 3 cell passes explicit `categories` to the encoder;
    #               consider doing the same here - confirm
    pipe_parts.append(('onehot', OneHotEncoder()))

    # Model itself, together with its search grid.
    # Grid keys must be '<step name>__<estimator parameter>'.
    if model == 'knn':
        pipe_parts.append(('model_knn', KNeighborsClassifier()))
        param_grid = {
            # All odd values for n_neighbors from 3 to 21
            'model_knn__n_neighbors': list(range(3, 22, 2))
        }
    elif model == 'dt':
        pipe_parts.append(('model_dt', DecisionTreeClassifier()))
        param_grid = {
            'model_dt__criterion': ['gini', 'entropy'],
            'model_dt__min_samples_split': list(range(2, 41)),
            'model_dt__min_samples_leaf': list(range(1, 21))
        }
    else: # 'nbayes' and 'svm'
        raise NotImplementedError # TODO

    # Make final pipeline
    pipe = ImbPipeline(pipe_parts)

    ## Grid search over the model hyperparameters
    grid_search = GridSearchCV(pipe, param_grid, verbose=1)
    grid_search.fit(X, y)

    return grid_search

In [12]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Five string labels arranged as a single column (one row per label)
arr = np.array(['A', 'B', 'C', 'A', 'C']).reshape(-1, 1)

# Encode the column and keep the encoder's output as-is
encoder = OneHotEncoder()
arr_oh = encoder.fit_transform(arr)

# Show the encoded values, then the container type they come back in
for shown in (arr_oh, type(arr_oh)):
    print(shown)

  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 0)	1.0
  (4, 2)	1.0
<class 'scipy.sparse.csr.csr_matrix'>


In [52]:
l = ['A', 'B', 'C', 'D']
l2 = ['B', 'C']

# True when every element of l2 also occurs in l
set(l2).issubset(l)

True