In [None]:
import math # math
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt # plotting
import plotly.express as px # ROC curve plot

import seaborn as sns # fancy plotting
import scipy # dendrogram

In [None]:
# Two-colour palette reused by the plots: one shade of 'rocket', one shade of 'seismic'
rocket_shade = sns.color_palette('rocket')[4]
seismic_shade = sns.color_palette('seismic')[0]
colors = [rocket_shade, seismic_shade]
# Global seaborn theme for every figure
sns.set_theme(style="darkgrid", palette='magma')

## Data Cleaning <a name="data_cleaning"></a>

In [None]:
# Data preparation
# Load the raw table
kepler = pd.read_csv('C:/Users/marco/Desktop/Scuola/Data spaces/Tesina/Data/cumulative_2.csv')

# Variables that are not of interest for the analysis
uninteresting = [
    'kepid', 'kepoi_name', 'koi_vet_date', 'kepler_name', 'koi_vet_stat',
    'koi_disposition', 'koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss',
    'koi_fpflag_co', 'koi_fpflag_ec', 'koi_disp_prov', 'koi_comment',
    'koi_model_dof', 'koi_model_chisq', 'koi_limbdark_mod', 'koi_parm_prov',
    'koi_tce_delivname', 'koi_bin_oedp_sig', 'koi_quarters', 'koi_trans_mod',
    'koi_datalink_dvr', 'koi_datalink_dvs', 'koi_sparprov', 'koi_fittype',
    'koi_max_sngle_ev', 'koi_max_mult_ev', 'koi_num_transits', 'koi_model_snr',
]
data = kepler.drop(columns=uninteresting)

# Also remove the predictors from Pixel-Based KOI Vetting Statistics / flux-weighted
# analysis, recognised by a substring of the column name
pixel_tags = ('fwm', 'dicco', 'dikco')
pixel_cols = [name for name in data.columns if any(tag in name for tag in pixel_tags)]
data = data.drop(columns=pixel_cols)

# The response variable is categorical
data['koi_pdisposition'] = data['koi_pdisposition'].astype('category')

# Simplify the names: strip the first 'koi_' from each column, then rename the response
data.columns = [name.replace('koi_', '', 1) for name in data.columns]
data = data.rename(columns={'pdisposition': 'disposition'})

In [None]:
# function to see the max, min and number of NaNs for each column
def description(data):
    """Summarise every column of ``data`` except the first one.

    Returns a DataFrame indexed by column name with the columns
    'minimum', 'maximum', 'mean' and 'NaNs' (count of missing values).
    """
    summarised = data.iloc[:, 1:].columns
    stats = {
        'minimum': [data[name].min() for name in summarised],
        'maximum': [data[name].max() for name in summarised],
        'mean': [data[name].mean() for name in summarised],
        'NaNs': [data[name].isnull().sum() for name in summarised],
    }
    return pd.DataFrame(stats, index=summarised)

In [None]:
# Min, max, mean and NaN count of every column after the first
description(data)

In [None]:
# Variables that contain only NaN
all_nan_cols = ['longp', 'ingress', 'sage']
data = data.drop(columns=all_nan_cols)
# eccen, ldm_coeff4 and ldm_coeff3 only ever take the value 0:
# they carry no information, so they are removed as well
constant_cols = ['eccen', 'ldm_coeff4', 'ldm_coeff3']
data = data.drop(columns=constant_cols)

In [None]:
# time0bk is time0 minus a constant offset, so a single variable describes both
sns.scatterplot(
    data=data,
    x='time0bk',
    y='time0',
    color=colors[1],
    s=14,
)

In [None]:
# Keep 'time0bk' (its values are smaller) and discard 'time0'
data = data.drop(columns=['time0'])

## FEATURE TRANSFORMATION

In [None]:
# Skewed distributions of the predictors
cols = data.iloc[:, 1:].columns
fig, axes = plt.subplots(1, 3, figsize=(16, 4))
# One histogram per panel for predictors 1, 2 and 3
for idx, panel in enumerate(axes, start=1):
    sns.histplot(data=data, x=cols[idx], color=colors[1], ax=panel)

In [None]:
# Inverse hyperbolic sine function, drawn on 200 points of [-14, 14]
ihs_x = np.linspace(-14, 14, 200)
sns.lineplot(x=ihs_x, y=np.arcsinh(ihs_x), color=colors[1])

In [None]:
# We use the inverse hyperbolic sine function, defined as
# IHS(x) = log(x + sqrt(x^2+1)), to transform the pathological predictors
ihs_columns = [
    'period', 'time0bk', 'impact', 'duration', 'depth', 'ror',
    'srho', 'prad', 'insol', 'dor', 'srad',
]
# Each column is overwritten with its transformed values
for name in ihs_columns:
    data[name] = np.arcsinh(data[name])

In [None]:
# Histograms of predictors 1, 2 and 3 again (same `cols` as the earlier histogram cell)
fig, axes = plt.subplots(1, 3, figsize=(16, 4))
for idx, panel in enumerate(axes, start=1):
    sns.histplot(data=data, x=cols[idx], color=colors[1], ax=panel)

## FEATURE SELECTION

In [None]:
# Function for corrplot masking
def mask_corr(corr):
    """Boolean mask that is True on and above the main diagonal of ``corr``.

    The mask has the same shape as ``corr``; passing it to a heatmap hides the
    diagonal and the upper triangle.
    """
    all_true = np.ones_like(corr, dtype=bool)
    return np.triu(all_true)

In [None]:
# Plot of the absolute correlation values between predictors.
# Only the numeric columns are correlated: 'disposition' is categorical, and recent
# pandas versions raise on non-numeric columns instead of silently skipping them.
# The correlation matrix is computed once and reused for the mask.
corr_abs = data.select_dtypes(include='number').corr().abs()
plt.figure(figsize=(14, 14))
sns.heatmap(corr_abs, cmap='magma', center=0.4, mask=mask_corr(corr_abs))

In [None]:
# Let us look at the magnitude ('mag') variables
mag_corr = data.iloc[:, 26:33].corr()
sns.heatmap(
    abs(mag_corr),
    center=0.4,
    cmap='rocket',
    mask=mask_corr(mag_corr),
)

In [None]:
# Pairwise scatter plots (KDE on the diagonal) of the columns at positions 25..32
sns.pairplot(
    data=data.iloc[:, 25:33],
    diag_kind="kde",
    plot_kws={"s": 14, "color": colors[1]},
    diag_kws={'color': colors[1]},
)

In [None]:
# Two groups with high in-group correlation can be distinguished
# (https://en.wikipedia.org/wiki/Photometric_system):
# - group1: 'kepmag', 'gmag', 'rmag', 'imag' -> visible and near infrared band
# - group2: 'zmag', 'jmag', 'hmag', 'kmag' -> infrared band
# Only one variable per group is kept (the one with the least missing data):
# 'kepmag' and 'hmag'
redundant_mags = ['gmag', 'rmag', 'imag', 'zmag', 'jmag', 'kmag']
data = data.drop(columns=redundant_mags)


In [None]:
# Summary table again, now that the redundant magnitude columns have been dropped
description(data)

## MISSING DATA

In [None]:
# Keep a reference to the frame as it is before the rows with missing values are dropped.
# This is a plain name binding, not a copy; the following cell rebinds `data` to a new frame.
raw_data=data

In [None]:
# We now drop the rows that contain missing data and report how many are left
data = data.dropna()
rows_before = raw_data.shape[0]
rows_after = data.shape[0]
kept_pct = round(((rows_after/rows_before)*100),2)
print(f"The dataset had {rows_before} rows. It now has {rows_after} rows.\n({rows_before-rows_after} rows were dropped, leaving us with {kept_pct}% of the original number of entries.)")

## FEATURE HIERARCHICAL CLUSTERING

In [None]:
import scipy
from scipy.cluster.hierarchy import dendrogram, linkage


In [None]:
# Single-linkage clustering of the predictors (every column after the first):
# the frame is transposed so that each predictor is one observation, and the
# distance between two predictors is the 'correlation' metric
predictors = data.iloc[:, 1:]
linked = linkage(predictors.transpose(), 'single', metric='correlation')

plt.figure(figsize=(12, 7))
dendrogram(
    linked,
    orientation='right',
    labels=predictors.columns,
    distance_sort='descending',
    show_leaf_counts=True,
)
plt.show()

## OUTLIERS

In [None]:
# Summary table of the frame after the rows with missing values were dropped
description(data)

In [None]:
# There are still outliers present
sns.scatterplot(
    data=data,
    x='ror',
    y='srho',
    color=colors[1],
    s=14,
)

In [None]:
# 'sma' against 'impact' before the outlier removal
sns.scatterplot(
    data=data,
    x='sma',
    y='impact',
    color=colors[1],
    s=14,
)


In [None]:
def mahalanobis(x, data):
    """Squared Mahalanobis distance of each row of ``x`` from the centre of ``data``.

    Parameters
    ----------
    x : DataFrame or 2-D array-like, shape (n, p)
        Observations to score.
    data : DataFrame or 2-D array-like, shape (m, p)
        Reference sample; its column means and covariance matrix define the metric.

    Returns
    -------
    numpy.ndarray, shape (n,)
        ``(x_i - mu)^T S^-1 (x_i - mu)`` for every row ``x_i`` (no square root is taken).

    Raises
    ------
    numpy.linalg.LinAlgError
        If the covariance matrix of ``data`` is singular.
    """
    sample = np.asarray(data, dtype=float)
    # Explicit column-wise mean: np.mean(DataFrame) can reduce over the whole frame
    # (axis=None) in recent pandas, which would centre every column on one scalar.
    x_mu = np.asarray(x, dtype=float) - sample.mean(axis=0)
    cov = np.atleast_2d(np.cov(sample, rowvar=False))
    inv_covmat = np.linalg.inv(cov)
    left = x_mu @ inv_covmat
    # Row-wise quadratic form. This equals the diagonal of left @ x_mu.T without
    # materialising the n x n matrix.
    return np.einsum('ij,ij->i', left, x_mu)

In [None]:
# Mahalanobis score of every observation, computed on the predictors only
predictor_block = data.iloc[:, 1:]
mah = mahalanobis(x=predictor_block, data=predictor_block)
# Thresholds at which the survival curve is evaluated
t = np.linspace(0, 3.5, 100)

In [None]:
def survival(t):
    """For each threshold in ``t``, the fraction of rows with log10(mah) above it.

    Reads the notebook-level ``data`` and ``mah``; returns a list with one
    fraction per threshold, in the order of ``t``.
    """
    total = data.shape[0]
    return [data[np.log10(mah) > threshold].shape[0] / total for threshold in t]

In [None]:
# Plot the log10 of the Mahalanobis distance together with the fraction of data
# that would be lost by removing the outliers above a given threshold
fig, ax = plt.subplots(figsize=(8, 4))
# The distances are passed as `x` explicitly so the box is horizontal and shares
# the threshold axis with the survival curve, whatever the seaborn version.
# `mah` from the previous cell is reused instead of recomputing the distances.
sns.boxenplot(x=np.log10(mah), color=colors[0], saturation=0.7, ax=ax)
ax2 = ax.twinx()
ax2.plot(t, survival(t), color=colors[1])
# Hide the grid of the twin axis. grid(b=None) merely toggled it, and the `b`
# keyword was renamed to `visible` and later removed in Matplotlib.
ax2.grid(False)
plt.show()

In [None]:
# By cutting outliers with log10 distance greater than 2 we lose a fraction of the data equal to
survival([2])

In [None]:
# Keep only the observations whose log10 Mahalanobis score is below 2
log_dist = np.log10(mahalanobis(x=data.iloc[:, 1:], data=data.iloc[:, 1:]))
data = data[log_dist < 2]

In [None]:
# 'sma' against 'impact' once the outliers have been removed
sns.scatterplot(
    data=data,
    x='sma',
    y='impact',
    color=colors[1],
    s=14,
)


In [None]:
# 'ror' against 'srho' once the outliers have been removed
sns.scatterplot(
    data=data,
    x='ror',
    y='srho',
    color=colors[1],
    s=14,
)


## EXPLORATORY DATA ANALYSIS

In [None]:
# The classes are balanced
sns.histplot(
    data=data,
    x='disposition',
    hue='disposition',
    palette=colors,
)

In [None]:
cols = data.iloc[:, 1:].columns
fig, axes = plt.subplots(2, 5, figsize=(20, 8))
# Class-conditional densities of predictors 0..9, filling the grid row by row
for idx in range(10):
    row, column = divmod(idx, 5)
    sns.kdeplot(
        data=data, x=cols[idx], hue='disposition', palette=colors,
        ax=axes[row, column], fill=True, legend=False,
    )

In [None]:
fig, axes = plt.subplots(2, 5, figsize=(20, 8))
# Top row: class-conditional densities of predictors 10..14
for idx, panel in enumerate(axes[0], start=10):
    sns.kdeplot(data=data, x=cols[idx], hue='disposition', palette=colors,
                ax=panel, fill=True, legend=False)
# Bottom row: predictors 15 and 16 as dodged histograms, 17..19 as densities
for idx, panel in enumerate(axes[1], start=15):
    if idx > 16:
        sns.kdeplot(data=data, x=cols[idx], hue='disposition', palette=colors,
                    ax=panel, fill=True, legend=False)
    else:
        sns.histplot(data=data, x=cols[idx], hue='disposition', palette=colors,
                     ax=panel, legend=False, multiple='dodge')

In [None]:
fig, axes = plt.subplots(1, 5, figsize=(20, 4))
# Class-conditional densities of predictors 20..24
for idx, panel in enumerate(axes, start=20):
    sns.kdeplot(data=data, x=cols[idx], hue='disposition', palette=colors,
                ax=panel, fill=True, legend=False)

In [None]:
# The sky patch observed by Kepler
plt.figure(figsize=(10, 10))
sns.scatterplot(
    data=data,
    x='ra',
    y='dec',
    hue='disposition',
    palette=colors,
    s=14,
)

In [None]:
# Export clean dataset
# to_csv is called with its default options; the import cell further down discards
# the first column of the file it reads back.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data.to_csv('C:/Users/marco/Desktop/Scuola/Data spaces/Tesina/Data/cumulative_clean.csv')

# CLASSIFICATION

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc, f1_score, recall_score, precision_score, plot_confusion_matrix, plot_roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline


In [None]:
# Import clean dataset
data = pd.read_csv('C:/Users/marco/Desktop/Scuola/Data spaces/Tesina/Data/cumulative_clean.csv')
# Discard the first column of the file, then restore the categorical response
data = data.iloc[:, 1:]
data['disposition'] = data['disposition'].astype('category')

# The response is the first remaining column, the predictors are all the others
Y = data.iloc[:, 0]
X = data.iloc[:, 1:]

# 1 -> CANDIDATE, 0 -> FALSE POSITIVE
# (the encoder's own codes are flipped with 1 - code)
le = preprocessing.LabelEncoder().fit(['CANDIDATE', 'FALSE POSITIVE'])
Y = 1 - le.transform(Y)

# Train-Test data split
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y,
    test_size=0.3,
    random_state=0,
    stratify=Y,
)
# Scaling for PCA
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)

In [None]:
# function for grouping grid search results
def grid_search_groupby(results: pd.DataFrame, param_1: str, param_2: str) -> pd.DataFrame:
    """Average the CV score of a grid search for every pair of values of two parameters.

    Parameters
    ----------
    results : pandas.DataFrame
        Table with a 'params' column (one dict of parameter values per row) and a
        'mean_test_score' column, e.g. ``pd.DataFrame(search.cv_results_)``.
    param_1 : str
        Key of the 'params' dicts used for the rows of the output.
    param_2 : str
        Key of the 'params' dicts used for the columns of the output.

    Returns
    -------
    pandas.DataFrame
        Mean of 'mean_test_score' indexed by ``param_1`` values, with one column
        per ``param_2`` value.
    """
    # isinstance also accepts DataFrame subclasses, unlike an exact type comparison
    assert isinstance(results, pd.DataFrame), 'results should be a pandas.core.frame.DataFrame'
    assert isinstance(param_1, str), 'param_1 should be a string'
    assert isinstance(param_2, str), 'param_2 should be a string'

    # One column per parameter. The index of `results` is reused so that the
    # concatenation below stays row-aligned even when that index is not 0..n-1.
    params_df = pd.DataFrame(list(results['params']), index=results.index)
    result_shrt_df = pd.concat([results['mean_test_score'], params_df], axis=1)
    result_groupby = result_shrt_df.groupby([param_1, param_2])['mean_test_score'].mean().unstack()
    return result_groupby

# Function for plotting grid search results
def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2, s, log = False):
    """Plot the mean CV score against one grid parameter, one curve per value of another.

    Parameters
    ----------
    cv_results : mapping
        Must provide 'mean_test_score' and 'params' (one dict per candidate),
        as in ``GridSearchCV.cv_results_``.
    grid_param_1 : sequence
        Values of the parameter drawn on the x axis.
    grid_param_2 : sequence
        Values of the parameter that selects the curve.
    name_param_1 : str
        Label of the x axis.
    name_param_2 : str
        Key of the 'params' dicts used to select the scores of each curve; also
        used in the legend.
    s : float
        Marker size.
    log : bool, default False
        Use a logarithmic x axis when truthy.
    """
    scores_mean = cv_results['mean_test_score']
    candidates = cv_results['params']

    # One row of scores per value of the second parameter.
    # NOTE(review): the scores selected for each curve are assumed to come in the
    # order of grid_param_1 and to number exactly len(grid_param_1) — this holds for
    # a full two-parameter grid; confirm before using it with other searches.
    mean = np.zeros((len(grid_param_2), len(grid_param_1)))
    for row, val in enumerate(grid_param_2):
        mean[row, :] = [
            score for score, candidate in zip(scores_mean, candidates)
            if candidate[name_param_2] == val
        ]

    # Param 1 is the x axis, Param 2 is represented as a different curve (colour line)
    _, ax = plt.subplots(1, 1, figsize=(10, 6))
    for row, val in enumerate(grid_param_2):
        ax.plot(grid_param_1, mean[row, :], '-o', label=name_param_2 + ': ' + str(val), markersize=s)
    if log:
        ax.set_xscale('log')
    ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
    ax.set_xlabel(name_param_1, fontsize=16)
    ax.set_ylabel('CV Average Score', fontsize=16)
    ax.legend(loc="best", fontsize=15)

## PCA 

In [None]:
from sklearn.decomposition import PCA
import plotly.graph_objects as go


In [None]:
# Explained variance curve
pca = PCA(n_components=26)
pca.fit(train_X_scaled)

# Cumulative fraction of the variance explained by the first k components
variance = pca.explained_variance_ratio_
var = np.cumsum(variance)

plt.plot(np.arange(1, 27), var)
plt.title('PCA Analysis')
plt.xlabel('# of principal components')
plt.ylabel('Fraction of variance explained')
plt.show()

In [None]:
# Project the scaled training predictors onto the fitted principal components
train_X_pca = pca.transform(train_X_scaled)

In [None]:
# Plot of the first two principal components
plt.figure(figsize=(7, 7))
pc1 = train_X_pca[:, 0]
pc2 = train_X_pca[:, 1]
# The hue is 1 - train_Y, so the palette order is applied to the flipped labels
ax = sns.scatterplot(x=pc1, y=pc2, hue=1 - train_Y, palette=colors, legend=False, s=14)
ax.set(xlabel='PC 1', ylabel='PC 2')

In [None]:
# Plot of the first three principal components
# Palette colours scaled by 256, computed once and picked per training label
rgb_label_1 = tuple(256*x for x in colors[0])
rgb_other = tuple(256*x for x in colors[1])
marker_colors = [rgb_label_1 if p == 1 else rgb_other for p in train_Y]

scatter_3d = go.Scatter3d(
    x=train_X_pca[:, 0],
    y=train_X_pca[:, 1],
    z=train_X_pca[:, 2],
    mode='markers',
    marker=dict(size=4, color=marker_colors, opacity=0.7),
)
fig = go.Figure(data=[scatter_3d])

fig.update_layout(
    autosize=False,
    width=1600,
    height=600,
    scene=dict(
        xaxis_title="PC 1",
        yaxis_title="PC 2",
        zaxis_title="PC 3",
    ),
)

fig.show()


## KNN 

In [None]:
from sklearn import neighbors

In [None]:
# We start with the simplest model, KNN, and tune its hyperparameters by grid search
# with cross validation:
# - the number of neighbors considered;
# - the norm used {Euclidean, Manhattan, Chebyshev}.
grid = {
    'n_neighbors': range(1,90),
    'metric': ['euclidean','manhattan','chebyshev'],
}
knn = neighbors.KNeighborsClassifier()
# Same classifier behind a standardisation step, used for the scaled search
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', knn),
])

In [None]:
# Grid search without scaling (4-fold CV on the bare classifier)
knn_grid = GridSearchCV(
    estimator=knn,
    param_grid=grid,
    cv=4,
    verbose=2,
    n_jobs=-1,
)
knn_grid.fit(train_X,train_Y)

In [None]:
# Grid search result without scaling: CV score against n_neighbors, one curve per metric
plot_grid_search(knn_grid.cv_results_, grid['n_neighbors'], grid['metric'], 'n_neighbors', 'metric', s = 5)

In [None]:
# Same search on the scaler + KNN pipeline; the keys carry the 'knn__' step prefix
grid = {
    'knn__n_neighbors': range(1,90),
    'knn__metric': ['euclidean','manhattan','chebyshev'],
}
knn_grid_scaled = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    cv=4,
    verbose=2,
    n_jobs=-1,
)
knn_grid_scaled.fit(train_X,train_Y)

In [None]:
# Grid search result with scaling: CV score against n_neighbors, one curve per metric
plot_grid_search(knn_grid_scaled.cv_results_, grid['knn__n_neighbors'], grid['knn__metric'], 'knn__n_neighbors', 'knn__metric', s = 5)

In [None]:
# The scaled one performs much better
best_knn = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', neighbors.KNeighborsClassifier(metric='manhattan', n_neighbors=20)),
])

In [None]:
# The cross validated score is
# (mean over the folds of cross_val_score's default splitting and scoring)
cross_val_score(best_knn,train_X,train_Y).mean()

## DECISION TREE CLASSIFIER

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

In [None]:
# Pruning strength and split criterion to search over
grid = {
    'ccp_alpha': np.arange(0,0.03,0.001),
    'criterion': ['gini','entropy'],
}
tree = DecisionTreeClassifier(random_state=1)

In [None]:
# 10-fold grid search over the pruning grid
tree_grid = GridSearchCV(
    estimator=tree,
    param_grid=grid,
    cv=10,
    verbose=2,
    n_jobs=-1,
)
tree_grid.fit(train_X,train_Y)

In [None]:
# Grid search results: CV score against ccp_alpha, one curve per criterion
alpha_values = np.arange(0,0.03,0.001)
criteria = ['gini','entropy']
plot_grid_search(tree_grid.cv_results_, alpha_values, criteria, 'ccp_alpha', 'criterion', s=5)

In [None]:
# A more interpretable tree can be obtained with a stronger pruning parameter,
# without losing too much accuracy
interp_tree = DecisionTreeClassifier(
    criterion='entropy',
    ccp_alpha=0.005,
    random_state=1,
)
interp_tree.fit(train_X,train_Y)

In [None]:
# Plot the tree
plt.figure(num=None, figsize=(20, 6), dpi=80)
# The feature names are passed as a plain list: some scikit-learn releases
# validate this argument and reject a pandas Index.
plot_tree(interp_tree, filled=True, proportion=True, feature_names=list(X.columns))

In [None]:
# The cross validated score is
# (cross_val_score fits clones, so the fitted interp_tree itself is left untouched)
cross_val_score(interp_tree, train_X, train_Y).mean()

## RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Search over the number of trees only; the criterion is fixed to entropy
rf = RandomForestClassifier()
rf_param_grid = {'n_estimators': range(1,300,10), 'criterion':['entropy']}
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    cv=4,
    verbose=2,
    n_jobs=-1,
)
rf_grid.fit(train_X,train_Y)

In [None]:
# Grid search results: CV score against the number of trees
plt.rcParams["axes.grid"] = True
plot_grid_search(
    rf_grid.cv_results_,
    range(1,300,10),
    ['entropy'],
    'n_estimators',
    'criterion',
    s=5,
)

In [None]:
# 100 trees seem to be enough to reach convergence
rf2 = RandomForestClassifier(n_estimators=100, criterion='entropy')
# Depth / minimum-split search for the entropy forest
rf2_param_grid = {'max_depth':np.arange(2,20,2),'min_samples_split':np.arange(2,40,2)}
rf2_grid = GridSearchCV(
    estimator=rf2,
    param_grid=rf2_param_grid,
    cv=4,
    verbose=2,
    n_jobs=-1,
)
rf2_grid.fit(train_X,train_Y)

In [None]:
# Same depth / minimum-split search, this time for the Gini forest.
# The search must run on rf3: passing rf2 here would simply repeat the entropy search.
rf3 = RandomForestClassifier(n_estimators=100, criterion = 'gini')
rf3_grid = GridSearchCV(estimator = rf3, param_grid = {'max_depth':np.arange(2,20,2),'min_samples_split':np.arange(2,40,2)}, cv = 4, verbose=2, n_jobs = -1 )
rf3_grid.fit(train_X,train_Y)

In [None]:
# cv_results_ of both forest searches as DataFrames
results_entropy, results_gini = (
    pd.DataFrame(search.cv_results_) for search in (rf2_grid, rf3_grid)
)

In [None]:
# Mean CV score for every (max_depth, min_samples_split) pair, per criterion
results_grouped_entropy, results_grouped_gini = (
    grid_search_groupby(frame, 'max_depth', 'min_samples_split')
    for frame in (results_entropy, results_gini)
)

In [None]:
# Side-by-side heatmaps of the two searches, sharing the y axis
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(12, 6))
ax1.set_title('Entropy')
ax2.set_title('Gini')
heatmap_style = dict(cmap='magma', center=0.83)
sns.heatmap(results_grouped_entropy, ax=ax1, **heatmap_style)
sns.heatmap(results_grouped_gini, ax=ax2, **heatmap_style)


In [None]:
# Best CV score Entropy (best parameter combination and its mean CV score)
[rf2_grid.best_params_, rf2_grid.best_score_]

In [None]:
# Best CV score Gini (best parameter combination and its mean CV score)
[rf3_grid.best_params_, rf3_grid.best_score_]

In [None]:
# Forest used for the final comparison.
# NOTE(review): the hyperparameters are hard-coded rather than read from
# rf2_grid.best_params_ — confirm they match the search result; no random_state is set.
best_rf = RandomForestClassifier(n_estimators = 100, max_depth = 18, min_samples_split = 6, criterion = 'entropy')

In [None]:
# The cross validated score is
# (mean over the folds of cross_val_score's default splitting and scoring)
cross_val_score(best_rf,train_X,train_Y).mean()

In [None]:
# Plot of feature importance
# cross_val_score only fits clones of the estimator, so best_rf itself has to be
# fitted here before feature_importances_ can be read.
best_rf.fit(train_X, train_Y)
importance = pd.DataFrame({'name': X.columns, 'importance': best_rf.feature_importances_})
plt.figure(figsize=(10, 5))
sns.barplot(data=importance, x='name', y='importance', palette='magma')
plt.xticks(rotation=70)
plt.tight_layout()

## SUPPORT VECTOR MACHINE

In [None]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

### Linear

In [None]:
# Regularisation strength and penalty type to search over
grid = {'linear__C' : [0.001,0.01,0.1,1,10,100,1000,10000], 'linear__penalty':['l1','l2']}
scaler = StandardScaler()
# dual=False selects the primal formulation: it supports both penalties of the grid,
# whereas the dual one cannot fit penalty='l1' and would make half the grid fail.
linear = LinearSVC(max_iter=10000, dual=False)
pipe = Pipeline(steps=[('scaler', scaler), ('linear', linear)])

In [None]:
# 4-fold grid search on the scaler + linear SVM pipeline
linear_grid = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    cv=4,
    verbose=2,
    n_jobs=-1,
)
linear_grid.fit(train_X,train_Y)

In [None]:
# Grid search results: CV score against C (log axis), one curve per penalty
plt.rcParams["axes.grid"] = True
plot_grid_search(
    linear_grid.cv_results_,
    grid['linear__C'],
    grid['linear__penalty'],
    'linear__C',
    'linear__penalty',
    s=5,
    log=True,
)

In [None]:
# Linear SVM used for the final comparison: standardisation followed by an l2-penalised LinearSVC.
# NOTE(review): C is hard-coded rather than read from linear_grid.best_params_ — confirm it matches.
best_linear = Pipeline(steps=[('scaler', StandardScaler()), ('linear', LinearSVC(max_iter = 10000, penalty = 'l2', C = 1e-2))])

In [None]:
# The cross validated score is
# (mean over the folds of cross_val_score's default splitting and scoring)
cross_val_score(best_linear, train_X, train_Y).mean()

### Gaussian RBF

In [None]:
# Kernel width (gamma) and regularisation strength (C) to search over
grid = {
    'rbf__gamma': [1e-10,1e-9,1e-8,1e-7,1e-6,1e-5,1e-4, 1e-3,1e-2,0.1,10,100],
    'rbf__C': [0.0001,0.001,0.01,0.1,1,10,100,1e3,1e4,1e5],
}
scaler = StandardScaler()
rbf = SVC(kernel='rbf')
pipe = Pipeline(steps=[('scaler', scaler), ('rbf', rbf)])
# 4-fold grid search on the scaler + RBF SVM pipeline
rbf_grid = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    cv=4,
    verbose=10,
    n_jobs=-1,
)
rbf_grid.fit(train_X,train_Y)

In [None]:
# Mean CV score for every (gamma, C) pair, shown as a heatmap
results = pd.DataFrame(rbf_grid.cv_results_)
results_grouped = grid_search_groupby(results, 'rbf__gamma', 'rbf__C')
sns.heatmap(
    results_grouped,
    cmap='magma',
    center=0.4,
)

In [None]:
# Parameter combination with the best mean CV score in the RBF search
rbf_grid.best_params_

In [None]:
# RBF SVM used for the final comparison: standardisation followed by an SVC.
# NOTE(review): C and gamma are hard-coded rather than read from rbf_grid.best_params_ — confirm they match.
best_rbf = Pipeline(steps=[('scaler', StandardScaler()), ('rbf', SVC(kernel = 'rbf', C = 1, gamma = 0.1))])

In [None]:
# The cross validated score is
# (mean over the folds of cross_val_score's default splitting and scoring)
cross_val_score(best_rbf, train_X, train_Y).mean()

# CONCLUSIONS

## Results on the test set

In [None]:
# Fit the best estimators on the whole training set
for model in (best_rbf, best_linear, best_knn, interp_tree):
    model.fit(train_X, train_Y)
best_rf.fit(train_X,train_Y)

In [None]:
# ROC CURVES of every fitted estimator on the test set, on a single axis
fig, ax = plt.subplots(1, 1, figsize=(7, 7))
roc_models = [
    (best_knn, 'KNN'),
    (interp_tree, 'Tree classifier'),
    (best_rf, 'Random Forest'),
    (best_rbf, 'RBF SVM'),
    (best_linear, 'Linear SVM'),
]
for model, label in roc_models:
    plot_roc_curve(model, test_X, test_Y, ax=ax, name=label)
plt.show()

In [None]:
# Same ROC curves with the axis limits set to x in [0.02, 0.35] and y in [0.4, 1]
fig, ax = plt.subplots(1, 1, figsize=(7, 7))
ax.set_xlim(0.02, 0.35)
ax.set_ylim(0.4, 1)
zoomed_models = [
    (best_knn, 'KNN'),
    (interp_tree, 'Tree classifier'),
    (best_rf, 'Random Forest'),
    (best_rbf, 'RBF SVM'),
    (best_linear, 'Linear SVM'),
]
for model, label in zoomed_models:
    plot_roc_curve(model, test_X, test_Y, ax=ax, name=label)
plt.show()

In [None]:
# Confusion matrices on the test set: KNN, tree and forest
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(14, 4))
panels = ((ax1, best_knn, 'KNN'), (ax2, interp_tree, 'Tree'), (ax3, best_rf, 'Forest'))
for panel, model, title in panels:
    plot_confusion_matrix(model, test_X, test_Y, ax=panel, cmap='magma', colorbar=False)
    panel.set_title(title)
    # Switch the grid off explicitly. grid(b=None) only toggled it, and the `b`
    # keyword was renamed to `visible` and later removed in Matplotlib.
    panel.grid(False)

In [None]:
# Confusion matrices on the test set: linear and RBF SVM
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(9, 4))
panels = ((ax1, best_linear, 'Linear SVM'), (ax2, best_rbf, 'RBF SVM'))
for panel, model, title in panels:
    plot_confusion_matrix(model, test_X, test_Y, ax=panel, cmap='magma', colorbar=False)
    panel.set_title(title)
    # Switch the grid off explicitly. grid(b=None) only toggled it, and the `b`
    # keyword was renamed to `visible` and later removed in Matplotlib.
    panel.grid(False)

In [None]:
# Cross validated scores: mean precision and recall of every estimator on the training set
estimators = [best_knn,interp_tree, best_rf,best_rbf,best_linear]
names = ['KNN','Tree','Random Forest','RBF SVM','Linear SVM']
# The rows are collected first and the frame is built once, instead of growing
# an empty DataFrame row by row through .loc
cv_records = []
for name, estimator in zip(names, estimators):
    for label, scoring in (('Precision', 'precision'), ('Recall', 'recall')):
        score = cross_val_score(estimator, train_X, train_Y, scoring=scoring).mean()
        cv_records.append({'Name': name, 'Score': score, 'Type': label})
df = pd.DataFrame(cv_records, columns=['Name', 'Score', 'Type'])
sns.catplot(data = df, kind  = 'bar', x ='Name', y = 'Score', hue = 'Type', alpha = 0.8)

In [None]:
# Test set scores: precision and recall of the estimators fitted on the training set,
# evaluated on the held-out test set. Re-running cross_val_score on the training
# data here would only reproduce the cross validated scores.
test_records = []
for name, estimator in zip(names, estimators):
    predicted = estimator.predict(test_X)
    test_records.append({'Name': name, 'Score': precision_score(test_Y, predicted), 'Type': 'Precision'})
    test_records.append({'Name': name, 'Score': recall_score(test_Y, predicted), 'Type': 'Recall'})
df_test = pd.DataFrame(test_records, columns=['Name', 'Score', 'Type'])
sns.catplot(data = df_test, kind  = 'bar', x ='Name', y = 'Score', hue = 'Type', alpha = 0.8)