In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn import metrics
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import accuracy_score, recall_score, precision_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier


from sklearn.pipeline import Pipeline
import time
import warnings
# Load Spaceship Titanic dataset
from google.colab import drive
drive.mount('/content/drive')


BASE_DIR = '/content/drive/MyDrive/Colab Notebooks/spaceship-titanic'
train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/spaceship-titanic/train.csv")
test = pd.read_csv( "/content/drive/MyDrive/Colab Notebooks/spaceship-titanic/test.csv")
sample_submission = pd.read_csv( "/content/drive/MyDrive/Colab Notebooks/spaceship-titanic/sample_submission.csv")
#print("Train Data:",train.shape)
#print("Test Data:",test.shape)
#print("Sample Data:",sample_submission.shape)

miss = pd.DataFrame({'num_miss_train':train.isna().sum(),
                     'pct_miss_train':np.round(train.isna().sum()/len(train)*100,2),
                    'num_miss_test':test.isna().sum(),
                    'pct_miss_test':np.round(test.isna().sum()/len(train)*100,2)})
miss.T


Mounted at /content/drive


Unnamed: 0,Age,Cabin,CryoSleep,Destination,FoodCourt,HomePlanet,Name,PassengerId,RoomService,ShoppingMall,Spa,Transported,VIP,VRDeck
num_miss_train,179.0,199.0,217.0,182.0,183.0,201.0,200.0,0.0,181.0,208.0,183.0,0.0,203.0,188.0
pct_miss_train,2.06,2.29,2.5,2.09,2.11,2.31,2.3,0.0,2.08,2.39,2.11,0.0,2.34,2.16
num_miss_test,91.0,100.0,93.0,92.0,106.0,87.0,94.0,0.0,82.0,98.0,101.0,,93.0,80.0
pct_miss_test,1.05,1.15,1.07,1.06,1.22,1.0,1.08,0.0,0.94,1.13,1.16,,1.07,0.92


In [2]:
def split_PassengerId(df):
    p_group = []
    for idx, row in df.iterrows():
        p_id = str(row['PassengerId'])
        if "_" in p_id:
            p_group.append(int(p_id.split("_")[0]))
        else:
            p_group.append(0)
    df['PassengerGroup'] = p_group
    df['SizeOfGroup'] = df.groupby('PassengerGroup')['PassengerId'].transform('nunique')
    return df
train = split_PassengerId(train)
test = split_PassengerId(test)

# function that creates table of value count in train and test set
def val_count(col_name):
    df = pd.DataFrame([train[col_name].value_counts(), test[col_name].value_counts()],
                      index=[[col_name, col_name],["train","test"]])
    return df

val_count('SizeOfGroup')

Unnamed: 0,Unnamed: 1,1,2,3,4,5,7,6,8
SizeOfGroup,train,4805,1682,1020,412,265,231,174,104
SizeOfGroup,test,2340,908,486,204,115,98,78,48


In [3]:
# function that creates InGroup derived from SizeOfGroup
def create_InGroup(df):
    in_group = []
    for idx, row in df.iterrows():
        ig = row['SizeOfGroup']
        if ig==1:
            in_group.append("False")
        else:
            in_group.append("True")
    df['InGroup'] = in_group
    return df
train = create_InGroup(train)
test = create_InGroup(test)

val_count('InGroup')

Unnamed: 0,Unnamed: 1,False,True
InGroup,train,4805,3888
InGroup,test,2340,1937


In [4]:
# Calculate the number of HomePlanet for each PassengerGroup in Train data
hp_pg_train = pd.crosstab(train['PassengerGroup'], train['HomePlanet'])

cnt = []
for i in range(len(hp_pg_train)):
    cnt.append(np.sum(hp_pg_train.iloc[i,:] != 0))
hp_pg_train['count_planet'] = cnt
hp_pg_train.head(20).T

PassengerGroup,1,2,3,4,5,6,7,8,9,10,11,12,14,15,16,17,20,22,24,25
HomePlanet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Earth,0,1,0,1,1,2,1,0,0,1,1,1,0,1,0,2,6,0,0,1
Europa,1,0,2,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,1,0
Mars,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0
count_planet,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [5]:
# Calculate the number of HomePlanet for each PassengerGroup in Test data
hp_pg_test = pd.crosstab(test['PassengerGroup'], test['HomePlanet'])

cnt = []
for i in range(len(hp_pg_test)):
    cnt.append(np.sum(hp_pg_test.iloc[i,:] != 0))
hp_pg_test['count_planet'] = cnt
hp_pg_test.head(20).T

PassengerGroup,13,18,19,21,23,27,29,32,33,37,40,42,46,47,48,49,54,55,57,59
HomePlanet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Earth,1,1,0,0,1,1,0,0,1,1,0,0,3,0,1,1,0,1,0,1
Europa,0,0,1,1,0,0,1,2,0,0,2,0,0,3,0,0,3,0,1,0
Mars,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
count_planet,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [6]:
print("Before :", train['HomePlanet'].isna().sum(), test['HomePlanet'].isna().sum())

# Index with missing HomePlanet and not missing PassengerGroup
hp_pg_train_idx = train[train['HomePlanet'].isna()][(train[train['HomePlanet'].isna()]['PassengerGroup']).isin(hp_pg_train.index)].index
hp_pg_test_idx = test[test['HomePlanet'].isna()][(test[test['HomePlanet'].isna()]['PassengerGroup']).isin(hp_pg_test.index)].index

# Fill corresponding missing values in HomePlanet
train.loc[hp_pg_train_idx, 'HomePlanet'] = train.iloc[hp_pg_train_idx,:]['PassengerGroup'].map(lambda x:hp_pg_train.idxmax(axis=1)[x])
test.loc[hp_pg_test_idx, 'HomePlanet'] = test.iloc[hp_pg_test_idx,:]['PassengerGroup'].map(lambda x:hp_pg_test.idxmax(axis=1)[x])

print("After :", train['HomePlanet'].isna().sum(), test['HomePlanet'].isna().sum())

Before : 201 87
After : 111 46


In [7]:
# function that creates CabinDeck, CabinNum, CabinSide derived from Cabin
def split_Cabin(df):
    c_deck = []
    c_num = []
    c_side = []
    for idx, row in df.iterrows():
        cabin = str(row['Cabin'])
        if "/" in cabin:
            c_deck.append(cabin.split("/")[0])
            c_num.append(cabin.split("/")[1])
            c_side.append(cabin.split("/")[2])
        else:
            c_deck.append(None)
            c_num.append(-1)
            c_side.append(None)
    df['CabinDeck'] = c_deck
    df['CabinNum'] = c_num
    df['CabinSide'] = c_side
    return df
train = split_Cabin(train)
test = split_Cabin(test)

train['CabinNum'] = pd.to_numeric(train['CabinNum'], errors='ignore')
test['CabinNum'] = pd.to_numeric(test['CabinNum'], errors='ignore')

val_count('CabinDeck')



Unnamed: 0,Unnamed: 1,F,G,E,B,C,D,A,T
CabinDeck,train,2794,2559,876,779,747,478,256,5
CabinDeck,test,1445,1222,447,362,355,242,98,6


In [8]:
# Calculate the number of HomePlanet for each CabinDeck in Train data
hp_cd_train = pd.crosstab(train['CabinDeck'], train['HomePlanet'])

cnt = []
for i in range(len(hp_cd_train)):
    cnt.append(np.sum(hp_cd_train.iloc[i,:] != 0))
hp_cd_train['count_planet'] = cnt
hp_cd_train.T

CabinDeck,A,B,C,D,E,F,G,T
HomePlanet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Earth,0,0,0,0,400,1620,2518,0
Europa,255,778,743,189,130,0,0,4
Mars,0,0,0,285,335,1130,0,0
count_planet,1,1,1,2,3,2,1,1


In [9]:
# Calculate the number of HomePlanet for each CabinDeck in Test data
hp_cd_test = pd.crosstab(test['CabinDeck'], test['HomePlanet'])

cnt = []
for i in range(len(hp_cd_test)):
    cnt.append(np.sum(hp_cd_test.iloc[i,:] != 0))
hp_cd_test['count_planet'] = cnt
hp_cd_test.T

CabinDeck,A,B,C,D,E,F,G,T
HomePlanet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Earth,0,0,0,0,189,817,1213,0
Europa,97,359,353,112,69,0,0,6
Mars,0,0,0,125,180,612,0,0
count_planet,1,1,1,2,3,2,1,1


In [10]:
print("Before :", train['HomePlanet'].isna().sum(), test['HomePlanet'].isna().sum())

# Missing HomePlanet and Deck A, B, C, T
train.loc[(train['HomePlanet'].isna()) & (train['CabinDeck'].isin(['A','B','C','T'])), 'HomePlanet']='Europa'
test.loc[(test['HomePlanet'].isna()) & (test['CabinDeck'].isin(['A','B','C','T'])), 'HomePlanet']='Europa'

# Missing HomePlanet and Deck G
train.loc[(train['HomePlanet'].isna()) & (train['CabinDeck']=='G'), 'HomePlanet']='Europa'
test.loc[(test['HomePlanet'].isna()) & (test['CabinDeck']=='G'), 'HomePlanet']='Earth'

print("After :", train['HomePlanet'].isna().sum(), test['HomePlanet'].isna().sum())

Before : 111 46
After : 63 31


In [11]:
val_count('CabinNum')


Unnamed: 0,Unnamed: 1,-1,82,86,19,56,176,97,230,269,65,...,1803,1623,1785,1806,1835,1836,1833,1811,1813,1827
CabinNum,train,199.0,28.0,22.0,22.0,21.0,21.0,21.0,20.0,19.0,19.0,...,,,,,,,,,,
CabinNum,test,100.0,6.0,3.0,2.0,7.0,2.0,3.0,7.0,1.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
val_count('CabinSide')

Unnamed: 0,Unnamed: 1,S,P
CabinSide,train,4288,4206
CabinSide,test,2093,2084


In [13]:
# Calculate the number of Cabin Features for each PassengerGroup in Train data
cd_pg_train = pd.crosstab(train[train['SizeOfGroup']>1]['PassengerGroup'],
                          train[train['SizeOfGroup']>1]['CabinDeck'])
cn_pg_train = pd.crosstab(train[train['SizeOfGroup']>1]['PassengerGroup'],
                          train[train['SizeOfGroup']>1]['CabinNum'])
cs_pg_train = pd.crosstab(train[train['SizeOfGroup']>1]['PassengerGroup'],
                          train[train['SizeOfGroup']>1]['CabinSide'])

cnt1 = []
cnt2 = []
cnt3 = []
for i in range(len(cd_pg_train)):
    cnt1.append(np.sum(cd_pg_train.iloc[i,:] != 0))
    cnt2.append(np.sum(cn_pg_train.iloc[i,:] != 0))
    cnt3.append(np.sum(cs_pg_train.iloc[i,:] != 0))

cd_pg_train['count_deck'] = cnt1
cn_pg_train['count_num'] = cnt2
cs_pg_train['count_side'] = cnt3

print(cd_pg_train.head(10).T, "\n")
print(cn_pg_train.head(10).T, "\n")
print(cs_pg_train.head(10).T)

PassengerGroup  3   6   8   17  20  31  44  45  56  64
CabinDeck                                             
A                2   0   0   0   0   0   0   0   3   0
B                0   0   3   0   0   0   0   0   0   0
C                0   0   0   0   0   0   0   0   0   0
D                0   0   0   0   0   0   0   0   0   0
E                0   0   0   0   6   0   0   0   0   1
F                0   1   0   1   0   3   0   2   0   1
G                0   1   0   1   0   0   3   0   0   0
count_deck       1   2   1   2   1   1   1   1   1   2 

PassengerGroup  3   6   8   17  20  31  44  45  56  64
CabinNum                                              
-1               0   0   0   0   0   0   0   0   0   0
0                2   1   0   1   6   0   0   0   0   0
1                0   0   3   0   0   0   0   0   3   0
2                0   1   0   0   0   0   0   0   0   0
3                0   0   0   0   0   0   3   0   0   1
...             ..  ..  ..  ..  ..  ..  ..  ..  ..  ..
1871    

In [14]:
# Calculate the number of Cabin Features for each PassengerGroup in Test data
cd_pg_test = pd.crosstab(test[test['SizeOfGroup']>1]['PassengerGroup'],
                          test[test['SizeOfGroup']>1]['CabinDeck'])
cn_pg_test = pd.crosstab(test[test['SizeOfGroup']>1]['PassengerGroup'],
                          test[test['SizeOfGroup']>1]['CabinNum'])
cn_pg_test = cn_pg_test.drop(9223,axis=0)
cs_pg_test = pd.crosstab(test[test['SizeOfGroup']>1]['PassengerGroup'],
                          test[test['SizeOfGroup']>1]['CabinSide'])

cnt1 = []
cnt2 = []
cnt3 = []
for i in range(len(cd_pg_test)):
    cnt1.append(np.sum(cd_pg_test.iloc[i,:] != 0))
    cnt2.append(np.sum(cn_pg_test.iloc[i,:] != 0))
    cnt3.append(np.sum(cs_pg_test.iloc[i,:] != 0))

cd_pg_test['count_deck'] = cnt1
cn_pg_test['count_num'] = cnt2
cs_pg_test['count_side'] = cnt3

print(cd_pg_test.head(10).T, "\n")
print(cn_pg_test.head(10).T, "\n")
print(cs_pg_test.head(10).T)

PassengerGroup  32   40   46   47   54   94   100  125  142  150
CabinDeck                                                       
A                 0    0    0    0    0    0    0    0    0    0
B                 0    0    0    2    3    0    0    2    0    2
C                 0    0    0    0    0    0    0    0    0    0
D                 2    2    0    0    0    0    0    0    0    0
E                 0    0    0    0    0    0    0    0    0    0
F                 0    0    2    0    0    1    0    0    3    0
G                 0    0    1    0    0    1    2    0    0    0
T                 0    0    0    0    0    0    0    0    0    0
count_deck        1    1    2    1    1    2    1    1    1    1 

PassengerGroup  32   40   46   47   54   94   100  125  142  150
CabinNum                                                        
-1                0    0    0    1    0    0    0    0    0    0
0                 2    0    0    2    0    0    0    0    0    0
2                 0    

In [15]:
print("Before :", train['CabinSide'].isna().sum(), test['CabinSide'].isna().sum())

# Index with missing HomePlanet and not missing PassengerGroup
cs_pg_train_idx = train[train['CabinSide'].isna()][(train[train['CabinSide'].isna()]['PassengerGroup']).isin(cs_pg_train.index)].index
cs_pg_test_idx = test[test['CabinSide'].isna()][(test[test['CabinSide'].isna()]['PassengerGroup']).isin(cs_pg_test.index)].index

# Fill corresponding missing values in HomePlanet
train.loc[cs_pg_train_idx, 'CabinSide'] = train.iloc[cs_pg_train_idx,:]['PassengerGroup'].map(lambda x:cs_pg_train.idxmax(axis=1)[x])
test.loc[cs_pg_test_idx, 'CabinSide'] = test.iloc[cs_pg_test_idx,:]['PassengerGroup'].map(lambda x:cs_pg_test.idxmax(axis=1)[x])

print("After :", train['CabinSide'].isna().sum(), test['CabinSide'].isna().sum())

Before : 199 100
After : 99 63


In [16]:
# function that create CabinGroup derived from CabinNum
def create_CabinGroup(df):
    c_group = []
    for idx, row in df.iterrows():
        cn = row['CabinNum']
        if cn==-1:
            c_group.append(None)
        elif cn < 300:
            c_group.append("group1")
        elif cn < 600:
            c_group.append("group2")
        elif cn < 900:
            c_group.append("group3")
        elif cn < 1200:
            c_group.append("group4")
        elif cn < 1500:
            c_group.append("group5")
        elif cn < 1800:
            c_group.append("group6")
        else:
            c_group.append("group7")
    df['CabinGroup'] = c_group
    return df
train = create_CabinGroup(train)
test = create_CabinGroup(test)

val_count('CabinGroup')

Unnamed: 0,Unnamed: 1,group1,group2,group4,group3,group5,group6,group7
CabinGroup,train,3560,1488,992,976,938,460,80
CabinGroup,test,1698,759,479,481,486,236,38


In [17]:
train.InGroup.value_counts()

False    4805
True     3888
Name: InGroup, dtype: int64

In [18]:
train.CabinDeck .value_counts()

F    2794
G    2559
E     876
B     779
C     747
D     478
A     256
T       5
Name: CabinDeck, dtype: int64

In [19]:
train['HomePlanet'] = train['HomePlanet'].replace(['Europa', 'Earth','Mars'],[0, 1,2])
train['CryoSleep'] = train['CryoSleep'].replace([False,True,'NaN'],[0, 1,2])
train['Destination'] = train['Destination'].replace(['TRAPPIST-1e','55 Cancri e','PSO J318.5-22'],[0,1,2])
train['VIP'] = train['VIP'].replace([False,True], [0,1])
train['InGroup'] = train['InGroup'].replace(["False","True"], [0,1])
train['CabinDeck'] = train['CabinDeck'].replace(['A','G','E','F','B','C','T','D'], [0,1,2,3,4,5,6,7])
train['CabinGroup'] = train['CabinGroup'].replace(['group1','group2','group3','group4','group5','group6','group7'], [0,1,2,3,4,5,6])
train['CabinSide'] = train['CabinSide'].replace(['S','P'], [0,1])
train['Transported'] = train['Transported'].replace([False,True], [0,1])
train.drop(['Cabin'], axis=1,inplace = True)
train.drop(['Name'], axis=1,inplace = True)

print(train)


     PassengerId  HomePlanet  CryoSleep  Destination   Age  VIP  RoomService  \
0        0001_01         0.0        0.0          0.0  39.0  0.0          0.0   
1        0002_01         1.0        0.0          0.0  24.0  0.0        109.0   
2        0003_01         0.0        0.0          0.0  58.0  1.0         43.0   
3        0003_02         0.0        0.0          0.0  33.0  0.0          0.0   
4        0004_01         1.0        0.0          0.0  16.0  0.0        303.0   
...          ...         ...        ...          ...   ...  ...          ...   
8688     9276_01         0.0        0.0          1.0  41.0  1.0          0.0   
8689     9278_01         1.0        1.0          2.0  18.0  0.0          0.0   
8690     9279_01         1.0        0.0          0.0  26.0  0.0          0.0   
8691     9280_01         0.0        0.0          1.0  32.0  0.0          0.0   
8692     9280_02         0.0        0.0          0.0  44.0  0.0        126.0   

      FoodCourt  ShoppingMall     Spa  

In [20]:
train.drop(['PassengerId'], axis=1,inplace = True)

In [21]:
train.drop(['CabinDeck'], axis=1,inplace = True)

In [22]:
print(train)

      HomePlanet  CryoSleep  Destination   Age  VIP  RoomService  FoodCourt  \
0            0.0        0.0          0.0  39.0  0.0          0.0        0.0   
1            1.0        0.0          0.0  24.0  0.0        109.0        9.0   
2            0.0        0.0          0.0  58.0  1.0         43.0     3576.0   
3            0.0        0.0          0.0  33.0  0.0          0.0     1283.0   
4            1.0        0.0          0.0  16.0  0.0        303.0       70.0   
...          ...        ...          ...   ...  ...          ...        ...   
8688         0.0        0.0          1.0  41.0  1.0          0.0     6819.0   
8689         1.0        1.0          2.0  18.0  0.0          0.0        0.0   
8690         1.0        0.0          0.0  26.0  0.0          0.0        0.0   
8691         0.0        0.0          1.0  32.0  0.0          0.0     1049.0   
8692         0.0        0.0          0.0  44.0  0.0        126.0     4688.0   

      ShoppingMall     Spa  VRDeck  Transported  Pa

In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   HomePlanet      8630 non-null   float64
 1   CryoSleep       8476 non-null   float64
 2   Destination     8511 non-null   float64
 3   Age             8514 non-null   float64
 4   VIP             8490 non-null   float64
 5   RoomService     8512 non-null   float64
 6   FoodCourt       8510 non-null   float64
 7   ShoppingMall    8485 non-null   float64
 8   Spa             8510 non-null   float64
 9   VRDeck          8505 non-null   float64
 10  Transported     8693 non-null   int64  
 11  PassengerGroup  8693 non-null   int64  
 12  SizeOfGroup     8693 non-null   int64  
 13  InGroup         8693 non-null   int64  
 14  CabinNum        8693 non-null   int64  
 15  CabinSide       8594 non-null   float64
 16  CabinGroup      8494 non-null   float64
dtypes: float64(12), int64(5)
memory u

In [24]:
from sklearn.impute import SimpleImputer
Dtx1= train.iloc[:,:].values
print(Dtx1)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
#Fitting imputer object to the independent varibles x.
imputer = imputer.fit(train)
#Replacing missing data with the calculated mean value
Dtx1= imputer.transform(train)
print(Dtx1)

[[0.00e+00 0.00e+00 0.00e+00 ... 0.00e+00 1.00e+00 0.00e+00]
 [1.00e+00 0.00e+00 0.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 [0.00e+00 0.00e+00 0.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 ...
 [1.00e+00 0.00e+00 0.00e+00 ... 1.50e+03 0.00e+00 5.00e+00]
 [0.00e+00 0.00e+00 1.00e+00 ... 6.08e+02 0.00e+00 2.00e+00]
 [0.00e+00 0.00e+00 0.00e+00 ... 6.08e+02 0.00e+00 2.00e+00]]
[[0.00e+00 0.00e+00 0.00e+00 ... 0.00e+00 1.00e+00 0.00e+00]
 [1.00e+00 0.00e+00 0.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 [0.00e+00 0.00e+00 0.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 ...
 [1.00e+00 0.00e+00 0.00e+00 ... 1.50e+03 0.00e+00 5.00e+00]
 [0.00e+00 0.00e+00 1.00e+00 ... 6.08e+02 0.00e+00 2.00e+00]
 [0.00e+00 0.00e+00 0.00e+00 ... 6.08e+02 0.00e+00 2.00e+00]]


In [25]:
train.dropna(inplace=True)
X=train.iloc[:, :-1].values
y=train.iloc[:, -1].values
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42)
print("Train data\t:", X_train.shape, y_train.shape)
print("Validation data\t:", X_val.shape, y_val.shape)

Train data	: (4818, 16) (4818,)
Validation data	: (2065, 16) (2065,)


In [26]:
for i in range(1,50):
  classifierknn= KNeighborsClassifier(n_neighbors=i, metric='euclidean')
  classifierknn.fit(X_train, y_train)
  y_pred= classifierknn.predict(X_val)
  print( metrics.accuracy_score(y_val, y_pred))

0.8799031476997579
0.876997578692494
0.8760290556900726
0.8731234866828087
0.8774818401937046
0.87409200968523
0.8721549636803874
0.8687651331719128
0.864406779661017
0.8697336561743342
0.8605326876513317
0.8663438256658595
0.8639225181598063
0.8639225181598063
0.8600484261501211
0.8600484261501211
0.8581113801452784
0.8571428571428571
0.8639225181598063
0.8590799031476998
0.8581113801452784
0.8595641646489104
0.8571428571428571
0.8566585956416465
0.8537530266343826
0.8556900726392251
0.8493946731234867
0.8484261501210654
0.85181598062954
0.8527845036319612
0.8503631961259079
0.8455205811138015
0.848910411622276
0.8435835351089589
0.8469733656174334
0.8411622276029056
0.8435835351089589
0.8416464891041162
0.8397094430992736
0.837772397094431
0.8411622276029056
0.837772397094431
0.836319612590799
0.8382566585956417
0.8353510895883777
0.8338983050847457
0.8353510895883777
0.8319612590799031
0.8368038740920097


In [27]:
classifierknn= KNeighborsClassifier(n_neighbors=2, metric='euclidean' )
classifierknn.fit(X_train, y_train)

In [28]:
y_pred= classifierknn.predict(X_val)

In [29]:
print(metrics.accuracy_score(y_val, y_pred))

0.876997578692494
