In [211]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

pd.set_option('max_columns',1000)
pd.set_option('max_row',300)

In [212]:
trn_data = pd.read_csv('Data/train.csv')
tst_data = pd.read_csv('Data/test.csv')
trn_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [213]:
def fill_nans_by_age(df, age_limit = 13):
    df['RoomService'] = np.where(df['Age'] < age_limit, 0, df['RoomService'])
    df['FoodCourt'] = np.where(df['Age'] < age_limit, 0, df['FoodCourt'])
    df['ShoppingMall'] = np.where(df['Age'] < age_limit, 0, df['ShoppingMall'])
    df['Spa'] = np.where(df['Age'] < age_limit, 0, df['Spa'])
    df['VRDeck'] = np.where(df['Age'] < age_limit, 0, df['VRDeck'])
    return df

trn_data =  fill_nans_by_age(trn_data)
tst_data =  fill_nans_by_age(tst_data)
trn_data.to_csv('tmp.csv', index=False)

In [214]:
def fill_nans_by_cryo(df, age_limit = 13):
    df['RoomService'] = np.where(df['CryoSleep'] == True, 0, df['RoomService'])
    df['FoodCourt'] = np.where(df['CryoSleep'] == True, 0, df['FoodCourt'])
    df['ShoppingMall'] = np.where(df['CryoSleep'] == True, 0, df['ShoppingMall'])
    df['Spa'] = np.where(df['CryoSleep'] == True, 0, df['Spa'])
    df['VRDeck'] = np.where(df['CryoSleep'] == True, 0, df['VRDeck'])
    return df

trn_data =  fill_nans_by_cryo(trn_data)
tst_data =  fill_nans_by_cryo(tst_data)
trn_data.to_csv('tmp.csv', index=False)


In [215]:
def age_groups(df, age_limit = 13):
    df['AgeGroup'] = np.where(df['Age'] < age_limit, 0, 1)
    return df

trn_data =  age_groups(trn_data)
tst_data =  age_groups(tst_data)    
trn_data.to_csv('tmp.csv', index=False)

In [216]:
def fill_missing(df):
    '''
    Fill NaNs values or with mean or most commond value...
    
    '''
    
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    
    numeric_tmp = df.select_dtypes(include = numerics)
    categ_tmp = df.select_dtypes(exclude = numerics)

    for col in numeric_tmp.columns:
        print(col)
        df[col] = df[col].fillna(value = df[col].mean())
        
    for col in categ_tmp.columns:
        print(col)
        df[col] = df[col].fillna(value = df[col].mode()[0])
        
    print('...')
    
    return df


trn_data =  fill_missing(trn_data)
tst_data =  fill_missing(tst_data)
trn_data.to_csv('tmp.csv', index=False)

Age
RoomService
FoodCourt
ShoppingMall
Spa
VRDeck
AgeGroup
PassengerId
HomePlanet
CryoSleep
Cabin
Destination
VIP
Name
Transported
...
Age
RoomService
FoodCourt
ShoppingMall
Spa
VRDeck
AgeGroup
PassengerId
HomePlanet
CryoSleep
Cabin
Destination
VIP
Name
...


In [217]:
def total_billed(df):
    '''
    Calculates total amount billed in the trip to the passenger... 
    Args:
    Returns:
    
    '''
    
    df['TotalBilled'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']
    return df

trn_data = total_billed(trn_data)
tst_data = total_billed(tst_data)
trn_data.to_csv('tmp.csv', index=False)

In [218]:
def fill_nans_by_totalspend(df):
    df['CryoSleep'] = np.where(df['TotalBilled'] > 0, False, df['CryoSleep'])
    return df

trn_data =  fill_nans_by_totalspend(trn_data)
tst_data =  fill_nans_by_totalspend(tst_data)
trn_data.to_csv('tmp2.csv', index=False)

In [219]:
trn_data.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
AgeGroup        0
TotalBilled     0
dtype: int64

In [220]:
def cabin_separation(df):
    '''
    Split the Cabin name into Deck, Number and Side
    
    '''
    
    df['CabinDeck'] = df['Cabin'].str.split('/', expand=True)[0]
    df['CabinNum']  = df['Cabin'].str.split('/', expand=True)[1]
    df['CabinSide'] = df['Cabin'].str.split('/', expand=True)[2]
    
    df.drop(columns = ['Cabin'], inplace = True)
    return df

trn_data = cabin_separation(trn_data)
tst_data = cabin_separation(tst_data)

In [221]:
def name_ext(df):
    '''
    Split the Name of the passenger into First and Family...
    
    '''
    
    df['FirstName'] = df['Name'].str.split(' ', expand=True)[0]
    df['FamilyName'] = df['Name'].str.split(' ', expand=True)[1]
    df.drop(columns = ['Name'], inplace = True)
    return df

trn_data = name_ext(trn_data)
tst_data = name_ext(tst_data)

In [222]:
def age_groups(df, age_limit = 13):
    df['AgeGroup'] = np.where(df['Age'] < age_limit, 0, 1)
    return df

trn_data =  age_groups(trn_data)
tst_data =  age_groups(tst_data)

In [223]:
def extract_group(df):
    '''
    '''
    df['TravelGroup'] =  df['PassengerId'].str.split('_', expand = True)[0]
    return df

trn_data = extract_group(trn_data)
tst_data = extract_group(tst_data)

In [224]:
Weltiest_Deck = trn_data.groupby('CabinDeck').aggregate({'TotalBilled': 'sum', 'Transported': 'sum', 'CryoSleep': 'sum', 'PassengerId': 'size'}).reset_index()
Weltiest_Deck['AvgSpended'] = Weltiest_Deck['TotalBilled'] / Weltiest_Deck['PassengerId']
Weltiest_Deck['TransportedPercentage'] = Weltiest_Deck['Transported'] / Weltiest_Deck['PassengerId']
Weltiest_Deck['CryoSleepPercentage'] = Weltiest_Deck['CryoSleep'] / Weltiest_Deck['PassengerId']
Weltiest_Deck = Weltiest_Deck.sort_values('AvgSpended', ascending = False)
Weltiest_Deck.head(10)

Unnamed: 0,CabinDeck,TotalBilled,Transported,CryoSleep,PassengerId,AvgSpended,TransportedPercentage,CryoSleepPercentage
7,T,23753.61,1,0,5,4750.721118,0.2,0.0
2,C,3080282.0,508,294,747,4123.537444,0.680054,0.393574
0,A,873463.5,127,68,256,3411.966602,0.496094,0.265625
1,B,2286778.0,572,419,779,2935.529927,0.734275,0.537869
3,D,1105197.0,207,104,478,2312.128525,0.433054,0.217573
4,E,1155242.0,313,168,876,1318.769366,0.357306,0.191781
5,F,2816220.0,1229,550,2794,1007.952687,0.439871,0.19685
6,G,1341641.0,1421,1434,2758,486.454405,0.515228,0.519942


In [225]:
trn_data = trn_data.merge(Weltiest_Deck[['CabinDeck', 'TransportedPercentage', 'AvgSpended']], how = 'left', on = ['CabinDeck'])
tst_data = tst_data.merge(Weltiest_Deck[['CabinDeck', 'TransportedPercentage', 'AvgSpended']], how = 'left', on = ['CabinDeck'])

In [226]:
trn_relatives = trn_data.groupby('FamilyName')['PassengerId'].count().reset_index()
tst_relatives = tst_data.groupby('FamilyName')['PassengerId'].count().reset_index()
trn_relatives.head(10)

Unnamed: 0,FamilyName,PassengerId
0,Acobson,4
1,Acobsond,3
2,Adavisons,9
3,Adkinson,3
4,Admingried,4
5,Ageurante,1
6,Aginge,1
7,Ailled,2
8,Aillyber,3
9,Aiming,1


In [227]:
trn_relatives = trn_relatives.rename(columns = {'PassengerId': 'NumRelatives'})
tst_relatives = tst_relatives.rename(columns = {'PassengerId': 'NumRelatives'})

trn_data = trn_data.merge(trn_relatives[['FamilyName', 'NumRelatives']], how = 'left', on = ['FamilyName'])
tst_data = tst_data.merge(tst_relatives[['FamilyName', 'NumRelatives']], how = 'left', on = ['FamilyName'])

In [228]:
trn_relatives = trn_data.groupby('TravelGroup')['PassengerId'].count().reset_index()
tst_relatives = tst_data.groupby('TravelGroup')['PassengerId'].count().reset_index()

trn_relatives = trn_relatives.rename(columns = {'PassengerId': 'GroupSize'})
tst_relatives = tst_relatives.rename(columns = {'PassengerId': 'GroupSize'})

trn_data = trn_data.merge(trn_relatives[['TravelGroup', 'GroupSize']], how = 'left', on = ['TravelGroup'])
tst_data = tst_data.merge(tst_relatives[['TravelGroup', 'GroupSize']], how = 'left', on = ['TravelGroup'])

In [229]:
# A list of the original variables from the dataset
numerical_features = [
                      'Age', 
                      'RoomService', 
                      'FoodCourt', 
                      'ShoppingMall', 
                      'Spa', 
                      'VRDeck', 
                      'TotalBilled'
                     ]

categorical_features = [
                        #'Name',
                        'FirstName',
                        'FamilyName',
                        'CabinNum',
                        'TravelGroup',
                        'AgeGroup'
                       ]


categorical_features_onehot = [
                               'HomePlanet',
                               'CryoSleep',
                               #'Cabin',
                               'CabinDeck',
                               'CabinSide',
                               'Destination',
                               'VIP',
                               #'AgeGroup'
                               ]

target_feature = 'Transported'

In [230]:
from sklearn.preprocessing import LabelEncoder 
def encode_categorical(train_df, test_df, categ_feat = categorical_features):
    '''
    
    '''
    encoder_dict = {}
    
    concat_data = pd.concat([trn_data[categ_feat], tst_data[categ_feat]])
    
    for col in concat_data.columns:
        print('Encoding: ', col, '...')
        encoder = LabelEncoder()
        encoder.fit(concat_data[col])
        encoder_dict[col] = encoder

        train_df[col + '_Enc'] = encoder.transform(train_df[col])
        test_df[col + '_Enc'] = encoder.transform(test_df[col])
    
    train_df = train_df.drop(columns = categ_feat, axis = 1)
    test_df = test_df.drop(columns = categ_feat, axis = 1)

    return train_df, test_df

trn_data, tst_data = encode_categorical(trn_data, tst_data, categorical_features)




Encoding:  FirstName ...
Encoding:  FamilyName ...
Encoding:  CabinNum ...
Encoding:  TravelGroup ...
Encoding:  AgeGroup ...


In [231]:
def one_hot(df, one_hot_categ):
    for col in one_hot_categ:
        tmp = pd.get_dummies(df[col], prefix = col)
        df = pd.concat([df, tmp], axis = 1)
    df = df.drop(columns = one_hot_categ)
    return df


def encode_numerical(df, categ_feat = numerical_features):
    for col in categ_feat:
        df[col] = df[col].astype('float')
    return df


trn_data = encode_numerical(trn_data) 
tst_data = encode_numerical(tst_data)     


trn_data = one_hot(trn_data, categorical_features_onehot) 
tst_data = one_hot(tst_data, categorical_features_onehot)     

trn_data.head()

Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,TotalBilled,TransportedPercentage,AvgSpended,NumRelatives,GroupSize,FirstName_Enc,FamilyName_Enc,CabinNum_Enc,TravelGroup_Enc,AgeGroup_Enc,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinSide_P,CabinSide_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True
0,0001_01,39.0,0.0,0.0,0.0,0.0,0.0,False,0.0,0.734275,2935.529927,1,1,1708,1551,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0
1,0002_01,24.0,109.0,9.0,25.0,549.0,44.0,True,736.0,0.439871,1007.952687,4,1,1493,2289,0,1,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0
2,0003_01,58.0,43.0,3576.0,0.0,6715.0,49.0,False,10383.0,0.496094,3411.966602,6,2,170,2153,0,2,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1
3,0003_02,33.0,0.0,1283.0,371.0,3329.0,193.0,False,5176.0,0.496094,3411.966602,6,2,2424,2153,0,2,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0
4,0004_01,16.0,303.0,70.0,151.0,565.0,2.0,True,1091.0,0.439871,1007.952687,6,1,2815,1924,1,3,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0


In [232]:
# lables, unique = trn_data['FamilyName'].factorize()
# trn_data['FamilyName'] = lables
# lables, unique = tst_data['FamilyName'].factorize()
# tst_data['FamilyName'] = lables

In [233]:
trn_data.head()

Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,TotalBilled,TransportedPercentage,AvgSpended,NumRelatives,GroupSize,FirstName_Enc,FamilyName_Enc,CabinNum_Enc,TravelGroup_Enc,AgeGroup_Enc,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinSide_P,CabinSide_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True
0,0001_01,39.0,0.0,0.0,0.0,0.0,0.0,False,0.0,0.734275,2935.529927,1,1,1708,1551,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0
1,0002_01,24.0,109.0,9.0,25.0,549.0,44.0,True,736.0,0.439871,1007.952687,4,1,1493,2289,0,1,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0
2,0003_01,58.0,43.0,3576.0,0.0,6715.0,49.0,False,10383.0,0.496094,3411.966602,6,2,170,2153,0,2,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1
3,0003_02,33.0,0.0,1283.0,371.0,3329.0,193.0,False,5176.0,0.496094,3411.966602,6,2,2424,2153,0,2,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0
4,0004_01,16.0,303.0,70.0,151.0,565.0,2.0,True,1091.0,0.439871,1007.952687,6,1,2815,1924,1,3,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0


In [234]:
remove = ['PassengerId', 
          'Route', 
          'FirstName_Enc', 
          #'CabinNum_Enc', 
          'Transported',
          'Cabin',
          'TransportedPercentage',
          #'IsKid', 
          #'IsAdult', 
          #'IsOlder'
          #'RoomService',
          #'FoodCourt',
          #'ShoppingMall',
          #'Spa',
          #'VRDeck',
         ]
features = [feat for feat in trn_data.columns if feat not in remove]
features

['Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'TotalBilled',
 'AvgSpended',
 'NumRelatives',
 'GroupSize',
 'FamilyName_Enc',
 'CabinNum_Enc',
 'TravelGroup_Enc',
 'AgeGroup_Enc',
 'HomePlanet_Earth',
 'HomePlanet_Europa',
 'HomePlanet_Mars',
 'CryoSleep_False',
 'CryoSleep_True',
 'CabinDeck_A',
 'CabinDeck_B',
 'CabinDeck_C',
 'CabinDeck_D',
 'CabinDeck_E',
 'CabinDeck_F',
 'CabinDeck_G',
 'CabinDeck_T',
 'CabinSide_P',
 'CabinSide_S',
 'Destination_55 Cancri e',
 'Destination_PSO J318.5-22',
 'Destination_TRAPPIST-1e',
 'VIP_False',
 'VIP_True']

In [235]:
X_array = trn_data[features].to_numpy()
y_array = trn_data['Transported'].to_numpy()
X_test_array = tst_data[features].to_numpy()

In [236]:
from gokinjo import knn_kfold_extract
from gokinjo import knn_extract

KNN_trn_features = knn_kfold_extract(X_array, y_array, k = 2, normalize = 'standard')

In [237]:
knn_cols = ['KNN_K1_01',
            'KNN_K1_02',
            'KNN_K2_01',
            'KNN_K2_02']

KNN_feat = pd.DataFrame(KNN_trn_features, columns = knn_cols)
KNN_feat = pd.DataFrame(KNN_trn_features, columns = knn_cols).set_index(trn_data.index)

trn_data = pd.concat([trn_data, KNN_feat], axis = 1)
trn_data.head()

Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,TotalBilled,TransportedPercentage,AvgSpended,NumRelatives,GroupSize,FirstName_Enc,FamilyName_Enc,CabinNum_Enc,TravelGroup_Enc,AgeGroup_Enc,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinSide_P,CabinSide_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True,KNN_K1_01,KNN_K1_02,KNN_K2_01,KNN_K2_02
0,0001_01,39.0,0.0,0.0,0.0,0.0,0.0,False,0.0,0.734275,2935.529927,1,1,1708,1551,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,1.66576,3.665677,2.571035,5.606894
1,0002_01,24.0,109.0,9.0,25.0,549.0,44.0,True,736.0,0.439871,1007.952687,4,1,1493,2289,0,1,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,1.414737,3.014986,0.861772,2.059607
2,0003_01,58.0,43.0,3576.0,0.0,6715.0,49.0,False,10383.0,0.496094,3411.966602,6,2,170,2153,0,2,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,4.67756,11.331706,7.292525,15.38427
3,0003_02,33.0,0.0,1283.0,371.0,3329.0,193.0,False,5176.0,0.496094,3411.966602,6,2,2424,2153,0,2,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,2.189934,5.685515,3.709621,7.562104
4,0004_01,16.0,303.0,70.0,151.0,565.0,2.0,True,1091.0,0.439871,1007.952687,6,1,2815,1924,1,3,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0.951504,2.716522,0.870298,2.014727


In [238]:
KNN_tst_features = knn_extract(X_array, y_array, X_test_array, k = 2, normalize = 'standard')
KNN_feat = pd.DataFrame(KNN_tst_features, columns = knn_cols).set_index(tst_data.index)

tst_data = pd.concat([tst_data, KNN_feat], axis = 1)
tst_data.head()

Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalBilled,TransportedPercentage,AvgSpended,NumRelatives,GroupSize,FirstName_Enc,FamilyName_Enc,CabinNum_Enc,TravelGroup_Enc,AgeGroup_Enc,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinSide_P,CabinSide_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True,KNN_K1_01,KNN_K1_02,KNN_K2_01,KNN_K2_02
0,0013_01,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.515228,486.454405,4,1,1943,373,1117,12,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0.46971,1.092769,0.737601,1.539672
1,0018_01,19.0,0.0,9.0,0.0,2823.0,0.0,2832.0,0.439871,1007.952687,1,1,1613,1628,1228,17,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,1.567712,3.275803,2.626879,5.377692
2,0019_01,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680054,4123.537444,1,1,2276,2244,0,18,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,1,0,4.656102,9.583475,1.22513,2.617356
3,0021_01,38.0,0.0,6652.0,0.0,181.0,585.0,7418.0,0.680054,4123.537444,1,1,1800,350,1,20,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,4.330659,8.878172,1.623605,4.020382
4,0023_01,20.0,10.0,0.0,635.0,0.0,0.0,645.0,0.439871,1007.952687,3,1,487,1016,1339,22,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0.783441,1.67714,0.889539,1.88547


In [239]:
import scipy
def skew_log_transform(df):
    # using log1p transform to fix skew in data
    df = df.copy()
    numeric_features = df.select_dtypes(np.number).columns
    for column in numeric_features:
        skew = abs(scipy.stats.skew(df[column]))

        if skew >= 0.5:
            df[column] = np.log1p(df[column])
            # l1p = np.log1p(df[column])

            # tskew = abs(scipy.stats.skew(l1p))
            # # print(tskew)
            # # print(l1p)
            # if tskew < skew:
            #   df[column] = l1p
    return df

trn_data = skew_log_transform(trn_data)
tst_data = skew_log_transform(tst_data)
trn_data.head()

Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,TotalBilled,TransportedPercentage,AvgSpended,NumRelatives,GroupSize,FirstName_Enc,FamilyName_Enc,CabinNum_Enc,TravelGroup_Enc,AgeGroup_Enc,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinSide_P,CabinSide_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True,KNN_K1_01,KNN_K1_02,KNN_K2_01,KNN_K2_02
0,0001_01,39.0,0.0,0.0,0.0,0.0,0.0,False,0.0,0.550589,7.984984,0.693147,0.693147,1708,1551,0,0,0.693147,0,0.693359,0.0,0.693359,0.0,0.0,0.693359,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,0.0,0.693359,0.693359,0.0,0.980489,1.540233,1.272856,1.888114
1,0002_01,24.0,4.70048,2.302585,3.258097,6.309918,3.806662,True,6.602588,0.364554,6.916668,1.609438,0.693147,1493,2289,0,1,0.693147,1,0.0,0.0,0.693359,0.0,0.0,0.0,0.0,0.0,0.0,0.693359,0.0,0.0,0,1,0.0,0.0,0.693359,0.693359,0.0,0.88159,1.390034,0.621529,1.118286
2,0003_01,58.0,3.78419,8.18228,0.0,8.812248,3.912023,False,9.248021,0.402858,8.135337,1.94591,1.098612,170,2153,0,2,0.693147,0,0.693359,0.0,0.693359,0.0,0.693359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0.0,0.0,0.693359,0.0,0.693359,1.736522,2.512174,2.115354,2.796322
3,0003_02,33.0,0.0,7.157735,5.918894,8.110728,5.267858,False,8.551981,0.402858,8.135337,1.94591,1.098612,2424,2153,0,2,0.693147,0,0.693359,0.0,0.693359,0.0,0.693359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0.0,0.0,0.693359,0.693359,0.0,1.16,1.899943,1.549607,2.147346
4,0004_01,16.0,5.717028,4.26268,5.023881,6.338594,1.098612,True,6.995766,0.364554,6.916668,1.94591,0.693147,2815,1924,1,3,0.693147,1,0.0,0.0,0.693359,0.0,0.0,0.0,0.0,0.0,0.0,0.693359,0.0,0.0,0,1,0.0,0.0,0.693359,0.693359,0.0,0.668601,1.312788,0.626098,1.103509


In [240]:
features = ['Age',
            'RoomService',
            'FoodCourt',
            'ShoppingMall',
            'Spa',
            'VRDeck',
            'TotalBilled',
            #'AvgSpended',
            #'NumRelatives',
            #'GroupSize',
            'FamilyName_Enc',
            'TravelGroup_Enc',
            'AgeGroup_Enc',
            'HomePlanet_Earth',
            'HomePlanet_Europa',
            'HomePlanet_Mars',
            'CryoSleep_False',
            'CryoSleep_True',
            'CabinDeck_A',
            'CabinDeck_B',
            'CabinDeck_C',
            'CabinDeck_D',
            'CabinDeck_E',
            'CabinDeck_F',
            'CabinDeck_G',
            'CabinDeck_T',
            'CabinSide_P',
            'CabinSide_S',
            'Destination_55 Cancri e',
            'Destination_PSO J318.5-22',
            'Destination_TRAPPIST-1e',
            'VIP_False',
            'VIP_True',
            #'AgeGroup_0',
            #'AgeGroup_1',
            #'AgeGroup_0',
            #'AgeGroup_1',
            'KNN_K1_01',
            'KNN_K1_02'
]

In [None]:
tar

In [241]:
trn_data = trn_data[features]

scaler = StandardScaler()
scaler.fit(trn_data)
trn_data = pd.DataFrame(scaler.transform(trn_data), index = trn_data.index, columns = trn_data.columns)
trn_data.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalBilled,FamilyName_Enc,TravelGroup_Enc,AgeGroup_Enc,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinSide_P,CabinSide_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True,KNN_K1_01,KNN_K1_02
0,0.709437,-0.657152,-0.669077,-0.641178,-0.684572,-0.659135,-1.16042,0.51655,-1.734409,0.319677,-1.111173,1.754795,-0.503664,0.73277,-0.73277,-0.174191,3.187347,-0.30661,-0.241218,-0.334759,-0.688215,-0.68169,-0.02399,1.032865,-1.032865,-0.511013,-0.317487,0.652521,0.153063,-0.153063,-0.005171,0.031081
1,-0.336717,1.057118,0.108547,0.616929,1.572193,0.713251,0.626257,1.586357,-1.734034,0.319677,0.89995,-0.569867,-0.503664,0.73277,-0.73277,-0.174191,-0.313741,-0.30661,-0.241218,-0.334759,1.453035,-0.68169,-0.02399,-0.968181,0.968181,-0.511013,-0.317487,0.652521,0.153063,-0.153063,-0.2353,-0.265669
2,2.034566,0.722946,2.094226,-0.641178,2.46716,0.751236,1.342118,1.389211,-1.73366,0.319677,-1.111173,1.754795,-0.503664,0.73277,-0.73277,5.740821,-0.313741,-0.30661,-0.241218,-0.334759,-0.688215,-0.68169,-0.02399,-0.968181,0.968181,-0.511013,-0.317487,0.652521,-6.533255,6.533255,1.754055,1.951347
3,0.290975,-0.657152,1.748219,1.644391,2.216258,1.240045,1.153768,1.389211,-1.73366,0.319677,-1.111173,1.754795,-0.503664,0.73277,-0.73277,5.740821,-0.313741,-0.30661,-0.241218,-0.334759,-0.688215,-0.68169,-0.02399,-0.968181,0.968181,-0.511013,-0.317487,0.652521,0.153063,-0.153063,0.412537,0.741761
4,-0.894666,1.427853,0.770507,1.298783,1.582449,-0.263061,0.732652,1.057252,-1.733286,0.319677,0.89995,-0.569867,-0.503664,0.73277,-0.73277,-0.174191,-0.313741,-0.30661,-0.241218,-0.334759,1.453035,-0.68169,-0.02399,-0.968181,0.968181,-0.511013,-0.317487,0.652521,0.153063,-0.153063,-0.73091,-0.418283


In [242]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from sklearn.model_selection import KFold, StratifiedKFold 
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

def preprocess(df):
    df_norm = (df - df.min()) / (df.max() - df.min())
    return df_norm




# trn_data = preprocess(trn_data)


# test_dataset = torch.tensor(preprocess(tst_data).to_numpy(), dtype=torch.float)





In [243]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import VotingClassifier




# gbm0 = GradientBoostingClassifier(random_state=10)
# gbm0.fit(X,y)
# y_pred = gbm0.predict(X)
# y_predprob = gbm0.predict_proba(X)[:,1]
# print("Accuracy : %.4g" % accuracy_score(y.values, y_pred))
# print("AUC Score (Train): %f" % roc_auc_score(y, y_predprob))


# pred = gbm0.predict(tst_data) > 0.5

# submission = pd.read_csv('Data/test.csv')
# submission["Transported"] = pred
# submission[["PassengerId","Transported"]].to_csv('submission.csv', index=False)



In [244]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from sklearn.model_selection import KFold, StratifiedKFold 
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

def preprocess(df):
    df_norm = (df - df.min()) / (df.max() - df.min())
    return df_norm


class dataset(Dataset):
    def __init__(self, xdata, ydata):
        self.labels = ydata
        self.inputs = xdata

    def __getitem__(self, index):
        
        return torch.tensor(self.inputs.iloc[index], dtype=torch.float), torch.tensor(self.labels.iloc[index], dtype=torch.float)

    def __len__(self):
        return len(self.inputs)



# df_norm, lables = preprocess(dataTest)
# test_dataset = dataset(df_norm, lables)

test_dataset = torch.tensor(preprocess(tst_data).to_numpy(), dtype=torch.float)


class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(NeuralNetwork, self).__init__()
        self.linear = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        y_pred = self.linear(x)
        return y_pred.squeeze(-1)




learning_rate = 0.2
batch_size = 128
epochs = 20

result = [0]



def train_loop(trn_idx, val_idx):

    train_dataset = trn_data.iloc[trn_idx]
    trtest_dataset = trn_data.iloc[val_idx]

    train_dataset['Transported'] = train_dataset['Transported'].astype('int')
    lables1 = train_dataset.pop('Transported')
    trtest_dataset['Transported'] = trtest_dataset['Transported'].astype('int')
    lables2 = trtest_dataset.pop('Transported')


    df_norm = preprocess(train_dataset)
    train_dataset = dataset(df_norm, lables1)
    
    test_x = torch.tensor(preprocess(trtest_dataset).to_numpy(), dtype=torch.float)
    test_y = torch.tensor(lables2.to_numpy(), dtype=torch.float)


    train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

    model = NeuralNetwork(df_norm.shape[1], 1)

    loss_fn = nn.BCELoss()

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01) #torch.optim.Adagrad(model.parameters(), lr=learning_rate)


    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        size = len(train_dataloader.dataset)
        for batch, (X, y) in enumerate(train_dataloader):
            # Compute prediction and loss
            pred = model(X)
            loss = loss_fn(pred, y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            tmp = sum((model(test_x) > 0.5) == (test_y > 0.5)) / len(test_y)
            if tmp > result[0]:
                result[0] = tmp
                torch.save(model, 'model.pth')


            if batch % 100 == 0:
                
                loss, current = loss.item(), batch * len(X)
                print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]   {result[0]}")


kf = StratifiedKFold(n_splits = 5)

for fold, (trn_idx, val_idx) in enumerate(kf.split(trn_data, trn_data['Transported'])):
    
    # fit_model(X_train, y_train, X_val, y_val)
    
    train_loop(trn_idx, val_idx)
    

model = torch.load('model.pth')

# pred = model(test_dataset) > 0.5

# submission = pd.read_csv('Data/test.csv')
# submission["Transported"] = pred
# submission[["PassengerId","Transported"]].to_csv('submission.csv', index=False)

# pred = model(test_dataset) > 0.5

# submission = pd.read_csv('Data/test.csv')
# submission["Transported"] = pred
# submission[["PassengerId","Transported"]].to_csv('submission.csv', index=False)


AttributeError: Can't get attribute 'NeuralNetwork' on <module '__main__'>

In [None]:
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier



trn_data['Transported'] = trn_data['Transported'].astype('int')
lables = trn_data.pop('Transported')

trn_data['nn'] = model(torch.tensor(preprocess(trn_data).to_numpy(), dtype=torch.float)).detach().numpy()
tst_data['nn'] = model(torch.tensor(preprocess(tst_data).to_numpy(), dtype=torch.float)).detach().numpy()

X, y = trn_data, lables

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
clf4 = GradientBoostingClassifier(random_state=10)
clf5 = CatBoostClassifier()

eclf = VotingClassifier(estimators=[('rf', clf2), ('gbt', clf4), ('cbt', clf5)], voting='soft', weights=[1, 1, 1])
eclf = eclf.fit(X, y)
print(eclf.predict(X))


pred = eclf.predict(tst_data) > 0.5

submission = pd.read_csv('Data/test.csv')
submission["Transported"] = pred
submission[["PassengerId","Transported"]].to_csv('submission.csv', index=False)

