In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from ipynb.fs.defs.EDA import map_space, map_apply, heat_map  ### ipynb.fs.full.EDA would run the notebook, also don't forget pip install ipynb

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

'''
## is Pclass correlated with cabin level on boat ?
heat_map(list_cols_you_dont_want=['Embarked','Pclass', 'Cabin'], tuple_cols_you_want=('fst_cl','scd_cl','thrd_cl','a_cab','b_cab','c_cab','d_cab','e_cab','f_cab','g_cab'), data_df=data_df)
#      ---> it is, particularly first, scd and third class, can use these to predict cabin level
## is the level of your cabin correlated to survival ?
heat_map(list_cols_you_dont_want=['Embarked','Pclass', 'Cabin'], tuple_cols_you_want=('Survived','a_cab','b_cab','c_cab','d_cab','e_cab','f_cab','g_cab'),data_df=data_df)
#      ---> it is, particularly lvl b to f, this comforts the idea of the importance of cabin lvl for survival prediction, to be used
## what is correlated with Age ?
heat_map(list_cols_you_dont_want=['Embarked','Pclass', 'Cabin'], tuple_cols_you_want=('f_cab','mr', 'mrs', 'miss','Parch','SibSp','fst_cl','scd_cl','thrd_cl'
                                                                                      ,'a_cab','b_cab','c_cab','d_cab','e_cab','Age','g_cab'),data_df=data_df)
#     ---> mr, mrs, miss, Parch, SibSp, fst_cl, scd_cl, thrd_ck, a_cab, c_cab, d_cab, e_cab are all correlated, use them to predict age of missing values

## what is correlated with embarkment ?                                                                                  
heat_map(list_cols_you_dont_want=['Embarked','Pclass', 'Cabin'], tuple_cols_you_want=('Age','mr', 'mrs', 'miss','Parch','SibSp','fst_cl','scd_cl','thrd_cl'
                                                                                      ,'a_cab','b_cab','c_cab','d_cab','e_cab','f_cab','g_cab', 'embark_1', 'embark_2', 'embark_3'), data_df=data_df)
#    --->mr,miss,fst_cl,scd_cl, b_cab,thrd_cl,c_cab,d_cab
'''
####################################### DATA PREP. / REG FCT. / PREDICTION / REPLACE NaN IN DF ############################################


def train_to_pred(data_df, tuple_explanatory, str_y):
    
    data = map_space(data_df).apply(map_apply, axis='columns')
    
    train_data = data.drop(data.loc[(data.loc[:,str_y].isnull())].index.values)

    x_dat = train_data.loc[:,tuple_explanatory].values
    y_dat = train_data.loc[:,str_y].values
    
    x_p = data.loc[data.loc[(data.loc[:,str_y].isnull())].index.values]
    x_to_pred = x_p.loc[:,tuple_explanatory].values
    
    
    return x_dat, y_dat, x_to_pred




def reg(data_df, tuple_explanatory, str_y, n_splits):
    
    x_dat, y_dat = train_to_pred(data_df, tuple_explanatory, str_y)[0], train_to_pred(data_df, tuple_explanatory, str_y)[1]
    kf = KFold(n_splits=n_splits)
    in_rscore = 0
    out_rscore = 0
    for train_index, test_index in kf.split(y_dat):
        
        x_train, y_train = x_dat[train_index], y_dat[train_index]
        x_test, y_test = x_dat[test_index], y_dat[test_index]
        
        model = LinearRegression().fit(x_train, y_train)
        
        in_rscore_n = model.score(x_train, y_train)
        in_rscore += in_rscore_n/n_splits
        
        out_rscore_n = model.score(x_test, y_test)
        out_rscore += out_rscore_n/n_splits
        
    return model, print( ' \n\n##### Lin. Reg. on ', str_y ,' #####\n ' ,' \n\n##### COEFFICIENTS #####\n ', model.coef_,' \n\n##### IN RSCORE #####\n ' 
                      ,in_rscore,' \n\n##### OUT RSCORE #####\n ', out_rscore)

def reg_predict(data_df, tuple_explanatory, str_y, n_splits):
    x_to_pred = train_to_pred(data_df, tuple_explanatory, str_y)[2]
    regr = reg(data_df, tuple_explanatory, str_y, n_splits)[0]
    return regr.predict(x_to_pred)


def reg_replace(data_df, tuple_explanatory, str_y, n_splits):

    null_idx = data_df.loc[pd.isnull(data_df.loc[:,str_y])].index
    pred_series = pd.Series((reg_predict(data_df, tuple_explanatory, str_y, n_splits)), index = null_idx)
    data_df.loc[:,str_y] = data_df.loc[:,str_y].fillna(pred_series)
    
    return data_df

def knn(data_df, tuple_explanatory, str_y, n_neighbors):
    
    x_dat, y_dat = train_to_pred(data_df, tuple_explanatory, str_y)[0], train_to_pred(data_df, tuple_explanatory, str_y)[1]
    model = KNeighborsClassifier(n_neighbors=n_neighbors).fit(x_dat, y_dat)

    rscore = model.score(x_dat, y_dat)
    pickle.dump(model, open('/structured_notebooks', 'wb'), pickle.HIGHEST_PROTOCOL)
        
    return model, print( ' \n\n##### knn on ', str_y ,' #####\n ' ,' \n\n##### vecor features #####\n ', tuple_explanatory,' \n\n##### RSCORE #####\n ', rscore )

def knn_predict(data_df, tuple_explanatory, str_y, n_neighbors):
    x_to_pred = train_to_pred(data_df, tuple_explanatory, str_y)[2]
    knn_mod = knn(data_df, tuple_explanatory, str_y, n_neighbors)[0]
    
    return knn_mod.predict(x_to_pred)

def knn_replace(data_df, tuple_explanatory, str_y, n_neighbors):
    
    null_idx = data_df.loc[pd.isnull(data_df.loc[:,str_y])].index
    pred_series = pd.Series((knn_predict(data_df, tuple_explanatory, str_y, n_neighbors)), index = null_idx)
    data_fill = data_df.loc[:,str_y].fillna(pred_series)
    data_df.loc[:,str_y] = data_fill
    mapped_df = map_space(data_df).apply(map_apply, axis='columns')
    data_df.loc[:,str_y] = mapped_df.loc[:,str_y]

    return data_df


def pre_proc(data_df, tr_or_ts):
    
    if tr_or_ts == 'train':
        knn_replace(data_df, ('fst_cl','scd_cl','thrd_cl'), 'Cabin', 4)
        reg_replace(data_df, ('mr', 'mrs', 'miss','Parch','SibSp','fst_cl','thrd_cl','a_cab','c_cab','d_cab','e_cab'), 'Age', 5)
        knn_replace(data_df, ('mr','miss','fst_cl','scd_cl','thrd_cl','b_cab','c_cab','d_cab'), 'Embarked', 4)
        data_df = data_df.drop(columns=['Name','Ticket','PassengerId'])
        data_df = data_df.replace({'male':0, 'female':1})

        dat_to_scale = data_df.loc[:,('Age','Fare')]
        data_df.loc[:,('Age','Fare')] = StandardScaler().fit_transform(dat_to_scale)

        data_df.to_pickle("./pp_train.pkl")
        data_df.to_csv("./pp_train.csv")
    if tr_or_ts == 'test':
        knn_replace(data_df, ('fst_cl','scd_cl','thrd_cl'), 'Cabin', 4)
        reg_replace(data_df, ('mr', 'mrs', 'miss','Parch','SibSp','fst_cl','thrd_cl','a_cab','c_cab','d_cab','e_cab'), 'Age', 5)
        reg_replace(data_df, ('mr', 'mrs', 'miss','Parch','SibSp','fst_cl','scd_cl','thrd_cl','b_cab','c_cab', 'embark_1', 'embark_2', 'embark_3'), 'Fare', 4)
        data_df = data_df.drop(columns=['Name','Ticket','PassengerId'])
        data_df = data_df.replace({'male':0, 'female':1})

        dat_to_scale = data_df.loc[:,('Age','Fare')]
        data_df.loc[:,('Age','Fare')] = StandardScaler().fit_transform(dat_to_scale)
        
        data_df.to_pickle("./pp_test.pkl")
        data_df.to_csv("./pp_test.csv")
        
    return data_df
    
pre_proc(train_df, 'train')
#pre_proc(test_df, 'test')



PermissionError: [Errno 13] Permission denied: '/structured_notebooks'