In [11]:
import pathlib
this_path = pathlib.Path().absolute()
data_path = this_path.parent / "data"

import custom_transformers as ct
import data_functions as dfun

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.compose import  make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier

from copy import deepcopy

import itertools

from statistics import mean

from sklearn.model_selection import train_test_split

In [2]:
pd.options.display.max_columns = None

In [9]:
def get_dataframes():
    '''
    function to retrieve the data for this project as dataframes
    
    --returns:
    a tuple containing pandas dataframes in the format (x_train, x_test, y_train)
    '''
    x_train_filename = 'Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_values.csv'
    x_test_filename = 'Pump_it_Up_Data_Mining_the_Water_Table_-_Test_set_values.csv'
    y_train_filename = 'Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_labels.csv'
    
    x_train = open_local_csv(x_train_filename)
    x_test = open_local_csv(x_test_filename)
    y_train = open_local_csv(y_train_filename)
    
    return (x_train, x_test, y_train)

def open_local_csv(filename, data_path = d_path):
    '''
    checks that the csv filepath exists for given filename and returns a dataframe containing its
    values if it does exist
    
    --parameters:
    
    filename: should be a string containing the name of the csv to be opened
    
    --returns:
    
    pandas DataFrame object if csv_path exists, else prints error msg and returns None
    '''
    
    csv_path = data_path / filename
    if csv_path.exists():
        return pd.read_csv(csv_path, index_col = 'id')
    else:
        print(f'the specified filepath does not exist: {csv_path}')
        return None

In [8]:
x_train, x_test, y_train = dfun.get_dataframes()

In [18]:

def data_preprocessing(x_tr, y_tr):
    x_train = x_tr.copy()
    y_train = y_tr.copy()
    
    x_train.drop(['date_recorded','installer','funder','wpt_name', 'subvillage','ward','recorded_by','scheme_name','scheme_management','extraction_type',
                 'extraction_type_class','payment','public_meeting','permit','management','management_group','source','source_class',
                 'waterpoint_type_group','latitude','longitude','num_private','region_code','district_code'], inplace=True, axis=1)

    x_test.drop(['date_recorded','installer','funder','wpt_name', 'subvillage','ward','recorded_by','scheme_name','scheme_management','extraction_type',
                 'extraction_type_class','payment','public_meeting','permit','management','management_group','source','source_class',
                 'waterpoint_type_group','latitude','longitude','num_private','region_code','district_code'], inplace=True, axis=1)

    x_train_nums= x_train.select_dtypes(exclude="object")
    x_train_cat= x_train.select_dtypes(include="object")
    ohe=OneHotEncoder(drop='first', sparse=False)
    x_train_ohe=pd.DataFrame(ohe.fit_transform(x_train_cat), columns= ohe.get_feature_names(x_train_cat.columns), index= x_train_cat.index)
    si=SimpleImputer()
    x_nums_si=pd.DataFrame(si.fit_transform(x_train_nums), index= x_train_nums.index, columns= x_train_nums.columns)
    scale= StandardScaler()
    x_train_nums_scaled= pd.DataFrame(scale.fit_transform(x_nums_si), index= x_nums_si.index, columns= x_nums_si.columns)
    x_final= x_train_nums_scaled.join(x_train_ohe)
    
    return (x_final, y_train)

In [19]:
x_tr, x_te, y_tr = get_dataframes()

In [20]:
x_train, y_train = data_preprocessing(x_tr, y_tr)

In [21]:
x_train

Unnamed: 0_level_0,amount_tsh,gps_height,population,construction_year,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,...,source_type_rainwater harvesting,source_type_river/lake,source_type_shallow well,source_type_spring,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,1.895665,1.041252,-0.150399,0.733857,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
8776,-0.105970,1.054237,0.212290,0.745416,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
34310,-0.097630,0.025541,0.148660,0.744365,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
67743,-0.105970,-0.584751,-0.258570,0.720196,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
19728,-0.105970,-0.964200,-0.381587,-1.366788,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60739,-0.102634,0.781553,-0.116463,0.733857,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
27263,1.461977,0.784439,-0.262812,0.730704,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
37057,-0.105970,-0.964200,-0.381587,-1.366788,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
31282,-0.105970,-0.964200,-0.381587,-1.366788,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


making functions to return lists of features for strict/loose and categorical/numerical

In [12]:
not_strict_features = ['date_recorded', 'funder', 'installer', 'wpt_name', 'subvillage', 'recorded_by', 'scheme_name', 'extraction_type',
             'extraction_type_class', 'payment', 'management', 'management_group', 'source_class',
             'waterpoint_type_group','latitude','longitude','num_private','region_code','district_code']
strict_features = [x for x in x_tr.columns if x not in not_strict_features ]

print(strict_features)

['amount_tsh', 'gps_height', 'basin', 'region', 'lga', 'ward', 'population', 'public_meeting', 'scheme_management', 'permit', 'construction_year', 'extraction_type_group', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'waterpoint_type']


In [16]:
def get_strict_features():
    strict_features = ['amount_tsh', 'gps_height', 'installer', 'basin', 'region',
                            'lga', 'population', 'construction_year', 'extraction_type_group', 'payment_type',
                            'quality_group', 'quantity', 'source_type', 'waterpoint_type']
    return strict_features

def get_loose_features():
    loose_features = ['amount_tsh', 'gps_height', 'installer', 'basin', 'region', 'lga', 'ward', 'population',
                      'public_meeting', 'scheme_management', 'permit', 'construction_year', 'extraction_type_group',
                      'payment_type', 'water_quality', 'quantity', 'source', 'waterpoint_type']
    return loose_features

def get_numeric_features(f_names):
    numeric = ['amount_tsh', 'population', 'construction_year', 'gps_height']
    num_features = [x for x in f_names if x not in numeric]
    return numeric_features

def get_categorical_features(f_names):
    categorical = ['installer', 'basin', 'region', 'lga', 'ward',
                   'public_meeting', 'scheme_management', 'permit','extraction_type_group',
                   'payment_type', 'water_quality', 'quantity', 'source',
                   'waterpoint_type', 'source_type', 'quality_group']
    cat_features = [x for x in f_names if x not in categorical]
    return cat_features

0       20709
2010     2645
2008     2613
2009     2533
2000     2091
2007     1587
2006     1471
2003     1286
2011     1256
2004     1123
2012     1084
2002     1075
1978     1037
1995     1014
2005     1011
1999      979
1998      966
1990      954
1985      945
1980      811
1996      811
1984      779
1982      744
1994      738
1972      708
1974      676
1997      644
1992      640
1993      608
2001      540
1988      521
1983      488
1975      437
1986      434
1976      414
1970      411
1991      324
1989      316
1987      302
1981      238
1977      202
1979      192
1973      184
2013      176
1971      145
1960      102
1967       88
1963       85
1968       77
1969       59
1964       40
1962       30
1961       21
1965       19
1966       17
Name: construction_year, dtype: int64

In [22]:
class BinInstaller(TransformerMixin, BaseEstimator):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def _to_df(self, X):
        if type(X) != pd.DataFrame:
            if type(X) != list:
                if type(X) == pd.Series:
                    data = pd.DataFrame(X)
                elif type(X) == dict:
                    data = pd.DataFrame([X])
                else:
                    raise ValueError('X must be a dataframe, list, series, or dictionary  object.')
            else:
                data = pd.DataFrame(X)
        else:
            data = X.copy()
        return data
        
    def transform(self, X):
        data = self._to_df(X)
        others = data['installer'].value_counts().index[data['installer'].value_counts() < 10]
        is_other = lambda x: 'Other' if x in others else x
        data['installer'] = data['installer'].map(is_other)
        data['installer'] = data['installer'].fillna('Unknown')
        return data

    
class TransformConstructionYear(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.current_year = datetime.now().year
    
    def fit(self, X, y=None):
        return self
    
    def _to_df(self, X):
        if type(X) != pd.DataFrame:
            if type(X) != list and type(X) != np.ndarray:
                if type(X) == pd.Series:
                    data = pd.DataFrame(X)
                elif type(X) == dict:
                    data = pd.DataFrame([X])
                else:
                    raise ValueError('X must be a dataframe, list, series, or dictionary  object.')
                    
            else:
                data = pd.DataFrame(X)
        else:
            data = X.copy()
        return data
    
    def _bin_data(self, x):
        if x == 0:
            return 0 
        else:
            return self.current_year - x 
        
    def transform(self, X):
        data = self._to_df(X)
        data['construction_year'] = data['construction_year'].apply(self._bin_data)
        return data
    
class ChooseStrictFeatures(TransformerMixin, BaseEstimator):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def _to_df(self, X):
        if type(X) != pd.DataFrame:
            if type(X) != list:
                if type(X) == pd.Series:
                    data = pd.DataFrame(X)
                elif type(X) == dict:
                    data = pd.DataFrame([X])
                else:
                    raise ValueError('X must be a dataframe, list, series, or dictionary  object.')
            else:
                data = pd.DataFrame(X)
        else:
            data = X.copy()
        return data
        
    def transform(self, X):
        data = self._to_df(X)
        data = data[data_functions.get_strict_features()]
        return data
    
class ChooseLooseFeatures(TransformerMixin, BaseEstimator):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def _to_df(self, X):
        if type(X) != pd.DataFrame:
            if type(X) != list:
                if type(X) == pd.Series:
                    data = pd.DataFrame(X)
                elif type(X) == dict:
                    data = pd.DataFrame([X])
                else:
                    raise ValueError('X must be a dataframe, list, series, or dictionary  object.')
            else:
                data = pd.DataFrame(X)
        else:
            data = X.copy()
        return data
        
    def transform(self, X):
        data = self._to_df(X)
        data = data[data_functions.get_loose_features()]
        return data

In [24]:
con_year = x_tr['construction_year']

In [None]:
[('feature_selector', ct.ChooseStrictFeatures()),
('bin_installer', ct.BinInstaller()),
('column_transformer', make_column_transformer((SimpleImputer(missing_values = 0.0), ['construction_year']),
                                                (SimpleImputer(), data_functions.get_numeric_features(data_functions.get_strict_features())),
                                                (ct.TransformConstructionYear(), ['construction_year']),
                                                (StandardScaler(), data_functions.get_numeric_features(data_functions.get_strict_features())),
                                                (OneHotEncoder(handle_unknown = 'ignore', sparse = False), data_functions.get_categorical_features(data_functions.get_strict_features())),
                                                            remainder = 'drop'))]

In [None]:
# k_x_train[dfun.get_numeric_features(k_x_train.columns)] = pd.DataFrame(SI.fit_transform(k_x_train[dfun.get_numeric_features(k_x_train.columns)]), columns = dfun.get_numeric_features(k_x_train.columns))
# k_x_test[dfun.get_numeric_features(k_x_test.columns)] = pd.DataFrame(SI.transform(k_x_test[dfun.get_numeric_features(k_x_test.columns)]), columns = dfun.get_numeric_features(k_x_test.columns))

# print(k_x_train['construction_year'])

# print('simple imputing 0s in construction_year')
# k_x_train[dfun.get_numeric_features(k_x_train.columns)] = pd.DataFrame(SI.fit_transform(k_x_train[dfun.get_numeric_features(k_x_train.columns)]), columns = dfun.get_numeric_features(k_x_train.columns))
# k_x_test[dfun.get_numeric_features(k_x_test.columns)] = pd.DataFrame(SI.transform(k_x_test[dfun.get_numeric_features(k_x_test.columns)]), columns = dfun.get_numeric_features(k_x_test.columns))


# the thing i spent all night building

In [2]:
class custom_grid_search():
    def __init__(self):
        pass
        
    def _fold_data_transform(self, x_tr, y_tr, x_te, y_te, strict = True, smote_params = {}):
        k_x_train = x_tr.copy()
        k_y_train = y_tr.copy()
        k_x_test = x_te.copy()
        k_y_test = y_te.copy()
        
        CSF = ct.ChooseStrictFeatures()
        CLF = ct.ChooseLooseFeatures()
        BI = ct.BinInstaller()
        TCY = ct.TransformConstructionYear()
        SS = StandardScaler()
        SM = SMOTE()
        
        print('choosing features')
        if strict:
            k_x_train = CSF.transform(k_x_train)
            k_x_test = CSF.transform(k_x_test)
        else:
            k_x_train = CLF.transform(k_x_train)
            k_x_test = CLF.transform(k_x_test)
        
        print('simple imputing 0s in all numeric cols')
        for col_name in dfun.get_numeric_features(k_x_train.columns):
            col_mean = k_x_train[k_x_train[col_name] != 0][col_name].mean()
            convert_0 = lambda x: col_mean if (x == 0 or float(x) == np.nan) else x
            k_x_train[col_name] = k_x_train[col_name].apply(convert_0)
            k_x_test[col_name] = k_x_test[col_name].apply(convert_0)
        
        print('converting construction_year')
        k_x_train['construction_year'] = pd.DataFrame(TCY.transform(k_x_train['construction_year']))
        k_x_test['construction_year'] = pd.DataFrame(TCY.transform(k_x_test['construction_year']))
        
        print('scaling numeric data')
        k_x_train[dfun.get_numeric_features(k_x_train.columns)] = pd.DataFrame(SS.fit_transform(k_x_train[dfun.get_numeric_features(k_x_train.columns)]), columns = dfun.get_numeric_features(k_x_train.columns))
        k_x_test[dfun.get_numeric_features(k_x_test.columns)] = pd.DataFrame(SS.transform(k_x_test[dfun.get_numeric_features(k_x_test.columns)]), columns = dfun.get_numeric_features(k_x_test.columns))
        
        print('one hot encoding categorical data')
        k_x_train = pd.get_dummies(k_x_train, columns = dfun.get_categorical_features(k_x_train.columns))
        k_x_test = pd.get_dummies(k_x_test, columns = dfun.get_categorical_features(k_x_test.columns))
        
        print('binning y values')
        
        pdb.set_trace()
        
        bin_y = lambda x: 1 if x == 'functional' else 0
        y_tr_final = k_y_train['status_group'].apply(bin_y)
        y_te_final = k_y_test['status_group'].apply(bin_y)
#         SM.set_params(**smote_params)
#         k_x_train, k_y_train = SM.fit_resample(k_x_train, k_y_train['status_group'])
        
        print('finished transforming fold')
        return (k_x_train, y_tr_final, k_x_test, y_te_final)
        
    def grid_search(self, classifier, param_grid, x, y, k = 5, strict = True, smote_params = {}):
        self.estimators = []
        self.best_estimator_ = None
        keys, values = zip(*param_grid.items())
        permutation_dicts = [dict(zip(keys, v)) for v in itertools.product(*values)]
        cv_scores = [[] for _ in permutation_dicts]
        print('initializing estimators')
        for combo in permutation_dicts:
            estimator = deepcopy(classifier)
            estimator.set_params(**combo)
            self.estimators.append(estimator)
        print('making kfolds')
        kf = KFold(n_splits = k)
        i = 1
        for train_index, test_index in kf.split(x, y):
            print(f'transforming k-fold #{i}')
            i += 1
            x.index = list(range(0, x.shape[0]))
            y.index = list(range(0, y.shape[0]))
            train_mask = [(x in train_index) for x in x.index]
            test_mask = [(x in train_index) for x in x.index]
            k_x_tr, k_y_tr, k_x_te, k_y_te = self._fold_data_transform(x.loc[train_mask], y.loc[train_mask], x.loc[test_mask], y.loc[test_mask], strict, smote_params)
            for model, index in zip(self.estimators, range(0, len(self.estimators))):
                print(f'fitting estimator #{index + 1} of {len(self.estimators)}: {model}')
                
                model.fit(k_x_tr, k_y_tr)
                score = model.score(k_x_te, k_y_te)
                cv_scores[index].append(score)
        cv_means = [mean(scores) for scores in cv_scores]
        self.best_score_ = max(cv_means)
        self.best_estimator_ = self.estimators[cv_means.index(self.best_score_)]
        print(f'best estimator: {self.best_estimator_}\nbest score: {self.best_score_}')
        

# PLEASE DEAR GOD WORK FIRST TRY

it didnt work first try

In [3]:
x_train, x_test, y_train = dfun.get_dataframes()

In [4]:
y_train.columns

Index(['status_group'], dtype='object')

In [5]:
cgs = custom_grid_search()

In [6]:
p_grid = {'max_depth': [3, 5, 7], 'min_samples_leaf': [1, 3, 6]}
import pdb; pdb.set_trace()
cgs.grid_search(DecisionTreeClassifier(), p_grid, x_train, y_train)

--Return--
> <ipython-input-6-fed847eb4dd0>(2)<module>()->None
-> import pdb; pdb.set_trace()
(Pdb) p_grid
{'max_depth': [3, 5, 7], 'min_samples_leaf': [1, 3, 6]}
(Pdb) n
> /opt/anaconda3/envs/water_well-env/lib/python3.6/site-packages/IPython/core/interactiveshell.py(2881)run_code()
-> sys.excepthook = old_excepthook
(Pdb) q


BdbQuit: 

# giving this one last attempt using the preprocessing code angie was using for by-hand hyperparameter tuning

In [6]:
class custom_grid_search():
    def __init__(self):
        pass
        
    def _fold_data_transform(self, x_tr, y_tr, x_te, y_te, strict = True, smote_params = {}):
        k_x_train = x_tr.copy()
        k_y_train = y_tr.copy()
        k_x_test = x_te.copy()
        k_y_test = y_te.copy()
        
        x_train_nums= k_x_train.select_dtypes(exclude="object")
        x_train_cat= k_x_train.select_dtypes(include="object")
        x_test_nums = k_x_test.select_dtypes(exclude="object")
        x_test_cat = k_x_test.select_dtypes(include="object")
        ohe=OneHotEncoder(sparse=False, handle_unknown= 'ignore')
        x_train_ohe = pd.DataFrame(ohe.fit_transform(x_train_cat), columns= ohe.get_feature_names(x_train_cat.columns), index= x_train_cat.index)
        x_test_ohe = pd.DataFrame(ohe.transform(x_test_cat), columns= ohe.get_feature_names(x_test_cat.columns), index= x_test_cat.index)
        si=SimpleImputer()
        x_nums_si = pd.DataFrame(si.fit_transform(x_train_nums), index= x_train_nums.index, columns= x_train_nums.columns)
        x_nums_si_te = pd.DataFrame(si.transform(x_test_nums), index= x_test_nums.index, columns= x_test_nums.columns)
        scale= StandardScaler()
        x_train_nums_scaled= pd.DataFrame(scale.fit_transform(x_nums_si), index= x_nums_si.index, columns= x_nums_si.columns)
        x_test_nums_scaled= pd.DataFrame(scale.transform(x_num_si_te), index= x_nums_si_te.index, columns= x_nums_si_te.columns)
        x_tr_joined= x_train_nums_scaled.join(x_train_ohe)
        x_te_final = x_test_nums_scaled.join(x_test_ohe)
        
        print('up-sampling with SMOTE')
        bin_y = lambda x: 1 if x == 'functional' else 0
        y_tr_final = k_y_train['status_group'].apply(bin_y)
        y_te_final = k_y_test['status_group'].apply(bin_y)
        
#         sm1 = SMOTE(random_state=2020)
#         x_tr_final, y_tr_final = sm1.fit_resample(x_tr_join, k_y_train)
        
        print('finished transforming fold')
        return (x_tr_final, y_tr_final, x_te_final)
        
    def grid_search(self, classifier, param_grid, x, y, k = 5, strict = True, smote_params = {}):
        self.estimators = []
        self.best_estimator_ = None
        keys, values = zip(*param_grid.items())
        permutation_dicts = [dict(zip(keys, v)) for v in itertools.product(*values)]
        cv_scores = [[] for _ in permutation_dicts]
        print('initializing estimators')
        for combo in permutation_dicts:
            estimator = deepcopy(classifier)
            estimator.set_params(**combo)
            self.estimators.append(estimator)
        print('making kfolds')
        kf = KFold(n_splits = k)
        i = 1
        for train_index, test_index in kf.split(x, y):
            print(f'transforming k-fold #{i}')
            i += 1
            x.index = list(range(0, x.shape[0]))
            y.index = list(range(0, y.shape[0]))
            train_mask = [(x in train_index) for x in x.index]
            test_mask = [(x in train_index) for x in x.index]
            k_x_tr, k_y_tr, k_x_te, k_y_te= self._fold_data_transform(x.loc[train_mask], y.loc[train_mask], x.loc[test_mask], y.loc[test_mask], strict, smote_params)
            for model, index in zip(self.estimators, range(0, len(self.estimators))):
                print(f'fitting estimator #{index + 1} of {len(self.estimators)}: {model}')
                
                model.fit(k_x_tr, k_y_tr)
                score = model.score(k_x_te, k_y_te)
                cv_scores[index].append(score)
        cv_means = [mean(scores) for scores in cv_scores]
        self.best_score_ = max(cv_means)
        self.best_estimator_ = self.estimators[cv_means.index(self.best_score_)]
        print(f'best estimator: {self.best_estimator_}\nbest score: {self.best_score_}')
        

# still not working

In [7]:
cgs2 = custom_grid_search()
x_train, x_test, y_train = dfun.get_dataframes()
p_grid = {'max_depth': [3, 5, 7], 'min_samples_leaf': [1, 3, 6]}
cgs2.grid_search(DecisionTreeClassifier(), p_grid, x_train, y_train)

initializing estimators
making kfolds
transforming k-fold #1


ValueError: Input contains NaN

In [9]:
kf = KFold(n_splits=3)
kf.split(x_train, y_train)

<generator object _BaseKFold.split at 0x7fd8f4aca0a0>

In [18]:
np.nan

nan

In [11]:
[[] for x in range(0,3)]

[[], [], []]

In [54]:
a = [3, 4, 5]
b = [3, 4, 5, 5]
for i, j in zip(a, range(0, len(a))):
    print(i, j)

3 0
4 1
5 2


In [55]:
print(a.index(max(a)))
print(b.index(max(b)))

2
2


In [12]:
my_dict = {'key1': 1, 'key2': 2}

In [13]:
my_dict.keys

<function dict.keys>

In [14]:
my_dict.keys()

dict_keys(['key1', 'key2'])

In [16]:
[0 for x in my_dict.keys()]

[0, 0]

In [122]:
my_list = []
dtc = DecisionTreeClassifier()
my_x = [[np.random.choice(list(range(0, 1000))), np.random.choice(list(range(500, 1000))), np.random.choice(list(range(500, 1000))), np.random.choice(list(range(500, 1000)))] for _ in range(0, 200)]
my_y = [np.random.choice(list(range(0, 5000))) for _ in range(0, 200)]
my_pred_x = [[np.random.choice(list(range(0, 1000))), np.random.choice(list(range(500, 1000)))] for _ in range(0, 200)]
my_pred_y = [np.random.choice(list(range(0, 5000))) for _ in range(0, 200)]
params = {'max_features': [1, 2], 'max_depth': [3, 5, 7, 20]}
keys, values = zip(*params.items())
permutation_dicts = [dict(zip(keys, v)) for v in itertools.product(*values)]
for x in permutation_dicts:
    print(x)
    y = deepcopy(dtc).set_params(**x)
    my_list.append(y)
for x in my_list:
    x.fit(my_x, my_y)
    #print(x.predict(my_pred_x))
    print(x.score(my_pred_x, my_pred_y))

{'max_features': 1, 'max_depth': 3}
{'max_features': 1, 'max_depth': 5}
{'max_features': 1, 'max_depth': 7}
{'max_features': 1, 'max_depth': 20}
{'max_features': 2, 'max_depth': 3}
{'max_features': 2, 'max_depth': 5}
{'max_features': 2, 'max_depth': 7}
{'max_features': 2, 'max_depth': 20}


ValueError: Number of features of the model must match the input. Model n_features is 4 and input n_features is 2 

In [19]:
my_x = [[np.random.choice(list(range(0, 1000))), np.random.choice(list(range(500, 1000))), np.random.choice(list(range(500, 1000))), np.random.choice(list(range(500, 1000)))] for _ in range(0, 201)]
my_y = [np.random.choice(list(range(0, 3))) for _ in range(0, 201)]

my_x_tester = pd.DataFrame(my_x, columns = ['a', 'b', 'c', 'd'])
my_y_tester = pd.DataFrame(my_y, columns = ['target'])

In [106]:
test_dummies = pd.DataFrame([['a', 2], ['b', 7], [np.nan, 5]], columns = ['letters', 'ints'])
test_dummies

Unnamed: 0,letters,ints
0,a,2
1,b,7
2,,5


In [107]:
pd.get_dummies(test_dummies, dummy_na = True, columns = ['letters'])

Unnamed: 0,ints,letters_a,letters_b,letters_nan
0,2,1,0,0
1,7,0,1,0
2,5,0,0,1


In [56]:
k = 3
[[int(i * (my_x_tester.shape[0] / k)), int((i + 1) * (my_x_tester.shape[0] / k))] for i in range(0,k)]

[[0, 67], [67, 134], [134, 201]]

In [112]:
ss = StandardScaler()

In [140]:
 my_y_tester

Unnamed: 0,target
0,0
1,1
2,2
3,0
4,1
...,...
195,2
196,1
197,2
198,2


In [149]:
sm = SMOTE()

In [152]:
smote_out_x, smote_out_y = sm.fit_resample(my_x_tester, my_y_tester)
# pd.DataFrame(ss.fit_transform(my_x_tester[['a', 'b']]), columns = ['a', 'b'])
# my_x_tester

In [153]:
smote_out_x

Unnamed: 0,a,b,c,d
0,728,892,609,925
1,952,927,926,981
2,654,787,561,944
3,675,968,501,773
4,275,539,599,721
...,...,...,...,...
226,641,825,788,536
227,577,980,633,800
228,258,550,747,868
229,906,760,905,833


In [121]:
tester['b'] = tester['b'].apply(lambda x: x + 3)
tester

Unnamed: 0,a,b
0,164.00,632
1,681.00,938
2,828.00,841
3,860.00,507
4,500.00,902
...,...,...
197,6.00,514
198,27.00,623
199,334.00,939
200,491.07,3


In [16]:
1/np.nan

nan

In [20]:
my_x_tester.loc[np.nan]

TypeError: cannot do label indexing on <class 'pandas.core.indexes.range.RangeIndex'> with these indexers [nan] of <class 'float'>