## Import Libraries

In [1]:
#Suppress FutureWarnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd 
import numpy as np 
import seaborn as sns 

#display the graphics made by python inline with the text
%matplotlib inline
import matplotlib.pyplot as plt 


## Load data

In [2]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
submission = pd.read_csv('SampleSubmission.csv')

## Understand the Data with Descriptive Statistics

In [3]:
#Check the shape and size of datasets
train.shape, test.shape, submission.shape

((108446, 42), (46477, 41), (46477, 2))

In [4]:
#peek at the data
train.tail()

Unnamed: 0,ID,country_code,region,age,FQ1,FQ2,FQ3,FQ4,FQ5,FQ6,...,FQ27,FQ28,FQ29,FQ30,FQ31,FQ32,FQ33,FQ34,FQ37,Target
108441,ID_ZZYY7RTO,111,4,23.0,2,,,2,,2.0,...,,,,,,,1.0,,1,0
108442,ID_ZZYZTTC6,77,4,60.0,1,,,2,,,...,,,,,,,1.0,1.0,1,0
108443,ID_ZZZ3OW3S,42,2,59.0,1,,1.0,1,,,...,,,2.0,,,2.0,1.0,2.0,1,1
108444,ID_ZZZLDXE8,57,7,79.0,1,,,2,,1.0,...,,,2.0,,,,1.0,1.0,1,0
108445,ID_ZZZMYW1F,110,2,74.0,2,1.0,2.0,2,,1.0,...,,,1.0,,,,1.0,,1,1


In [5]:
#Look at first 5 records
test.head()

Unnamed: 0,ID,country_code,region,age,FQ1,FQ2,FQ3,FQ4,FQ5,FQ6,...,FQ26,FQ27,FQ28,FQ29,FQ30,FQ31,FQ32,FQ33,FQ34,FQ37
0,ID_000YI58E,39,2,22.0,2,,,2,,1.0,...,2,,,,,,2.0,1.0,1.0,0
1,ID_001SP4JF,30,2,62.0,1,,,2,,1.0,...,2,,,2.0,,1.0,1.0,1.0,1.0,0
2,ID_001VOF6S,65,4,35.0,2,1.0,,1,1.0,,...,2,,,,,,,1.0,,0
3,ID_0030LULG,123,0,24.0,2,1.0,,2,,1.0,...,2,,,2.0,,,,1.0,1.0,1
4,ID_0037PZ3R,67,2,25.0,2,,,1,,,...,2,,,1.0,,,,2.0,1.0,1


In [6]:
submission.head()

Unnamed: 0,ID,Target
0,ID_000YI58E,
1,ID_001SP4JF,
2,ID_001VOF6S,
3,ID_0030LULG,
4,ID_0037PZ3R,


In [7]:
#Statistical summary
train.describe()

Unnamed: 0,country_code,region,age,FQ1,FQ2,FQ3,FQ4,FQ5,FQ6,FQ7,...,FQ27,FQ28,FQ29,FQ30,FQ31,FQ32,FQ33,FQ34,FQ37,Target
count,108446.0,108446.0,108124.0,108446.0,49124.0,46218.0,108446.0,21185.0,60659.0,60620.0,...,3200.0,1506.0,83912.0,2115.0,869.0,60796.0,108444.0,76652.0,108446.0,108446.0
mean,68.544953,2.894242,41.857395,1.563294,1.063716,1.29971,1.824622,1.160113,1.223907,1.206961,...,1.578125,1.351262,1.86033,1.61513,1.436133,1.854744,1.178479,1.127511,0.631457,0.27397
std,41.529264,2.286505,17.876105,0.530077,0.288075,0.468503,0.435942,0.383827,0.45014,0.44078,...,0.563689,0.493938,0.382599,0.548808,0.557423,0.401499,0.398819,0.350632,0.482412,0.445996
min,0.0,-1.0,15.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,33.0,1.0,27.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0
50%,65.0,3.0,39.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,...,2.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,0.0
75%,105.0,4.0,55.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0
max,143.0,7.0,99.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1.0,1.0


Here:
- 0: Did not use mobile or internet banking
- 1: Used mobile or internet banking

This shows that less people use moble or internet banking.

## Examine Missing & Duplicated Values
Data
There are also various ways to handle missing data:
 - Remove any row with missing data
 - Remove any column with missing data
 - Impute missing values


### Duplicated values in data sets

In [8]:
#Find duplicates
train.duplicated().any()

False

In [9]:
test.duplicated().any()

False

### Missing Values

In [10]:
#Counting the Number of Null rows in each Column of the dataframe
train.isnull().sum()

ID                   0
country_code         0
region               0
age                322
FQ1                  0
FQ2              59322
FQ3              62228
FQ4                  0
FQ5              87261
FQ6              47787
FQ7              47826
FQ8                  0
FQ9                  0
FQ10                 0
FQ11             24570
FQ12                 0
FQ13                 0
FQ14                 0
FQ15                 0
FQ16                 0
FQ17             97099
FQ18                 0
FQ19             47407
FQ20             24679
FQ21             24635
FQ22                 0
FQ23                 0
FQ24             70014
FQ35             82557
FQ36             96963
FQ25                 0
FQ26                 0
FQ27            105246
FQ28            106940
FQ29             24534
FQ30            106331
FQ31            107577
FQ32             47650
FQ33                 2
FQ34             31794
FQ37                 0
Target               0
dtype: int64

In [11]:
#Counting the Number of Null rows in each Column of the dataframe
test.isnull().sum()

ID                  0
country_code        0
region              0
age               129
FQ1                 0
FQ2             25676
FQ3             26549
FQ4                 0
FQ5             37564
FQ6             20599
FQ7             20560
FQ8                 0
FQ9                 0
FQ10                0
FQ11            10565
FQ12                0
FQ13                0
FQ14                0
FQ15                0
FQ16                0
FQ17            41599
FQ18                0
FQ19            20357
FQ20            10456
FQ21            10500
FQ22                0
FQ23                0
FQ24            29912
FQ35            35425
FQ36            41577
FQ25                0
FQ26                0
FQ27            45034
FQ28            45846
FQ29            10601
FQ30            45601
FQ31            46113
FQ32            20477
FQ33                0
FQ34            13341
FQ37                0
dtype: int64

## Dealing with Missing Values

In [12]:
# Total missing values for each feature
print (train.isnull().sum()/ len(train)*100)

ID               0.000000
country_code     0.000000
region           0.000000
age              0.296922
FQ1              0.000000
FQ2             54.701879
FQ3             57.381554
FQ4              0.000000
FQ5             80.464932
FQ6             44.065249
FQ7             44.101212
FQ8              0.000000
FQ9              0.000000
FQ10             0.000000
FQ11            22.656437
FQ12             0.000000
FQ13             0.000000
FQ14             0.000000
FQ15             0.000000
FQ16             0.000000
FQ17            89.536728
FQ18             0.000000
FQ19            43.714844
FQ20            22.756948
FQ21            22.716375
FQ22             0.000000
FQ23             0.000000
FQ24            64.561164
FQ35            76.127289
FQ36            89.411320
FQ25             0.000000
FQ26             0.000000
FQ27            97.049223
FQ28            98.611290
FQ29            22.623241
FQ30            98.049721
FQ31            99.198680
FQ32            43.938919
FQ33        

In [13]:
total = train.isnull().sum().sort_values(ascending=False)
percent =total/len(train)*100
pd.concat([total,percent], axis=1, keys=['Total','Percent']).head(20)

Unnamed: 0,Total,Percent
FQ31,107577,99.19868
FQ28,106940,98.61129
FQ30,106331,98.049721
FQ27,105246,97.049223
FQ17,97099,89.536728
FQ36,96963,89.41132
FQ5,87261,80.464932
FQ35,82557,76.127289
FQ24,70014,64.561164
FQ3,62228,57.381554


In [14]:
print (test.isnull().sum()/ len(test)*100)

ID               0.000000
country_code     0.000000
region           0.000000
age              0.277557
FQ1              0.000000
FQ2             55.244530
FQ3             57.122878
FQ4              0.000000
FQ5             80.822773
FQ6             44.320847
FQ7             44.236934
FQ8              0.000000
FQ9              0.000000
FQ10             0.000000
FQ11            22.731674
FQ12             0.000000
FQ13             0.000000
FQ14             0.000000
FQ15             0.000000
FQ16             0.000000
FQ17            89.504486
FQ18             0.000000
FQ19            43.800159
FQ20            22.497149
FQ21            22.591820
FQ22             0.000000
FQ23             0.000000
FQ24            64.358715
FQ35            76.220496
FQ36            89.457151
FQ25             0.000000
FQ26             0.000000
FQ27            96.895239
FQ28            98.642339
FQ29            22.809131
FQ30            98.115197
FQ31            99.216817
FQ32            44.058351
FQ33        

In [15]:
total_test = test.isnull().sum().sort_values(ascending=False)
percent =total_test/len(train)*100
pd.concat([total,percent], axis=1, keys=['Total','Percent']).head(20)

Unnamed: 0,Total,Percent
FQ31,107577,42.521624
FQ28,106940,42.275418
FQ30,106331,42.049499
FQ27,105246,41.526658
FQ17,97099,38.359183
FQ36,96963,38.338897
FQ5,87261,34.638438
FQ35,82557,32.666027
FQ24,70014,27.582391
FQ3,62228,24.481309


Here you can clearly see that 7 columns have Null values higher than 80% so it is good to drop those columns from our data.

In [16]:
train = train [train.columns[train.isnull().mean() < 0.80]]
train.columns

Index(['ID', 'country_code', 'region', 'age', 'FQ1', 'FQ2', 'FQ3', 'FQ4',
       'FQ6', 'FQ7', 'FQ8', 'FQ9', 'FQ10', 'FQ11', 'FQ12', 'FQ13', 'FQ14',
       'FQ15', 'FQ16', 'FQ18', 'FQ19', 'FQ20', 'FQ21', 'FQ22', 'FQ23', 'FQ24',
       'FQ35', 'FQ25', 'FQ26', 'FQ29', 'FQ32', 'FQ33', 'FQ34', 'FQ37',
       'Target'],
      dtype='object')

In [17]:
test.columns

Index(['ID', 'country_code', 'region', 'age', 'FQ1', 'FQ2', 'FQ3', 'FQ4',
       'FQ5', 'FQ6', 'FQ7', 'FQ8', 'FQ9', 'FQ10', 'FQ11', 'FQ12', 'FQ13',
       'FQ14', 'FQ15', 'FQ16', 'FQ17', 'FQ18', 'FQ19', 'FQ20', 'FQ21', 'FQ22',
       'FQ23', 'FQ24', 'FQ35', 'FQ36', 'FQ25', 'FQ26', 'FQ27', 'FQ28', 'FQ29',
       'FQ30', 'FQ31', 'FQ32', 'FQ33', 'FQ34', 'FQ37'],
      dtype='object')

In [18]:
train = train.dropna(thresh = 2)                # Apply dropna() function
train

Unnamed: 0,ID,country_code,region,age,FQ1,FQ2,FQ3,FQ4,FQ6,FQ7,...,FQ24,FQ35,FQ25,FQ26,FQ29,FQ32,FQ33,FQ34,FQ37,Target
0,ID_000J8GTZ,1,6,35.0,2,,,2,,1.0,...,,1.0,2,2,1.0,,1.0,1.0,0,0
1,ID_000QLXZM,32,7,70.0,2,,,2,,1.0,...,,,1,1,2.0,,1.0,2.0,0,0
2,ID_001728I2,71,7,22.0,2,1.0,,2,,1.0,...,,,2,1,2.0,,2.0,1.0,1,0
3,ID_001R7IDN,48,3,27.0,1,,,2,2.0,,...,,,2,2,,2.0,1.0,1.0,1,0
4,ID_0029QKF8,25,0,79.0,2,,,2,,,...,2.0,,2,2,2.0,2.0,1.0,1.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108441,ID_ZZYY7RTO,111,4,23.0,2,,,2,2.0,,...,2.0,,2,2,,,1.0,,1,0
108442,ID_ZZYZTTC6,77,4,60.0,1,,,2,,,...,,,2,2,,,1.0,1.0,1,0
108443,ID_ZZZ3OW3S,42,2,59.0,1,,1.0,1,,1.0,...,,,2,2,2.0,2.0,1.0,2.0,1,1
108444,ID_ZZZLDXE8,57,7,79.0,1,,,2,1.0,,...,,,2,2,2.0,,1.0,1.0,1,0


In [19]:
test = test.dropna(thresh = 2)                # Apply dropna() function
test

Unnamed: 0,ID,country_code,region,age,FQ1,FQ2,FQ3,FQ4,FQ5,FQ6,...,FQ26,FQ27,FQ28,FQ29,FQ30,FQ31,FQ32,FQ33,FQ34,FQ37
0,ID_000YI58E,39,2,22.0,2,,,2,,1.0,...,2,,,,,,2.0,1.0,1.0,0
1,ID_001SP4JF,30,2,62.0,1,,,2,,1.0,...,2,,,2.0,,1.0,1.0,1.0,1.0,0
2,ID_001VOF6S,65,4,35.0,2,1.0,,1,1.0,,...,2,,,,,,,1.0,,0
3,ID_0030LULG,123,0,24.0,2,1.0,,2,,1.0,...,2,,,2.0,,,,1.0,1.0,1
4,ID_0037PZ3R,67,2,25.0,2,,,1,,,...,2,,,1.0,,,,2.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46472,ID_ZZYOTVBJ,112,3,38.0,1,1.0,,2,,,...,2,,,4.0,,,,1.0,1.0,1
46473,ID_ZZYSX122,82,2,77.0,2,1.0,1.0,2,,1.0,...,1,,,2.0,,,,1.0,2.0,1
46474,ID_ZZYXQDSD,93,5,26.0,2,,,2,,1.0,...,2,,,2.0,,,,1.0,1.0,1
46475,ID_ZZZH9SS4,6,7,59.0,2,1.0,,2,,1.0,...,2,,,,,,,1.0,1.0,1


In [20]:
train.dropna(subset = ['FQ33'], axis = 0, how = 'any', inplace = True)
train.isnull().sum()

ID                  0
country_code        0
region              0
age               322
FQ1                 0
FQ2             59321
FQ3             62228
FQ4                 0
FQ6             47786
FQ7             47825
FQ8                 0
FQ9                 0
FQ10                0
FQ11            24569
FQ12                0
FQ13                0
FQ14                0
FQ15                0
FQ16                0
FQ18                0
FQ19            47406
FQ20            24679
FQ21            24635
FQ22                0
FQ23                0
FQ24            70013
FQ35            82556
FQ25                0
FQ26                0
FQ29            24533
FQ32            47649
FQ33                0
FQ34            31794
FQ37                0
Target              0
dtype: int64

In [21]:
test.dropna(subset = ['FQ33'], axis = 0, how = 'any', inplace = True)
test.isnull().sum()

ID                  0
country_code        0
region              0
age               129
FQ1                 0
FQ2             25676
FQ3             26549
FQ4                 0
FQ5             37564
FQ6             20599
FQ7             20560
FQ8                 0
FQ9                 0
FQ10                0
FQ11            10565
FQ12                0
FQ13                0
FQ14                0
FQ15                0
FQ16                0
FQ17            41599
FQ18                0
FQ19            20357
FQ20            10456
FQ21            10500
FQ22                0
FQ23                0
FQ24            29912
FQ35            35425
FQ36            41577
FQ25                0
FQ26                0
FQ27            45034
FQ28            45846
FQ29            10601
FQ30            45601
FQ31            46113
FQ32            20477
FQ33                0
FQ34            13341
FQ37                0
dtype: int64

In [22]:
train = train.drop (['ID'], axis = 1)
test = test.drop(['ID','FQ5', 'FQ17', 'FQ36', 'FQ27', 'FQ28', 'FQ30', 'FQ31'], axis = 1)

In [23]:
train.shape, test.shape

((108444, 34), (46477, 33))

In [24]:
y = train.Target
X = train.drop('Target', axis=1)
#X = X.select_dtypes(exclude=['object'])

X

Unnamed: 0,country_code,region,age,FQ1,FQ2,FQ3,FQ4,FQ6,FQ7,FQ8,...,FQ23,FQ24,FQ35,FQ25,FQ26,FQ29,FQ32,FQ33,FQ34,FQ37
0,1,6,35.0,2,,,2,,1.0,2,...,2,,1.0,2,2,1.0,,1.0,1.0,0
1,32,7,70.0,2,,,2,,1.0,2,...,2,,,1,1,2.0,,1.0,2.0,0
2,71,7,22.0,2,1.0,,2,,1.0,2,...,2,,,2,1,2.0,,2.0,1.0,1
3,48,3,27.0,1,,,2,2.0,,2,...,2,,,2,2,,2.0,1.0,1.0,1
4,25,0,79.0,2,,,2,,,2,...,1,2.0,,2,2,2.0,2.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108441,111,4,23.0,2,,,2,2.0,,1,...,2,2.0,,2,2,,,1.0,,1
108442,77,4,60.0,1,,,2,,,2,...,2,,,2,2,,,1.0,1.0,1
108443,42,2,59.0,1,,1.0,1,,1.0,2,...,2,,,2,2,2.0,2.0,1.0,2.0,1
108444,57,7,79.0,1,,,2,1.0,,2,...,2,,,2,2,2.0,,1.0,1.0,1


## Train the model

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=999)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)
X.head()

In [26]:
from sklearn.impute import SimpleImputer
# Impute training and test data
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
train = imputer.fit_transform(X)
test = imputer.transform(test)

from sklearn.preprocessing import MinMaxScaler
# Scale training and test data
scaler = MinMaxScaler()
train = scaler.fit_transform(X)
test = scaler.transform(test)

In [27]:
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, RepeatedStratifiedKFold


from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()

# Bundle preprocessing and modeling code in a pipeline
from sklearn.pipeline import Pipeline
pipeline = Pipeline ([('impute', imputer), ('model', dt_model)])

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=1)

'Mean accurracy: {} std: {}'.format(round(np.mean(scores), 3), round(np.std(scores), 3))

'Mean accurracy: 0.591 std: 0.004'

In [32]:
# Preprocessing of training data, fit model 
pipeline.fit(X, y)

# Preprocessing of validation data, get predictions
predictions = pipeline.predict(test)

In [33]:
submission['Target'] = predictions

In [34]:
submission.head()

Unnamed: 0,ID,Target
0,ID_000YI58E,0
1,ID_001SP4JF,0
2,ID_001VOF6S,0
3,ID_0030LULG,0
4,ID_0037PZ3R,0


In [31]:
submission.to_csv('submission_Tree.csv', index=False)

In [None]:
import lightgbm

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer

rf_model = RandomForestClassifier(n_estimators=100, random_state=10,
                               n_jobs = -1)
scorer = make_scorer(f1_score, greater_is_better=True, average = 'macro')
# 10 fold cross validation
cv_score = cross_val_score(model, trainData, trainTarget, cv=10, scoring=scorer)
print('10 Fold Cross Validation F1 Score = {} with std = {}'.format(round(cv_score.mean(), 4), round(cv_score.std(), 4)))

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
#define imputer
imputer = IterativeImputer()
#fit on the dataset
imputer.fit(X)
#transform the dataset
X_imputed = imputer.transform(X)

X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)
test = pd.get_dummies(test)

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan)
X_imputed = imputer.fit_transform(X_train.values)
imputed_X_test = imputer.transform(X_test.values)
print("Imputed data:")
print(X_imputed)

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan)

imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train))
imputed_X_test = pd.DataFrame(imputer.transform(X_test))
imputed_test = pd.DataFrame(imputer.transform(test))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_test.columns = X_test.columns
imputed_test.columns = test.columns

# Number of missing values in each column of training data
missing_val_count_by_column = (imputed_X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

print("MAE from Approach 2 (Imputation):")
print((imputed_X_train, imputed_X_test, y_train, y_test))

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
#define imputer
imputer = IterativeImputer()
#fit on the dataset
imputer.fit(X_train)
#transform the dataset
X_trainB = imputer.transform(X_train)
X_testB = imputer.transform(X_test)

from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()
imputed_X_train = my_imputer.fit_transform(X_train)
imputed_X_test = my_imputer.transform(X_test)
print("Mean Absolute Error from Imputation:")
print(score_dataset(imputed_X_train, imputed_X_test, y_train, y_test))

#Itlearns about the data and does nothing else
my_imputer.fit(X_train)

#Calling transform to apply the learnt information on supplied data
X_train_new = my_imputer.transform(X_train)
X_test_new = my_imputer.transform(X_test)

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.NaN, strategy='mean')


X_imputed = imputer.fit_transform(dfstd['marks'].values.reshape(-1,1))[:,0]


from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
X = pd.DataFrame(imputer.fit_transform(X),columns = X.columns)
X.head()

from sklearn.impute import KNNImputer

#Initialize KNNImputer
imputer = KNNImputer (n_neighbors = 2)

#Impute/Fill Missing values of each feature
X_imputed = pd.DataFrame(imputer.fit_transform (X_train, y_train),  columns= X.columns)

results = imputer.transform (X_test)

results.shape


from sklearn.impute import KNNImputer

#Initialize KNNImputer
imputer = KNNImputer (n_neighbors = 2)

#Impute/Fill Missing values of each feature
X_imputed = imputer.fit_transform (X)


## Model Evaluation

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, RepeatedStratifiedKFold

In [None]:
#imputer = SimpleImputer()

from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()

# Bundle preprocessing and modeling code in a pipeline
from sklearn.pipeline import Pipeline
pipeline = Pipeline ([('impute', imputer), ('model', dt_model)])

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=1)

'Mean accurracy: {} std: {}'.format(round(np.mean(scores), 3), round(np.std(scores), 3))

In [None]:
# Preprocessing of training data, fit model 
pipeline.fit(imputed_X_train, y_train)

# Preprocessing of validation data, get predictions
predictions = pipeline.predict(imputed_X_test)

In [None]:
# Number of missing values in each column of training data
missing_val_count_by_column = (imputed_X_test.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

In [None]:
submission = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':predictions})

In [None]:
submission['Target'] = predictions

submission = pd.DataFrame({'Target':predictions})

In [None]:
submission.head()

In [None]:
output = pd.DataFrame({'ID': test.ID, 'Target': predictions})

In [None]:
submission.head()

from xgboost import XGBRegressor

my_model = XGBRegressor()
my_model.fit(X_train, y_train)
#validate model
from sklearn.metrics import mean_absolute_error
predictions = my_model.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))

In [None]:
output.to_csv('submission_Tree.csv', index=False)

## Make new prediction

print(X.shape)
print(y.shape)
print(X_imputed.shape)
print(y_train.shape)

#Instantiate model and fit to data
dt_model.fit(X, y)
#Make predictions and store in 'Survived' column of df_test
Y_pred = dt_model.predict(imputed_X_test)
test['Survived'] = Y_pred

dt_model.fit(X_imputed, y_train)
predictions = dt_model.predict(imputed_X_test)

test['Target'] = predictions

dt_model = dt_model.fit(X_imputed, y)
y_predict = dt_model.predict(imputed_X_test)

# Make predictions and store in 'Survived' column of df_test
test['Target'] = y_predict

df.apply(lambda col: col.drop_duplicates().reset_index(drop=True))


In [None]:
test.shape, y_predict.shape

In [None]:
sub_ID = test['ID']
submission_df = pd.DataFrame({
                  "ID": sub_ID, 
                  "Target": y_predict})

In [None]:
submission = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':predictions})

#Visualize the first 5 rows
submission.head(

In [None]:
#ID = test['ID']
submission_df = pd.DataFrame({
                  "ID": test.ID, 
                  "Target": y_predict})

In [None]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf = clf.fit(imputed_X_train, y_train)
y_pred = clf.predict(imputed_X_test)

In [None]:
submission.columns

In [None]:
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)

In [None]:
submission_df_1.to_csv('submission_1.csv', index=False)

In [None]:
xgbr = xgb.XGBRegressor()
xgbr = xgbr.fit(X, y)
pred_values = xgbr.predict(test_scaled)

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
dt_model = dt_model.fit (X, y_train)

In [None]:
print("model score: %.3f" % dt_model.score(X_testB, y_test))

In [None]:
threshold = 0.4

predicted_proba = dt_model.predict_proba(X_testB)
predicted = (predicted_proba [:,1] >= threshold).astype('int')

accuracy = f1_score(y_test, predicted)
print(accuracy)

In [None]:
test_id = test.drop('ID', axis=1)

In [None]:
pred= dt_model.predict(X_testB)
print('Score', f1_score(y_test, pred))

In [None]:
print(classification_report(y_test, predicted))

In [None]:
pred= dt_model.predict(X_testB)
print('Score', f1_score(y_test, pred))

In [None]:
dt_model.fit(X,y)

In [None]:
threshold = 0.4

predicted_proba = dt_model.predict_proba(X_test)
predicted = (predicted_proba [:,1] >= threshold).astype('int')

accuracy = f1_score(y_test, predicted)
print(accuracy)

In [None]:
pred= rf.predict(val_x)
print('Score', f1_score(val_y, pred))

In [None]:
print(classification_report(val_y, predicted))

In [None]:
rf.fit(X,y)

In [None]:

results = clf.predict (X_test)

In [None]:
X_test = test.drop(['ID', 'FQ5', 'FQ17', 'FQ36', 'FQ27', 'FQ28', 'FQ30', 'FQ31'], axis= 1)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=2)
rf_model.fit(X_imputed, y)
y_pred = model.predict(X_test)

In [None]:
output = pd.DataFrame({'ID': test.ID, 'Target': y_pred})
output.to_csv('submission_RF2.csv', index=False)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train = pd.DataFrame(scaler.fit_transform(train), columns = train.columns)
train.head()

## Imputing Missing Data Using Sklearn SimpleImputer
SimpleImputer is a class found in package sklearn.impute. It is used to impute / replace the numerical or categorical missing data related to one or more features with appropriate values. 
https://dzone.com/articles/imputing-missing-data-using-sklearn-simpleimputer


There are two columns / features (one numerical - marks, and another categorical - gender) which are having missing values and need to be imputed. In the code below, an instance of SimpleImputer is created with strategy as "mean". The missing value is represented using NaN. Note some of the following:

sklearn.impute package is used for importing SimpleImputer class.
SimpleImputer takes two argument such as missing_values and strategy.
fit_transform method is invoked on the instance of SimpleImputer to impute the missing values.

### Imputation Approach with KNNImputer
We will use the KNNImputer function from the impute module of the sklearn. KNNImputer helps to impute missing values present in the observations by finding the nearest neighbors with the Euclidean distance matrix.

In [None]:
from sklearn.impute import KNNImputer

#KNN based imputation for categorical variables
imputer = KNNImputer (n_neighbors = 2)


imputed_train = imputer.fit_transform(train[['country_code', 'region', 'age', 'FQ1', 'FQ2', 'FQ3', 'FQ4','FQ5', 'FQ6', 'FQ7', 'FQ8', 'FQ9', 'FQ10', 'FQ11', 'FQ12', 'FQ13','FQ14', 'FQ15', 'FQ16', 'FQ17', 'FQ18', 'FQ19', 'FQ20', 'FQ21', 'FQ22','FQ23', 'FQ24', 'FQ35', 'FQ36', 'FQ25', 'FQ26', 'FQ27', 'FQ28', 'FQ29','FQ30', 'FQ31', 'FQ32', 'FQ33', 'FQ34', 'FQ37']])

#print the completed dataframe
imputed_train

In [None]:
# Importing the SimpleImputer class
from sklearn.impute import SimpleImputer
  
# Imputer object using the mean strategy and 
# missing_values type for imputation
imputer = SimpleImputer(missing_values = np.nan, 
                        strategy ='mean')
  
# Fitting the data to the imputer object
imputer = imputer.fit(train)
  
# Imputing the data     
imputed_train = imputer.transform(train)
  
print("Imputed Data : \n", imputed_train)

In [None]:
from sklearn.impute import SimpleImputer

# Missing values is represented using NaN and hence specified. If it is empty field, missing values will be specified as:

imputer = SimpleImputer(missing_values=np.NaN, strategy='mean')

dfstd.marks = imputer.fit_transform(dfstd['marks'].values.reshape(-1,1))[:,0]

dfstd

To drop all rows with 'any' NAs in a particular column, I used .dropna() and specified the subset = column.

In [None]:
train.isnull().sum()

In [None]:
train.dropna(subset = ['FQ33'], axis = 0, how = 'any', inplace = True)
train.isnull().sum()

In [None]:
train.drop(columns="cabin")

In [None]:
#Class Distribution
target_counts = train.groupby('Target').size()
print(target_counts)

You can see that there are nearly triple the number of observations with target 0 than there are with target 1.

## Examine Target Column

In [None]:
train['Target'].value_counts()

In [None]:
s = sns.countplot(x = 'Target',data = train)
sizes=[]
for p in s.patches:
    height = p.get_height()
    sizes.append(height)
    s.text(p.get_x()+p.get_width()/2.,
            height + 3,
            '{:1.2f}%'.format(height/len(train)*100),
            ha="center", fontsize=14) 

## Create Features

sns.pairplot(train, hue='Target', size=1.5);

In [None]:
# Select first three rows
train.iloc[1:4]

## Replace Multiple Values in Multiple Columns

Target Category:
- 1: Yes
- 2: No
- 3: Don’t know 
- 4: Refused to answer

train.count()

train.replace({'FQ1': {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ2' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ3' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ4' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'},
'FQ5' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ6'{1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'},
'FQ7' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'},
'FQ8' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'},
'FQ9' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ10' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ11'{1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ12' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ13'{1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'},
'FQ14' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ15' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ16' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ17' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ18' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ19' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ20' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ21' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ22' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'},
'FQ23' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ24' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ35' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ36'{1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ25'{1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ26' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ27' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ28' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ29' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'},
'FQ30'{1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ31' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ32' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ33' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ34' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}, 
'FQ37' {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}})

In [None]:
train.columns

In [None]:
# Select unique values
train['country_code'].unique()


In [None]:
# Alternatively, value_counts will display all unique values with the number of times each value appears:
# Show counts
train['country_code'].value_counts()

In [None]:
# Select unique values
train['region'].value_counts()

Both unique and value_counts are useful for manipulating and exploring categorical
columns.

In [None]:
train['FQ1'].value_counts()

In [None]:
train['FQ4'].value_counts()

In [None]:
train.dtypes

In [None]:
#Change multiple columns with float to int
train[['FQ1', 'FQ2', 'FQ3', 'FQ4',
       'FQ5', 'FQ6', 'FQ7', 'FQ8', 'FQ9', 'FQ10', 'FQ11', 'FQ12', 'FQ13',
       'FQ14', 'FQ15', 'FQ16', 'FQ17', 'FQ18', 'FQ19', 'FQ20', 'FQ21', 'FQ22',
       'FQ23', 'FQ24', 'FQ35', 'FQ36', 'FQ25', 'FQ26', 'FQ27', 'FQ28', 'FQ29',
       'FQ30', 'FQ31', 'FQ32', 'FQ33', 'FQ34', 'FQ37']].astype(int)

In [None]:
train.dtypes