## Import Libraries

In [None]:
#Suppress FutureWarnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd 
import numpy as np 
import seaborn as sns 

#display the graphics made by python inline with the text
%matplotlib inline
import matplotlib.pyplot as plt 


## Load data

In [None]:
# Read the three competition files: the labelled training rows, the test rows
# to predict, and the sample submission.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

## Understand the Data with Descriptive Statistics

In [None]:
# (rows, columns) of each of the three frames
tuple(frame.shape for frame in (train, test, submission))

In [None]:
#peek at the data
train.tail()

In [None]:
#Look at first 5 records
test.head()

In [None]:
submission.head()

In [None]:
#Statistical summary
train.describe()

Here:
- 0: Did not use mobile or internet banking
- 1: Used mobile or internet banking

This shows that fewer people use mobile or internet banking.

## Examine Missing & Duplicated Values
There are various ways to handle missing data:
 - Remove any row with missing data
 - Remove any column with missing data
 - Impute missing values


### Duplicated values in data sets

In [None]:
#Find duplicates
train.duplicated().any()

In [None]:
test.duplicated().any()

### Missing Values

In [None]:
#Counting the Number of Null rows in each Column of the dataframe
train.isnull().sum()

In [None]:
#Counting the Number of Null rows in each Column of the dataframe
test.isnull().sum()

## Dealing with Missing Values

In [None]:
# Percentage of missing values for each training feature
missing_pct_train = train.isnull().sum().div(len(train)).mul(100)
print(missing_pct_train)

In [None]:
# Missing-value count and percentage per training column, most-missing first
total = train.isnull().sum().sort_values(ascending=False)
percent = total.div(len(train)).mul(100)
pd.DataFrame({'Total': total, 'Percent': percent}).head(20)

In [None]:
# Percentage of missing values for each test feature
missing_pct_test = test.isnull().sum().div(len(test)).mul(100)
print(missing_pct_test)

In [None]:
# Missing-value count and percentage per TEST column, most-missing first.
# Percentages are taken over len(test) (not len(train)) and the table is built
# from the test counts, so it no longer repeats the training summary.
total_test = test.isnull().sum().sort_values(ascending=False)
percent_test = total_test / len(test) * 100
pd.concat([total_test, percent_test], axis=1, keys=['Total', 'Percent']).head(20)

Here you can clearly see that 7 columns have more than 80% null values, so it is best to drop those columns from our data.

# Keep only the columns that are less than 80% null in the training data, and
# apply the same selection to the test set so both frames keep matching
# feature columns. Columns that exist only in train are skipped for test.
kept_columns = train.columns[train.isnull().mean() < 0.80]
train = train[kept_columns]
test = test[[col for col in kept_columns if col in test.columns]]
train.columns

In [None]:
test.columns

# Remove training rows that have fewer than 2 non-null values, then display the frame
train = train.dropna(thresh = 2)                # thresh=2: a row is kept only if it has at least 2 non-null values
train

# Remove test rows that have fewer than 2 non-null values, then display the frame
# NOTE(review): any row dropped here gets no prediction — confirm the test set
# still lines up row-for-row with the sample submission afterwards.
test = test.dropna(thresh = 2)                # thresh=2: a row is kept only if it has at least 2 non-null values
test

In [None]:
# Drop training rows where FQ33 is missing. The result is rebound instead of
# using inplace=True, so the cell does not mutate a frame that other names may
# still refer to and stays safe to re-run.
train = train.dropna(subset=['FQ33'])
train.isnull().sum()

In [None]:
# Rows are NOT dropped from the test set: every test row needs a prediction,
# and removing rows would leave fewer predictions than rows in the submission
# file. Missing FQ33 values are left for the imputation step that is applied
# to `test` later in the notebook.
test.isnull().sum()

In [None]:
# Keep the test identifiers before removing the column: ID is not a feature,
# but later cells need it to label the predictions.
test_ids = test['ID'].copy()

# ID is a row identifier, not a predictor, so remove it from both frames.
train = train.drop(columns=['ID'])
test = test.drop(columns=['ID'])

In [None]:
train.shape, test.shape

## Train the model

# NOTE(review): X and y are first assigned further down in this notebook
# (y = train.Target / X = train.drop('Target', axis=1)), so this snippet cannot
# run at this position on a fresh kernel — confirm whether it should be moved
# or removed.

# Hold out 20% of the rows as a validation split (fixed seed)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Min-max scale every feature and rebuild a DataFrame with the original column names
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)
X.head()

In [None]:
train.columns

In [None]:
# Separate the label column from the predictor columns
y = train['Target']
X = train.drop(columns=['Target'])

X

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Break off the validation set BEFORE fitting any preprocessing, so the imputer
# statistics and scaler ranges used for validation come from training rows only
# (fitting them on all of X leaks validation information into the score).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Preprocessing for the validation experiment: fitted on the training split only.
split_imputer = SimpleImputer(missing_values=np.nan)
split_scaler = MinMaxScaler()
X_train = split_scaler.fit_transform(split_imputer.fit_transform(X_train_raw))
X_test = split_scaler.transform(split_imputer.transform(X_test_raw))

# Preprocessing for the final model: fitted on all labelled rows and applied
# unchanged to the unlabelled test set. X and test become NumPy arrays here.
imputer = SimpleImputer(missing_values=np.nan)
scaler = MinMaxScaler()
X = scaler.fit_transform(imputer.fit_transform(X))
test = scaler.transform(imputer.transform(test))

# A fixed random_state makes the forest, and therefore its score, reproducible.
rfc = RandomForestClassifier(n_estimators=300, random_state=0)
rfc.fit(X_train, y_train)

# Predictions on the held-out validation rows, and their macro-averaged F1
# (the same metric the cross-validation cells in this notebook use).
y_predict = rfc.predict(X_test)
print('Validation macro F1:', round(f1_score(y_test, y_predict, average='macro'), 4))

In [None]:
# Make predictions on the test data and write the submission file.
# The validation split is not redone here: re-splitting with a different seed
# overwrote X_train/X_test/y_train/y_test without this cell ever using them.

# Refit on every labelled row so the final model uses all available data.
rfc.fit(X, y)

test_pred = rfc.predict(test)

# Guard against a row mismatch between the test set and the sample submission.
assert len(test_pred) == len(submission), (
    'number of predictions does not match the number of submission rows')
submission['Target'] = test_pred

submission.to_csv('submission_RFC.csv', index=False)

# Last expression of the cell, so the preview is actually displayed
submission.head()

In [None]:
import lightgbm

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold

# Random forest scored with macro-averaged F1 over 10 folds of the training split
rf_model = RandomForestClassifier(n_jobs=-1, random_state=10, n_estimators=100)
scorer = make_scorer(f1_score, greater_is_better=True, average='macro')
cv_score = cross_val_score(rf_model, X_train, y_train, scoring=scorer, cv=10)

mean_f1 = round(cv_score.mean(), 4)
std_f1 = round(cv_score.std(), 4)
print('10 Fold Cross Validation F1 Score = {} with std = {}'.format(mean_f1, std_f1))

In [None]:
# Refit the random forest on all labelled rows for the final model.
rf_model.fit(X, y)

# Predict the unlabelled test set. These predictions are written into the
# submission frame by the next cell, so they must come from `test`, not from
# the validation split X_test (different rows and a different row count).
preds = rf_model.predict(test)

In [None]:
submission['Target'] = preds

In [None]:
submission.head()

In [None]:
# Write the submission frame to CSV without the index column.
# NOTE(review): the file name says LGBM, but the Target column above was filled
# from rf_model (a RandomForestClassifier) — confirm the intended file name.
submission.to_csv('submission_LGBM.csv', index=False)

In [None]:
import lightgbm

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score

rf_model = RandomForestClassifier(n_estimators=100, random_state=10,
                               n_jobs = -1)
scorer = make_scorer(f1_score, greater_is_better=True, average = 'macro')
# 10 fold cross validation on the training split. The estimator and data are
# the ones this notebook defines (rf_model, X_train, y_train); the names
# model / trainData / trainTarget were never defined here.
cv_score = cross_val_score(rf_model, X_train, y_train, cv=10, scoring=scorer)
print('10 Fold Cross Validation F1 Score = {} with std = {}'.format(round(cv_score.mean(), 4), round(cv_score.std(), 4)))

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fit the iterative imputer on X, then use it to fill X's missing entries
imputer = IterativeImputer()
X_imputed = imputer.fit(X).transform(X)

# One-hot encode the training split, the validation split and the test set.
# NOTE(review): each frame is encoded independently, so the resulting columns
# may differ between them if their category values differ — confirm they are
# aligned before fitting or predicting.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)
test = pd.get_dummies(test)

from sklearn.impute import SimpleImputer

# Imputer fitted on the training split and reused for the validation split;
# both are passed as plain value arrays.
imputer = SimpleImputer(missing_values=np.nan)
train_values, test_values = X_train.values, X_test.values
X_imputed = imputer.fit_transform(train_values)
imputed_X_test = imputer.transform(test_values)
print("Imputed data:")
print(X_imputed)

from sklearn.impute import SimpleImputer

# Imputer fitted on the training split, then applied to the validation split and the test set
imputer = SimpleImputer(missing_values=np.nan)

imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train))
imputed_X_test = pd.DataFrame(imputer.transform(X_test))
imputed_test = pd.DataFrame(imputer.transform(test))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_test.columns = X_test.columns
imputed_test.columns = test.columns

# Number of missing values in each column of the imputed training data
# (only columns that still have missing values are printed)
missing_val_count_by_column = (imputed_X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

# NOTE(review): the label announces an MAE, but the next line prints the tuple
# of data sets themselves; no error metric is computed here — confirm what
# was meant to be reported.
print("MAE from Approach 2 (Imputation):")
print((imputed_X_train, imputed_X_test, y_train, y_test))

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fit the iterative imputer on the training split only, then fill both splits
# with what it learned.
imputer = IterativeImputer().fit(X_train)
X_trainB, X_testB = (imputer.transform(part) for part in (X_train, X_test))

from sklearn.impute import SimpleImputer

# Default SimpleImputer fitted on the training split and applied to the validation split
my_imputer = SimpleImputer()
imputed_X_train = my_imputer.fit_transform(X_train)
imputed_X_test = my_imputer.transform(X_test)
print("Mean Absolute Error from Imputation:")
# NOTE(review): score_dataset is not defined anywhere in the visible notebook —
# confirm where it is meant to come from.
print(score_dataset(imputed_X_train, imputed_X_test, y_train, y_test))

# fit() only learns the imputation statistics from X_train; it does not change the data
my_imputer.fit(X_train)

# transform() applies the learnt statistics to the data it is given
X_train_new = my_imputer.transform(X_train)
X_test_new = my_imputer.transform(X_test)

from sklearn.impute import SimpleImputer

# np.nan is the supported spelling (and the one the rest of this notebook
# uses); the np.NaN alias was removed in NumPy 2.0 and raises AttributeError.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Mean-impute the single 'marks' column. The values are reshaped to one column
# before imputing, and [:, 0] takes that column back out as a 1-D array.
# NOTE(review): dfstd is not defined anywhere in the visible notebook —
# confirm where this frame is meant to come from.
X_imputed = imputer.fit_transform(dfstd['marks'].values.reshape(-1,1))[:,0]


from sklearn.impute import KNNImputer

# KNN imputation with 5 neighbours, keeping X's column labels
imputer = KNNImputer(n_neighbors=5)
feature_names = X.columns
X = pd.DataFrame(imputer.fit_transform(X), columns=feature_names)
X.head()

from sklearn.impute import KNNImputer

# Initialize KNNImputer with 2 neighbours
imputer = KNNImputer (n_neighbors = 2)

# Fit on the training split and fill its missing values.
# NOTE(review): the column labels come from X rather than X_train — confirm X
# is still a DataFrame with the same columns at this point.
X_imputed = pd.DataFrame(imputer.fit_transform (X_train, y_train),  columns= X.columns)

# Fill the validation split with the imputer fitted above
results = imputer.transform (X_test)

results.shape


from sklearn.impute import KNNImputer

#Initialize KNNImputer
imputer = KNNImputer (n_neighbors = 2)

#Impute/Fill Missing values of each feature
X_imputed = imputer.fit_transform (X)


## Model Evaluation

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, RepeatedStratifiedKFold

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state keeps the tree, and therefore the scores, reproducible.
dt_model = DecisionTreeClassifier(random_state=1)

# Bundle preprocessing and modeling code in a pipeline. The imputer is created
# here explicitly instead of reusing whichever `imputer` object an earlier
# cell happened to leave behind.
pipeline = Pipeline([('impute', SimpleImputer()), ('model', dt_model)])

# 10-fold stratified cross-validation, repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=1)

'Mean accuracy: {} std: {}'.format(round(np.mean(scores), 3), round(np.std(scores), 3))

In [None]:
# Fit the pipeline (imputer + decision tree) on the imputed training split
pipeline.fit(imputed_X_train, y_train)

# Predict the imputed validation split.
# NOTE(review): later cells write `predictions` into the submission frame, but
# these are predictions for imputed_X_test, not for the `test` set — confirm.
predictions = pipeline.predict(imputed_X_test)

In [None]:
# Number of missing values in each column of the imputed validation data
# (only columns that still have missing values are printed)
missing_val_count_by_column = (imputed_X_test.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

In [None]:
# Build the submission frame with the 'ID' / 'Target' names that the other
# submission cells of this notebook use. 'PassengerId' / 'Survived' appear to
# be left over from a Titanic example.
submission = pd.DataFrame({'ID': test['ID'], 'Target': predictions})

In [None]:
submission['Target'] = predictions

submission = pd.DataFrame({'Target':predictions})

In [None]:
submission.head()

In [None]:
output = pd.DataFrame({'ID': test.ID, 'Target': predictions})

In [None]:
submission.head()

from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

my_model = XGBRegressor()
my_model.fit(X_train, y_train)

# Validate on the hold-out split. This notebook names that split
# X_test / y_test; X_valid / y_valid are never defined here.
predictions = my_model.predict(X_test)
# mean_absolute_error takes (y_true, y_pred). MAE is symmetric, so the value is
# the same either way, but the documented argument order is used.
print("Mean Absolute Error: " + str(mean_absolute_error(y_test, predictions)))

In [None]:
output.to_csv('submission_Tree.csv', index=False)

## Make new prediction

print(X.shape)
print(y.shape)
print(X_imputed.shape)
print(y_train.shape)

# Fit the decision tree on X / y
dt_model.fit(X, y)
# Predict imputed_X_test and store the result in a 'Survived' column of test
# NOTE(review): imputed_X_test is derived from the validation split, so its row
# count presumably differs from test's — confirm this assignment is intended.
Y_pred = dt_model.predict(imputed_X_test)
test['Survived'] = Y_pred

# Refit on X_imputed / y_train and predict imputed_X_test again
dt_model.fit(X_imputed, y_train)
predictions = dt_model.predict(imputed_X_test)

test['Target'] = predictions

# Refit on X_imputed / y
# NOTE(review): X_imputed is assigned from different inputs in several cells
# (X, X_train, X_train.values) — confirm it has one row per label in y here.
dt_model = dt_model.fit(X_imputed, y)
y_predict = dt_model.predict(imputed_X_test)

# Store the predictions in the 'Target' column of test
test['Target'] = y_predict

# Drop duplicate values within each column and renumber the remaining values
# NOTE(review): df is not defined anywhere in the visible notebook, and the
# result of this expression is not assigned.
df.apply(lambda col: col.drop_duplicates().reset_index(drop=True))


In [None]:
test.shape, y_predict.shape

In [None]:
sub_ID = test['ID']
submission_df = pd.DataFrame({
                  "ID": sub_ID, 
                  "Target": y_predict})

In [None]:
# Build the submission frame with the 'ID' / 'Target' names that the other
# submission cells of this notebook use. 'PassengerId' / 'Survived' appear to
# be left over from a Titanic example.
submission = pd.DataFrame({'ID': test['ID'], 'Target': predictions})

# Visualize the first 5 rows (the original call was missing its closing
# parenthesis, which made the whole cell a SyntaxError)
submission.head()

In [None]:
#ID = test['ID']
submission_df = pd.DataFrame({
                  "ID": test.ID, 
                  "Target": y_predict})

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Default decision tree fitted on the imputed training split, then used to
# predict the imputed validation split.
clf = DecisionTreeClassifier().fit(imputed_X_train, y_train)
y_pred = clf.predict(imputed_X_test)

In [None]:
submission.columns

In [None]:
# Write the predictions with the 'ID' / 'Target' names used by the other
# submission cells. `test_data` is never defined in this notebook, and the
# 'PassengerId' / 'Survived' names appear to be left over from a Titanic example.
output = pd.DataFrame({'ID': test.ID, 'Target': predictions})
output.to_csv('submission.csv', index=False)

In [None]:
# Write the ID/Target frame built above. The notebook only ever defines
# `submission_df`; `submission_df_1` does not exist.
submission_df.to_csv('submission_1.csv', index=False)

In [None]:
from xgboost import XGBRegressor

# `xgb` is never imported in this notebook; import the class directly, as the
# earlier XGBoost snippet does.
xgbr = XGBRegressor()
xgbr = xgbr.fit(X, y)
# NOTE(review): `test_scaled` is not defined anywhere in the notebook. `test`
# is the array that was imputed and scaled alongside X, so it is used here —
# confirm this is the intended input.
pred_values = xgbr.predict(test)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train on the imputed training split. The original paired the full feature
# matrix X with y_train (labels of the training split only), which do not have
# the same number of rows. X_trainB is the imputed counterpart of X_testB,
# which the following cells score against.
dt_model = DecisionTreeClassifier()
dt_model = dt_model.fit(X_trainB, y_train)

In [None]:
print("model score: %.3f" % dt_model.score(X_testB, y_test))

In [None]:
threshold = 0.4

# Label a row 1 when the probability in column 1 of predict_proba reaches the threshold
predicted_proba = dt_model.predict_proba(X_testB)
positive_proba = predicted_proba[:, 1]
predicted = (positive_proba >= threshold).astype('int')

# Despite its name, `accuracy` holds the F1 score of the thresholded labels
accuracy = f1_score(y_test, predicted)
print(accuracy)

In [None]:
# Copy of test without its 'ID' column.
# NOTE(review): the name suggests the IDs themselves, but this keeps everything
# except ID; an earlier cell also already drops 'ID' from test — confirm.
test_id = test.drop('ID', axis=1)

In [None]:
pred= dt_model.predict(X_testB)
print('Score', f1_score(y_test, pred))

In [None]:
print(classification_report(y_test, predicted))

In [None]:
pred= dt_model.predict(X_testB)
print('Score', f1_score(y_test, pred))

In [None]:
dt_model.fit(X,y)

In [None]:
# Probability cut-off used to turn predicted probabilities into 0/1 labels
threshold = 0.4

# NOTE(review): the preceding cell refits dt_model on all of X / y. If X_test
# is a split of X, this score is measured on rows the model was trained on.
predicted_proba = dt_model.predict_proba(X_test)
predicted = (predicted_proba [:,1] >= threshold).astype('int')

# Despite its name, `accuracy` holds the F1 score of the thresholded labels
accuracy = f1_score(y_test, predicted)
print(accuracy)

In [None]:
# F1 score of rf's predictions on val_x against val_y.
# NOTE(review): rf, val_x and val_y are not defined anywhere in the visible
# notebook (the forests here are named rfc / rf_model and the hold-out split is
# X_test / y_test) — confirm the intended names.
pred= rf.predict(val_x)
print('Score', f1_score(val_y, pred))

In [None]:
print(classification_report(val_y, predicted))

In [None]:
rf.fit(X,y)

In [None]:

results = clf.predict (X_test)

In [None]:
# Rebuild X_test from test without ID and seven FQ columns.
# NOTE(review): this overwrites the validation split that X_test held until
# now, and an earlier cell already drops 'ID' from test — confirm.
X_test = test.drop(['ID', 'FQ5', 'FQ17', 'FQ36', 'FQ27', 'FQ28', 'FQ30', 'FQ31'], axis= 1)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest (100 trees, depth capped at 3, fixed seed)
rf_model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=2)
rf_model.fit(X_imputed, y)
# Predict with the forest that was just fitted; `model` is never defined in
# this notebook.
y_pred = rf_model.predict(X_test)

In [None]:
output = pd.DataFrame({'ID': test.ID, 'Target': y_pred})
output.to_csv('submission_RF2.csv', index=False)

from sklearn.preprocessing import MinMaxScaler
# Min-max scale every column of train and rebuild a DataFrame with the same column names.
# NOTE(review): this scales all columns of train, which includes 'Target' if it
# has not been removed yet — confirm that is intended.
scaler = MinMaxScaler()
train = pd.DataFrame(scaler.fit_transform(train), columns = train.columns)
train.head()

## Imputing Missing Data Using Sklearn SimpleImputer
SimpleImputer is a class found in package sklearn.impute. It is used to impute / replace the numerical or categorical missing data related to one or more features with appropriate values. 
https://dzone.com/articles/imputing-missing-data-using-sklearn-simpleimputer


There are two columns / features (one numerical - marks, and another categorical - gender) that have missing values and need to be imputed. In the code below, an instance of SimpleImputer is created with the strategy set to "mean". Missing values are represented using NaN. Note the following:

sklearn.impute package is used for importing SimpleImputer class.
SimpleImputer takes two arguments here: missing_values and strategy.
fit_transform method is invoked on the instance of SimpleImputer to impute the missing values.

### Imputation Approach with KNNImputer
We will use the KNNImputer function from the impute module of the sklearn. KNNImputer helps to impute missing values present in the observations by finding the nearest neighbors with the Euclidean distance matrix.

In [None]:
from sklearn.impute import KNNImputer

# Columns handed to the KNN imputer
knn_columns = [
    'country_code', 'region', 'age',
    'FQ1', 'FQ2', 'FQ3', 'FQ4', 'FQ5', 'FQ6', 'FQ7', 'FQ8', 'FQ9', 'FQ10',
    'FQ11', 'FQ12', 'FQ13', 'FQ14', 'FQ15', 'FQ16', 'FQ17', 'FQ18', 'FQ19',
    'FQ20', 'FQ21', 'FQ22', 'FQ23', 'FQ24', 'FQ35', 'FQ36', 'FQ25', 'FQ26',
    'FQ27', 'FQ28', 'FQ29', 'FQ30', 'FQ31', 'FQ32', 'FQ33', 'FQ34', 'FQ37',
]

# KNN imputation with 2 neighbours over the selected columns
imputer = KNNImputer(n_neighbors=2)
imputed_train = imputer.fit_transform(train[knn_columns])

# Show the completed array
imputed_train

In [None]:
from sklearn.impute import SimpleImputer

# Mean-strategy imputer fitted on `train`, then applied to `train` to fill its NaN entries
imputer = SimpleImputer(strategy='mean', missing_values=np.nan).fit(train)
imputed_train = imputer.transform(train)

print("Imputed Data : \n", imputed_train)

In [None]:
from sklearn.impute import SimpleImputer

# Missing values are represented using NaN, so that is what the imputer is
# told to look for. np.nan is the supported spelling (and the one the rest of
# this notebook uses); the np.NaN alias was removed in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Mean-impute the 'marks' column and write the filled values back.
# NOTE(review): dfstd is not defined anywhere in the visible notebook —
# confirm where this frame is meant to come from.
dfstd.marks = imputer.fit_transform(dfstd['marks'].values.reshape(-1,1))[:,0]

dfstd

To drop all rows with 'any' NAs in a particular column, I used .dropna() and specified the subset = column.

In [None]:
train.isnull().sum()

In [None]:
# Drop training rows where FQ33 is missing. The result is rebound instead of
# using inplace=True, so the cell does not mutate a frame that other names may
# still refer to and stays safe to re-run.
train = train.dropna(subset=['FQ33'])
train.isnull().sum()

In [None]:
# Display train without a "cabin" column (the result is not assigned, so train is unchanged).
# NOTE(review): "cabin" is not among the columns referenced anywhere else in
# this notebook — confirm the column exists.
train.drop(columns="cabin")

In [None]:
#Class Distribution
target_counts = train.groupby('Target').size()
print(target_counts)

You can see that there are nearly three times as many observations with target 0 as there are with target 1.

## Examine Target Column

In [None]:
train['Target'].value_counts()

In [None]:
# Bar chart of the class balance, with each bar labelled by its share of rows.
ax = sns.countplot(x='Target', data=train)
ax.set(title='Target distribution', xlabel='Target', ylabel='Count')

# Row count is constant, so compute it once rather than once per bar.
total_rows = len(train)
for bar in ax.patches:
    height = bar.get_height()
    # Centre the label horizontally on the bar and place it just above the top.
    ax.text(bar.get_x() + bar.get_width() / 2.,
            height + 3,
            '{:1.2f}%'.format(height / total_rows * 100),
            ha="center", fontsize=14)

## Create Features

# Pairwise plots of the columns, coloured by Target.
# `height` is the current name of the per-facet size argument; the old `size`
# keyword was renamed in seaborn 0.9 and is no longer accepted.
sns.pairplot(train, hue='Target', height=1.5);

In [None]:
# Select first three rows (positions 0, 1 and 2). The previous slice, 1:4,
# skipped the first row and returned positions 1-3 instead.
train.iloc[:3]

## Replace Multiple Values in Multiple Columns

Target Category:
- 1: Yes
- 2: No
- 3: Don’t know 
- 4: Refused to answer

train.count()

# Labels for the coded survey answers listed above.
answer_labels = {1: 'Yes', 2: 'No', 3: 'Don’t know', 4: 'Refused to answer'}

# The original dict applied this same mapping to FQ1 .. FQ37, so the per-column
# mapping is built programmatically instead of being spelled out 37 times.
# (The hand-written dict was missing the ':' after most keys and did not parse.)
fq_columns = ['FQ{}'.format(i) for i in range(1, 38)]

# The result is displayed only, not assigned, so `train` keeps its numeric codes.
train.replace({column: answer_labels for column in fq_columns})

In [None]:
train.columns

In [None]:
# Select unique values
train['country_code'].unique()


In [None]:
# Alternatively, value_counts will display all unique values with the number of times each value appears:
# Show counts
train['country_code'].value_counts()

In [None]:
# Show each distinct region value with the number of rows it appears in
train['region'].value_counts()

Both unique and value_counts are useful for manipulating and exploring categorical
columns.

In [None]:
train['FQ1'].value_counts()

In [None]:
train['FQ4'].value_counts()

In [None]:
train.dtypes

In [None]:
# Change the FQ answer columns from float to integer.
# - The result is assigned back: astype() returns a new frame, so the bare
#   expression left train's dtypes unchanged.
# - The nullable 'Int64' dtype is used because plain int cannot hold NaN, so
#   astype(int) raises on columns that still have missing values.
# - Only FQ columns that are present in train are converted.
fq_columns = [col for col in ('FQ{}'.format(i) for i in range(1, 38))
              if col in train.columns]
train = train.astype({col: 'Int64' for col in fq_columns})
train[fq_columns].dtypes

In [None]:
train.dtypes