# Kaggle Code Snippets

# Import Libraries

In [1]:
# #Python Libraries
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels
import pandas_profiling

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import os
import sys
import time
import json
import random
import requests
import datetime

import missingno as msno
import math
import sys
import gc
import os

# #sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC


# #sklearn - preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# #sklearn - metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score

# #XGBoost & LightGBM
import xgboost as xgb
import lightgbm as lgb

# #Missing value imputation
from fancyimpute import KNN, MICE

# #Hyperparameter Optimization
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# #NLP
from sklearn.feature_extraction.text import TfidfVectorizer

pd.options.display.max_columns = 150
##################################################################
# #Spark

# Locate the Spark installation and bootstrap PySpark inside this kernel.
import glob

spark_home = os.environ.get('SPARK_HOME', None)

if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark package that ships with Spark importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))

# Resolve the bundled py4j archive relative to SPARK_HOME rather than through a
# machine-specific absolute path, so the cell works on any installation and
# with whichever py4j version that Spark release ships.
for py4j_zip in glob.glob(os.path.join(spark_home, 'python', 'lib', 'py4j-*-src.zip')):
    sys.path.insert(0, py4j_zip)

# Execute pyspark's shell bootstrap script in this namespace. The file is read
# inside a context manager so the handle is closed even if reading fails.
filename = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(filename, "rb") as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, filename, 'exec'))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.0
      /_/

Using Python version 3.6.3rc1 (v3.6.3rc1:d8c174a, Sep 19 2017 16:39:51)
SparkSession available as 'spark'.


# Directory Structure

# EDA - Exploratory Data Analysis

In [None]:
# Load the train and test CSV files from ../data.
df_project_train = pd.read_csv("../data/train.csv")
df_project_test = pd.read_csv("../data/test.csv")

# Preview the first rows of each frame.
df_project_train.head()
df_project_test.head()

# Dimensions of each frame.
df_project_train.shape
df_project_test.shape

# Per-column summary of each frame.
df_project_train.info()
df_project_test.info()



## Missing values

In [None]:
# #Missing-value visualisations for the training frame.
# #Only df_train is plotted here; repeat the calls with the test frame to cover both datasets.
msno.matrix(df_train)
msno.bar(df_train)
msno.heatmap(df_train, figsize=(20,20))
msno.dendrogram(df_train)

In [None]:
# #At a column-level: Total number of missing data points, Percentage of missing data points
def f_missing_data(data):
    """Summarise missing values per column.

    Returns a DataFrame indexed by column name with two columns:
    'Total' (number of null entries in the column) and 'Percent'
    (null entries as a percentage of the column's row count), each
    sorted in descending order before being placed side by side.
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    row_counts = null_mask.count()

    total_sorted = null_counts.sort_values(ascending=False)
    percent_sorted = (null_counts / row_counts * 100).sort_values(ascending=False)

    # Explicit key list keeps the 'Total', 'Percent' column order.
    return pd.concat([total_sorted, percent_sorted], axis=1, keys=['Total', 'Percent'])

f_missing_data(df_train)

In [None]:
# #Total number of missing data points, for each column
df_train.isnull().sum(axis = 0)

# #Total number of missing data points across the entire dataset
df_train.isnull().sum(axis = 0).sum()

In [None]:
# #Missing value imputation via MICE
# The imputer output is re-wrapped in a DataFrame that carries the original
# column names of df_train.
# NOTE(review): presumably needs an all-numeric frame and a fancyimpute
# release that still exposes MICE().complete — confirm against the installed version.
df_train_imputed = MICE().complete(df_train)
df_train_imputed = pd.DataFrame(df_train_imputed, columns=df_train.columns)

# Data Pre-processing

## JOINS

In [None]:
df_join_A_B = df_A.merge(df_B, on="<column_name>")

# Feature Engineering

## Groupby + Transform - count, sum, mean, min, max, diff, lambda ops

In [None]:
# Group-wise aggregates broadcast back onto every row of the group.
# Aggregations are selected by name, which is how pandas exposes its built-in
# group-wise implementations.
df['NEW_FEATURE'] = df.groupby('COLUMN_TO_GROUPBY')['COLUMN_TO_TRANSFORM'].transform('count')

df['NEW_FEATURE'] = df.groupby('COLUMN_TO_GROUPBY')['COLUMN_TO_TRANSFORM'].transform('sum')
df['NEW_FEATURE'] = df.groupby('COLUMN_TO_GROUPBY')['COLUMN_TO_TRANSFORM'].transform('mean')
df['NEW_FEATURE'] = df.groupby('COLUMN_TO_GROUPBY')['COLUMN_TO_TRANSFORM'].transform('min')
df['NEW_FEATURE'] = df.groupby('COLUMN_TO_GROUPBY')['COLUMN_TO_TRANSFORM'].transform('max')
# Row-to-row difference inside each group. np.diff returns one element fewer
# than it receives, so it cannot be used with transform (whose result must have
# the same length as the group). GroupBy.diff keeps the length and leaves NaN
# in the first row of every group.
df['NEW_FEATURE'] = df.groupby('COLUMN_TO_GROUPBY')['COLUMN_TO_TRANSFORM'].diff()

# Arbitrary element-wise function applied group by group.
df['NEW_FEATURE'] = df.groupby('COLUMN_TO_GROUPBY')['COLUMN_TO_TRANSFORM'].transform(lambda x: x + 1)

## Groupby + Successive rows difference

In [None]:
# Sorted copy of the source frame (sort_values returns a new object, so
# df_bureau itself is left untouched).
temp = df_bureau.sort_values(by=['COLUMN_TO_GROUPBY', 'COLUMN_TO_TRANSFORM'])

# Difference between consecutive sorted values inside each group.
values_by_group = temp.groupby('COLUMN_TO_GROUPBY')['COLUMN_TO_TRANSFORM']
temp['temp_successive_diff'] = values_by_group.transform(lambda ele: ele.diff())

# Mean of those differences, broadcast to every row of the group.
diffs_by_group = temp.groupby('COLUMN_TO_GROUPBY')['temp_successive_diff']
temp['FEATURE_SUCCESSIVE_DIFF_MEAN'] = diffs_by_group.transform(np.mean)

## Drop Duplicate Rows at the end of Groupby Operations

In [None]:
df = df[['', 'LIST_OF_COLUMNS_TO_KEEP', '']].drop_duplicates()

## Categorical Data Encoding - One-hot Encoding

In [None]:
# One-hot encode the training values; these columns define the reference layout.
train_y_ohe = pd.get_dummies(train_y)

# Encode the test values and align them to the training columns. Encoding the
# two splits independently yields different column sets whenever a category is
# absent from one of them: categories missing from the test split become
# all-zero columns, and categories seen only in the test split are dropped.
test_y_ohe = pd.get_dummies(test_y).reindex(columns=train_y_ohe.columns, fill_value=0)

## Categorical Data Encoding - Label Encoding

In [None]:
df[var_col].astype('category').cat.codes

In [None]:
# Label-encode every object-dtype column.
arr_categorical_columns = df_train.select_dtypes(['object']).columns
for var_col in arr_categorical_columns:
    # Learn the category -> integer code mapping from the training column.
    train_categories = df_train[var_col].astype('category').cat.categories
    df_train[var_col] = pd.Categorical(df_train[var_col], categories=train_categories).codes
    # The original repeated the df_train assignment verbatim, which re-encoded
    # the freshly produced integer codes (shifting them whenever the -1
    # "missing" code was present). Encode the test column instead, reusing the
    # training categories so a value gets the same code in both frames;
    # values unseen in training are coded -1.
    df_test[var_col] = pd.Categorical(df_test[var_col], categories=train_categories).codes

## Categorical Data Encoding - Frequency Encoding

### Only one column

In [None]:
# Relative frequency of each distinct 'key' value: group size over total row count.
encoding = df.groupby('key').size().div(len(df))

# Replace every key by its relative frequency.
df['freq_encode'] = df['key'].map(encoding)

### Encoding on one column while group by on the parent column

In [None]:
# Share of each CHILD_COL value within its PARENT_COL group:
# size of the (parent, child) pair divided by the size of the parent group.
pair_counts = df.groupby(['PARENT_COL', 'CHILD_COL']).size()
parent_counts = df.groupby(['PARENT_COL']).size()
temp_df = pair_counts.div(parent_counts, level='PARENT_COL')
temp_df = temp_df.to_frame().reset_index().rename(columns={0: 'CHILD_COL_FREQENCODE'})

# Join on BOTH keys. temp_df holds one row per (parent, child) pair, so joining
# on PARENT_COL alone duplicated every row once per child value of its parent
# and attached frequencies belonging to other children. The frequencies are
# merged back into the same frame they were computed from.
df = pd.merge(df, temp_df, on=['PARENT_COL', 'CHILD_COL'], how="left")
del temp_df, pair_counts, parent_counts

## PCA

In [None]:
# Feature columns are every column except the target; taking all columns
# (as the original did) leaked the label into the standardised feature matrix.
features = df_train.columns[df_train.columns != 'TARGET']
# Separating out the features
x = df_train.loc[:, features].values
# Separating out the target
y = df_train.loc[:,['TARGET']].values
# Standardizing the features
x = StandardScaler().fit_transform(x)

In [None]:
from sklearn.decomposition import PCA

# Every column except the target is a PCA input.
target_column = 'TARGET'
is_input = df_train.columns != 'TARGET'
input_columns = df_train.columns[is_input]

# A fractional n_components keeps as many components as are needed to explain
# that share (99%) of the variance; the projection is learned from the
# training features only.
pca = PCA(0.99)
pca.fit(df_train[input_columns])

# Project both splits with the fitted PCA and wrap the arrays as DataFrames.
df_train_pca = pd.DataFrame(data=pca.transform(df_train[input_columns]))
df_test_pca = pd.DataFrame(data=pca.transform(df_test))

## TF-IDF

In [None]:
# binary=True: a term's count within a document is treated as present/absent
# rather than as a frequency.
tfidf = TfidfVectorizer(binary=True)

# Fit on the training text only, then apply the fitted vectorizer to the test text.
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

## Label Encoding of the Target Variable

In [None]:
# Fit the encoder on the labels in y and convert them to integer codes.
le = LabelEncoder()
y_train = le.fit_transform(y)

# #After making the prediction
# Map predicted integer codes back to the original labels with the same encoder.
y_pred_invtransformed = le.inverse_transform(y_pred)

# Model Building

In [None]:
# #Train-Validation Split
# Every column except the target is a model input.
target_column = 'TARGET'
feature_mask = df_train.columns != 'TARGET'
input_columns = df_train.columns[feature_mask]

# Feature matrix and label vector.
X, y = df_train[input_columns], df_train[target_column]

# Seeded split using train_test_split's default proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Random Forest

In [None]:
# Random forest baseline with 1000 trees, seeded for reproducibility.
model_rf = RandomForestClassifier(n_estimators=1000, random_state=42)

model_rf.fit(X_train, y_train)
# Keep predictions under their own name. The original assigned them to
# `y_test`, overwriting the ground-truth labels that the later XGBoost cells
# still use as validation labels.
y_pred = model_rf.predict(X_test)


## XGBoost

In [None]:
# Fixed XGBoost parameters: binary logistic objective, evaluated with AUC.
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.8,   # fraction of columns sampled per tree
    'silent': 1,
    'subsample': 0.6,          # fraction of rows sampled per boosting round
    'learning_rate': 0.01,
    'objective': 'binary:logistic',
    'eval_metric': 'auc', 
    'max_depth': 6,
    'num_parallel_tree': 1,
    'min_child_weight': 5,
}

In [None]:
# Evaluation sets reported during training, as (DMatrix, display name) pairs.
watchlist = [
    (xgb.DMatrix(X_train, y_train), 'train'),
    (xgb.DMatrix(X_test, y_test), 'valid'),
]

# 270 boosting rounds on the training split, logging the metric every 100 rounds.
model = xgb.train(
    xgb_params,
    xgb.DMatrix(X_train, y_train),
    num_boost_round=270,
    evals=watchlist,
    maximize=True,
    verbose_eval=100,
)

In [None]:
df_predict = model.predict(xgb.DMatrix(df_test), ntree_limit=model.best_ntree_limit)

In [None]:
# Submission frame: the identifier column from the test frame next to the predictions.
submission = pd.DataFrame({
    "PRED_COLUMN": df_test["PRED_COLUMN"],
    "TARGET": df_predict,
})

# Written without the index column.
submission.to_csv("../submissions/model.csv", index=False)

## XGBoost with Hyperparameter Optimization with Hyperopt

In [None]:
# Every column except the target is a model input.
target_column = 'TARGET'
feature_mask = df_application_train.columns != 'TARGET'
input_columns = df_application_train.columns[feature_mask]

# Feature matrix and label vector.
X = df_application_train[input_columns]
y = df_application_train[target_column]
gc.collect()

# Seeded split with 70% of the rows used for training.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [None]:
num_train, num_feature = X_train.shape

In [None]:
# DMatrix wrappers (features + labels) for the training and hold-out splits.
xgb_train = xgb.DMatrix(X_train, y_train)
xgb_eval  = xgb.DMatrix(X_test, y_test)

# Unlabelled DMatrix built from the application test frame.
xgb_test   = xgb.DMatrix(df_application_test)

In [None]:
# #Params for the Hyperopt algo
N_HYPEROPT_PROBES = 8 # #Number of evaluation cycles
HYPEROPT_ALGO = tpe.suggest  # #Tree-of-Parzen-Estimators algo

# #Params for XGBoost CV
NUM_BOOST_ROUNDS = 270  # num_boost_round passed to xgb.cv
NB_CV_FOLDS = 10  # nfold passed to xgb.cv
EARLY_STOPPING = 200  # early_stopping_rounds passed to xgb.cv
HOLDOUT_SIZE = 0.20  # NOTE(review): not referenced by any cell visible here — confirm it is still needed
# HOLDOUT_SEED = 123456
# SEED0 = random.randint(1,1000000000)

In [None]:
obj_call_count = 0
cur_best_score = 0

In [None]:
def objective(space):
    """Score one hyperparameter configuration for hyperopt.

    Cross-validates the configuration with ``xgb.cv`` to obtain the number of
    boosting rounds, refits on ``xgb_train`` for that many rounds, and scores
    the refit model with ROC AUC on ``xgb_eval`` / ``y_test``.

    Reads the module-level ``xgb_train``, ``xgb_eval``, ``y_test``,
    ``NUM_BOOST_ROUNDS``, ``NB_CV_FOLDS`` and ``EARLY_STOPPING``; updates the
    module-level ``obj_call_count`` and ``cur_best_score``.

    Parameters
    ----------
    space : dict
        Parameter name -> value mapping for this trial.

    Returns
    -------
    dict
        ``{'loss': 1 - auc, 'status': STATUS_OK}``.
    """
    # Only the two progress counters are rebound inside this function.
    global obj_call_count, cur_best_score

    obj_call_count += 1
    print('\nXGBoost objective call #{} cur_best_score={:7.5f}'.format(obj_call_count,cur_best_score))

    # Log the trial's parameters in alphabetical order, skipping 'column:' keys.
    param_strings = [
        '{}={}'.format(name, value)
        for name, value in sorted(space.items(), key=lambda item: item[0])
        if not name.startswith('column:')
    ]
    print('Params:', ' '.join(param_strings))

    xgb_params = sample(space)
    cv_results = xgb.cv(
        xgb_params,
        xgb_train,
        num_boost_round=NUM_BOOST_ROUNDS,
        nfold=NB_CV_FOLDS,
        stratified=False,
        early_stopping_rounds=EARLY_STOPPING,
        verbose_eval=100,
        show_stdv=False,
    )

    # Rounds reported by CV and the mean test AUC at the last of them.
    auc_by_round = cv_results["test-auc-mean"]
    n_rounds = len(auc_by_round)
    cv_score = auc_by_round[n_rounds - 1]
    print('CV finished n_rounds={} cv_score={:7.5f}'.format(n_rounds, cv_score ))

    # Refit on the whole training DMatrix for the CV-selected number of rounds.
    xgb_model = xgb.train(xgb_params, xgb_train, num_boost_round=n_rounds, verbose_eval=True)

    holdout_pred = xgb_model.predict(xgb_eval, ntree_limit=n_rounds)
    score = roc_auc_score(y_test, holdout_pred)
    print('valid score={}'.format(score))

    if score > cur_best_score:
        cur_best_score = score
        print('NEW BEST SCORE={}'.format(cur_best_score))

    # The returned loss is the complement of the hold-out AUC.
    return {'loss': 1 - score, 'status': STATUS_OK}

In [None]:
# #NOTE: Any change in `space`, needs to be changed in xgb_default_params as well
# Search space handed to hyperopt: fixed settings plus the tuned dimensions.
space = {
    # Keys must match XGBoost's parameter names exactly. The original
    # 'booster ' (trailing space) and 'n_thread' are not XGBoost parameters,
    # so those two settings never reached the booster.
    'booster'     : 'gbtree',
    'objective'   : 'binary:logistic',
    'eval_metric' : 'auc',
    'seed'        : 42,
    'silent'      : 0,      #Messages would be printed
    'nthread'     : -1,     #-1: all cores are used
    'subsample'   : 0.8,
    'colsample_bytree': 0.7,

    'eta'         : hp.uniform('eta', 0.025, 0.25),   # #Learning rate - Step size shrinkage to handle overfitting
    'min_child_weight': hp.choice("min_child_weight", np.arange(5, 15, dtype=int)), # #Tradeoff b/n over and underfitting
    'max_depth'   : hp.choice("max_depth", np.arange(4, 8, dtype=int)), # #Tradeoff b/n over and underfitting
    'alpha'       : hp.uniform('alpha', 0.5, 5), # #L1 regularization term - increase this value will make model more conservative.
    'lambda'      : hp.uniform('lambda', 0.5, 5), # #L2 regularization term - increase this value will make model more conservative.
    'gamma'       : hp.uniform('gamma', 0.6, 0.8),
}

In [None]:
# #Trials keep track of all the experiments
trials = Trials()

# #MAIN function to run all the experiments
best = fmin(
    fn=objective,
    space=space,
    algo=HYPEROPT_ALGO,
    max_evals=N_HYPEROPT_PROBES,
    trials=trials,
    verbose=1,
)

# For hp.choice dimensions fmin reports the INDEX of the chosen option, not the
# option itself. space_eval maps the result back onto `space`, so the second
# dictionary printed below holds the actual parameter values.
from hyperopt import space_eval
best_params = space_eval(space, best)

print('-'*50)
print('The best params:')
print( best )
print('Resolved against the search space:')
print( best_params )
print('\n\n')

In [None]:
best

In [None]:
# #LB: 0.779
# Tuned parameter set. Keys must match XGBoost's parameter names exactly: the
# original 'booster ' (trailing space) and 'n_thread' are not XGBoost
# parameters, so those two settings never reached the booster.
xgb_params = {
 'alpha': 3.160842634951819, # #This was 20 earlier
 'booster': 'gbtree',
 'colsample_bytree': 0.7,
 'eta': 0.1604387053222455,
 'eval_metric': 'auc',
 'gamma': 0.6236454630290655, # #This was 0.85 earlier
 'lambda': 4.438488456929287,
 'max_depth': 4,
 'min_child_weight': 9,
 'nthread': -1,
 'objective': 'binary:logistic',
 'seed': 42,
 'silent': 0,
 'subsample': 0.8
}

In [None]:
# #Final Model
gc.collect()
# NOTE(review): the booster is fit on the full X / y, and X_test / y_test were
# split out of that same X / y in the earlier train_test_split cell — so the
# 'valid' metric logged here is presumably not an out-of-sample estimate.
watchlist = [(xgb.DMatrix(X_train, y_train), 'train'), (xgb.DMatrix(X_test, y_test), 'valid')]
model = xgb.train(xgb_params, xgb.DMatrix(X, y), 270, watchlist, maximize=True, verbose_eval=100)

In [None]:
df_predict = model.predict(xgb.DMatrix(df_application_test), ntree_limit=model.best_ntree_limit)

In [None]:
# Submission frame: the SK_ID_CURR identifiers from the test frame next to the predictions.
submission = pd.DataFrame({
    "SK_ID_CURR": df_application_test["SK_ID_CURR"],
    "TARGET": df_predict,
})

# Written without the index column.
submission.to_csv("../submissions/model_1_xgbstarter_updatedParams_v8.csv", index=False)

In [None]:
# #Should be 48744, 2
submission.shape

## LightGBM - Classification

In [None]:
# LightGBM settings for a 20-class classifier using DART boosting.
params = {
    'boosting_type': 'dart',
    'objective': 'multiclass',
    'num_class': 20,               # #Types of cuisine
    'metric': {'multi_error'},     # evaluation metric, supplied as a one-element set
    'num_leaves': 60,
    'learning_rate': 0.06,
}


In [None]:
# Training Dataset, and a validation Dataset built with the training set as its `reference`.
# NOTE(review): X_valid / y_valid are not defined in any cell visible here —
# presumably a separate hold-out split; confirm before running.
d_train = lgb.Dataset(X_train, label=y_train)
d_valid = lgb.Dataset(X_valid, y_valid, reference=d_train)

# Up to 1000 boosting rounds evaluated on d_valid, logging every 100 rounds,
# with early_stopping_rounds set to 200.
model_lgb = lgb.train(params, 
                d_train,
                num_boost_round=1000,
                valid_sets=d_valid,
                verbose_eval=100,
                early_stopping_rounds=200)

In [None]:
# #Predict
# The trained booster is `model_lgb`; the original read best_iteration from an
# undefined name `gbm`, which raised NameError. predict returns one score per
# class for each row, and argmax(axis=1) picks the highest-scoring class index.
y_pred = model_lgb.predict(X_test, num_iteration=model_lgb.best_iteration).argmax(axis=1)
