# Kaggle Code Snippets

# Import Libraries

In [None]:
# #Python Libraries
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels
import pandas_profiling

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import os
import sys
import time
import requests
import datetime

import missingno as msno
import math
import sys
import gc
import os

# #sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing

# #sklearn - metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# #XGBoost & LightGBM
import xgboost as xgb
import lightgbm as lgb

# #Missing value imputation
from fancyimpute import KNN, MICE

# #Let pandas show up to 99 columns when a frame is displayed
pd.set_option('display.max_columns', 99)

##################################################################
# #Spark

# Bootstrap PySpark from the installation that SPARK_HOME points to:
# put Spark's Python sources and its bundled py4j archive on sys.path,
# then run pyspark/shell.py in this namespace.
import glob

spark_home = os.environ.get('SPARK_HOME', None)

if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

spark_python_dir = os.path.join(spark_home, 'python')
sys.path.insert(0, spark_python_dir)

# Look the py4j archive up inside the Spark installation instead of relying on
# an absolute, machine-specific path, so this works for any SPARK_HOME and
# any bundled py4j version.
for py4j_zip in sorted(glob.glob(os.path.join(spark_python_dir, 'lib', 'py4j-*-src.zip'))):
    sys.path.insert(0, py4j_zip)

filename = os.path.join(spark_home, 'python/pyspark/shell.py')
# Read the script through a context manager so the file handle is closed
# before the code runs.
with open(filename, "rb") as shell_file:
    shell_source = shell_file.read()
# NOTE: this executes code from the local Spark installation; only point
# SPARK_HOME at an installation you trust.
exec(compile(shell_source, filename, 'exec'))

# Directory Structure

# EDA - Exploratory Data Analysis

In [None]:
# Load the train and test CSV files. The frames are bound to df_train /
# df_test because those are the names the remaining cells of this file use.
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# Aliases for the names this cell originally defined, kept so that any code
# still referring to them continues to work (they are the same objects).
df_project_train = df_train
df_project_test = df_test

df_train.head()
df_test.head()

df_train.shape
df_test.shape

## Missing values

In [None]:
# #Missing-value plots for the training frame: matrix, bar chart,
# #nullity correlation heatmap (20x20 figure) and dendrogram.
# #Only df_train is plotted here; call the same functions with the test frame to cover it too.
msno.matrix(df=df_train)
msno.bar(df=df_train)
msno.heatmap(df=df_train, figsize=(20, 20))
msno.dendrogram(df=df_train)

In [None]:
# #At a column-level: Total number of missing data points, Percentage of missing data points
def f_missing_data(data):
    """Summarise missing values per column of *data*.

    Returns a DataFrame indexed by column name with two columns:
    'Total'   - number of null entries in the column,
    'Percent' - null entries as a percentage of the column's row count.
    Both series are sorted in descending order before being combined.
    """
    null_mask = data.isnull()
    missing_count = null_mask.sum().sort_values(ascending=False)
    # count() on the boolean mask gives the number of rows per column
    missing_share = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=False)
    summary = pd.concat([missing_count, missing_share], axis=1, keys=['Total', 'Percent'])
    return summary

# #Build the per-column missing-value summary (Total / Percent) for df_train;
# #the result is not assigned to a variable.
f_missing_data(df_train)

In [None]:
# #Count of missing entries in each column
df_train.isna().sum()

# #Grand total of missing entries over the whole frame
df_train.isna().sum().sum()

In [None]:
# #Fill missing values with MICE and wrap the completed values back into a
# #DataFrame that carries df_train's column names
df_train_imputed = pd.DataFrame(MICE().complete(df_train), columns=df_train.columns)

# Data Pre-processing

## Categorical Data Encoding

In [None]:
# Encode every object-typed column of df_train as integer category codes and
# apply the same encoding to df_test.
# The category -> code mapping is learned from df_train and re-used for
# df_test so that a given value receives the same code in both frames.
# Missing values, and values that occur only in df_test, are encoded as -1.
arr_categorical_columns = df_train.select_dtypes(['object']).columns
for var_col in arr_categorical_columns:
    train_categories = df_train[var_col].astype('category')
    df_train[var_col] = train_categories.cat.codes
    # Columns that exist only in the training frame (e.g. a target) are skipped
    if var_col in df_test.columns:
        shared_dtype = pd.api.types.CategoricalDtype(categories=train_categories.cat.categories)
        df_test[var_col] = df_test[var_col].astype(shared_dtype).cat.codes

## JOINS

In [None]:
# #Join df_A with df_B on a shared key column; replace the placeholder with the
# #real column name. No `how` is given, so pandas' default join type applies.
df_join_A_B = pd.merge(df_A, df_B, on="<column_name>")

# Feature Engineering

## PCA

In [None]:
# Build a standardised feature matrix `x` and a target array `y` from df_train.
# The target column is excluded from the features so it cannot leak into `x`.
features = df_train.columns
features = features[features != 'TARGET']
# Separating out the features
x = df_train.loc[:, features].values
# Separating out the target
y = df_train.loc[:, ['TARGET']].values
# Standardizing the features; StandardScaler is reached through the
# `preprocessing` module that the file imports from sklearn.
x = preprocessing.StandardScaler().fit_transform(x)

In [None]:
from sklearn.decomposition import PCA

# Every column except the target is an input to the projection
input_columns = df_train.columns
input_columns = input_columns[input_columns != 'TARGET']
target_column = 'TARGET'

# The PCA is fitted on the training inputs only; 0.99 is passed as
# n_components (see the scikit-learn PCA documentation for fractional values).
pca = PCA(n_components=0.99)
pca.fit(df_train[input_columns])

df_train_pca = pca.transform(df_train[input_columns])
# Select the same columns, in the same order, that the PCA was fitted on so
# the test frame is projected consistently even if it carries other columns.
df_test_pca = pca.transform(df_test[input_columns])

# Wrap the projected arrays back into DataFrames
df_train_pca = pd.DataFrame(data=df_train_pca)
df_test_pca = pd.DataFrame(data=df_test_pca)

# Model Building

In [None]:
# #Hold out a validation split from the training frame.
# #Inputs are all columns except the target; no test_size is given, so
# #train_test_split's default split proportion applies. random_state is fixed at 42.
target_column = 'TARGET'
input_columns = df_train.columns[df_train.columns != target_column]

X, y = df_train[input_columns], df_train[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## XGBoost

In [None]:
# #Parameter dictionary handed to xgb.train in the training cell
xgb_params = dict(
    seed=0,
    colsample_bytree=0.8,
    silent=1,
    subsample=0.6,
    learning_rate=0.01,
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=6,
    num_parallel_tree=1,
    min_child_weight=5,
)

In [None]:
# Train the booster for 270 rounds, reporting the evaluation metric on the
# train and validation sets every 100 rounds.
# The training DMatrix is built once and shared between the watchlist and
# xgb.train instead of being constructed twice from the same data.
dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_test, y_test)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=270,
    evals=watchlist,
    maximize=True,
    verbose_eval=100,
)

In [None]:
# #Score the test frame with the trained booster, limited to model.best_ntree_limit trees
dtest = xgb.DMatrix(df_test)
df_predict = model.predict(dtest, ntree_limit=model.best_ntree_limit)

In [None]:
# #Assemble the submission frame (identifier column from df_test plus the
# #predictions) and write it to disk without the index
submission = pd.DataFrame({
    "PRED_COLUMN": df_test["PRED_COLUMN"],
    "TARGET": df_predict,
})

submission.to_csv("../submissions/model.csv", index=False)