
### Gradient Boosting with XGBoost

In [1]:
from xgboost import XGBClassifier, XGBRegressor


import pandas as pd
import numpy as np


import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

# Global plotting defaults: seaborn's 'darkgrid' style and the 'husl' colour palette.
sns.set_style('darkgrid')
sns.set_palette('husl')

In [2]:
# Load the insurance dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('insurance.csv')

In [3]:
# Show the column names of the loaded frame.
data.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [4]:
# Preview the first four rows.
data.head(4)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061


### Assignment 

Perform an extensive exploratory data analysis on this dataset.

In [5]:
# One-hot encode the categorical columns.
# The list must be passed as `columns=`: the second positional parameter of
# pd.get_dummies is `prefix`, so passing the list positionally paired the prefixes
# with the categorical columns in frame order (sex, smoker, region) and named the
# smoker dummies `region_*` and the region dummies `smoker_*`.
# The columns are listed in frame order so the encoded column order stays the same
# as before; only the (previously wrong) dummy names change.
data_enc = pd.get_dummies(data, columns=['sex', 'smoker', 'region'])

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric features; every other column except the target is kept
# unchanged and appended to the right of the scaled block.
# NOTE(review): the scaler is fitted on all rows before any train/test split is
# made, so held-out rows contribute to the scaling statistics — consider fitting
# on the training portion only.
scaler = StandardScaler()
num_cols = scaler.fit_transform(data_enc.loc[:, ['age', 'bmi', 'children']])
cat_cols = data_enc.drop(['age', 'bmi', 'children', 'charges'], axis=1).to_numpy()

# Feature matrix (scaled numerics first, then the remaining columns) and target.
X = np.hstack((num_cols, cat_cols))
y = data_enc.loc[:, 'charges']

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

reg = XGBRegressor(random_state=23, n_estimators=50, learning_rate=0.1,
                   eval_metric='rmse')

# `model` is whatever fit() returns; predictions below are made through it.
model = reg.fit(X_train, y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

from sklearn.metrics import mean_squared_error

# Take the square root explicitly instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, while sqrt(MSE) yields
# the same RMSE on every version.
print(f'test_rmse: {np.sqrt(mean_squared_error(y_test, test_pred))}')
print(f'train_rmse: {np.sqrt(mean_squared_error(y_train, train_pred))}')

test_rmse: 4573.38967835942
train_rmse: 2902.7569778724433


# Classification with XGBoost

In [8]:
from xgboost import XGBClassifier


from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score,recall_score,f1_score,confusion_matrix

In [9]:
# Load the weather training and test CSVs (paths relative to the working directory).
train_weather = pd.read_csv('weather_train.csv')
test_weather = pd.read_csv('weather_test.csv')

In [10]:
# Show the column names of the training frame.
train_weather.columns

Index(['Unnamed: 0', 'Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall',
       'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am',
       'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am',
       'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm',
       'RainToday', 'RainTomorrow', 'WindGustDir_E', 'WindGustDir_ENE',
       'WindGustDir_ESE', 'WindGustDir_N', 'WindGustDir_NE', 'WindGustDir_NNE',
       'WindGustDir_NNW', 'WindGustDir_NW', 'WindGustDir_S', 'WindGustDir_SE',
       'WindGustDir_SSE', 'WindGustDir_SSW', 'WindGustDir_SW', 'WindGustDir_W',
       'WindGustDir_WNW', 'WindGustDir_WSW', 'WindDir9am_E', 'WindDir9am_ENE',
       'WindDir9am_ESE', 'WindDir9am_N', 'WindDir9am_NE', 'WindDir9am_NNE',
       'WindDir9am_NNW', 'WindDir9am_NW', 'WindDir9am_S', 'WindDir9am_SE',
       'WindDir9am_SSE', 'WindDir9am_SSW', 'WindDir9am_SW', 'WindDir9am_W',
       'WindDir9am_WNW', 'WindDir9am_WSW', 'WindDir3pm_E', 'WindDir3pm_ENE',
       'WindDir3pm_ESE', 'Wi

In [11]:
# Build feature frames and binary targets from the weather data.
# The saved index column, the date, the location and the target itself are
# excluded from the features.
xtrain = train_weather.drop(['Unnamed: 0', 'Date', 'Location', 'RainTomorrow'], axis=1)
xtest = test_weather.drop(['Unnamed: 0', 'Date', 'Location', 'RainTomorrow'], axis=1)

# NOTE(review): 'yes' (case-insensitive) is encoded as 0 and every other value as 1,
# so class 1 means "no rain tomorrow". Metrics that treat class 1 as the positive
# class will therefore describe the no-rain class — confirm this is intended.
ytrain = np.array([int(label.lower() != 'yes') for label in train_weather['RainTomorrow']])
ytest = np.array([int(label.lower() != 'yes') for label in test_weather['RainTomorrow']])

In [12]:
# Fit a 600-round boosted classifier, then predict on both splits so that
# training and test metrics can be compared in the following cells.
classifier = XGBClassifier(
    n_estimators=600,
    learning_rate=0.1,
    random_state=0,
)
model_classifier = classifier.fit(xtrain, ytrain)

train_prediction = model_classifier.predict(xtrain)
test_prediction = model_classifier.predict(xtest)

In [13]:
# Compare accuracy on the training and test splits; a large gap suggests overfitting.
print(f'train accuracy: {accuracy_score(ytrain, train_prediction)}')
print(f'test accuracy: {accuracy_score(ytest, test_prediction)}')

train accuracy: 0.9044262970656753
test accuracy: 0.8490416839251421


In [14]:
# F1 score on the test split.
# NOTE(review): with scikit-learn's default positive label (1) and the encoding
# used for ytest, this presumably scores the "no rain" class — confirm.
f1_score(ytest,  test_prediction)

0.906600191039769

In [15]:
# Confusion matrix on the test split (true labels first, predictions second).
confusion_matrix(ytest, test_prediction)

array([[ 3091,  2855],
       [ 1154, 19457]], dtype=int64)

In [16]:
# Precision, then recall, on the test split (both use the default positive label).
print(precision_score(ytest, test_prediction))
print(recall_score(ytest, test_prediction))

0.8720419505198996
0.9440104798408617


### Cross-validation

In [17]:
from sklearn.model_selection import KFold


In [36]:
# Stack the train and test weather frames into one dataset for cross-validation.
# NOTE: this rebinds `data`, `X` and `y`, which earlier cells used for the
# insurance regression; re-running those cells after this one changes them back.
frames = [train_weather, test_weather]
data = pd.concat(frames)

# Features: everything except the target, the saved index column, date and location.
X = data.drop(['RainTomorrow', 'Unnamed: 0', 'Date', 'Location'], axis=1)

# Target as a one-column frame: 0 where RainTomorrow is 'yes' (case-insensitive), else 1.
y = pd.DataFrame(
    {'RainTomorrow': np.array([int(label.lower() != 'yes') for label in data['RainTomorrow']])}
)

In [40]:
# 10-fold splitter; no shuffling or seed is requested here.
# NOTE(review): without shuffling, folds follow row order — if the rows are grouped
# (e.g. by location or date) each fold holds out a contiguous group; confirm intended.
kfold = KFold(n_splits=10)


In [20]:
def train_and_evaluate(X_train, y_train, X_test, y_test, **params):
    """Fit an XGBClassifier and report its accuracy on both splits.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels used for the test accuracy.
    **params
        Extra keyword arguments forwarded to ``XGBClassifier``. They take
        precedence over the defaults ``random_state=42`` and ``n_jobs=-1``.

    Returns
    -------
    tuple
        ``(model, train_accuracy, test_accuracy)``.
    """
    # Merge defaults with caller overrides instead of passing both explicitly:
    # supplying random_state or n_jobs through **params previously raised
    # "got multiple values for keyword argument".
    model_kwargs = {'random_state': 42, 'n_jobs': -1, **params}
    model = XGBClassifier(**model_kwargs)
    model.fit(X_train, y_train)

    # accuracy_score is documented as (y_true, y_pred); accuracy is symmetric,
    # so the reported values are the same as with the swapped order.
    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    return model, train_accuracy, test_accuracy

In [41]:
# Train one model per fold and keep all of them for later inspection.
models = []

for train_idxs, val_idxs in kfold.split(X):
    # Positional row selection for this fold's training and validation parts.
    X_train, X_test = X.iloc[train_idxs], X.iloc[val_idxs]
    y_train, y_test = y.iloc[train_idxs], y.iloc[val_idxs]

    model, train_accuracy, test_accuracy = train_and_evaluate(
        X_train, y_train, X_test, y_test, max_depth=4, n_estimators=20
    )
    models.append(model)
    print('Train Accuracy: {}, Test Accuracy: {}'.format(train_accuracy, test_accuracy))

Train Accuracy: 0.8486181768183693, Test Accuracy: 0.8587927952701774
Train Accuracy: 0.8521930427608965, Test Accuracy: 0.8264127595215179
Train Accuracy: 0.8489466367233451, Test Accuracy: 0.8439433521242954
Train Accuracy: 0.8510319751898193, Test Accuracy: 0.8284064347586966
Train Accuracy: 0.8507111538872848, Test Accuracy: 0.8371373573490994
Train Accuracy: 0.8491834333990254, Test Accuracy: 0.8523305376048398
Train Accuracy: 0.8478772323815634, Test Accuracy: 0.8529492644025849
Train Accuracy: 0.8484348503597782, Test Accuracy: 0.8523305376048398
Train Accuracy: 0.8500618726797745, Test Accuracy: 0.8477244603327375
Train Accuracy: 0.8500924270895397, Test Accuracy: 0.8424996562628901


In [None]:
# Scratch example: arithmetic mean of a hand-picked sample.
# NOTE(review): `age` is not used in this cell — presumably the population the
# sample was drawn from; confirm or remove.
age = [1,2,3,4,5,6,7,8,9,10]
sample_1 = [1,4,5,2,3,8,9,10]
sum(sample_1)/len(sample_1)



5.25

In [34]:
# Display the feature frame.
# NOTE(review): the recorded output below still shows 'Unnamed: 0', 'Date' and
# 'Location', which the cell building X drops — the output looks stale; re-run to confirm.
X

Unnamed: 0.1,Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,0,2008-12-01,Albury,13.4,22.9,0.6,4.8,8.4,44.0,20.0,...,0,0,0,0,0,0,0,0,1,0
1,1,2008-12-02,Albury,7.4,25.1,0.0,4.8,8.4,44.0,4.0,...,0,0,0,0,0,0,0,0,0,1
2,2,2008-12-03,Albury,12.9,25.7,0.0,4.8,8.4,46.0,19.0,...,0,0,0,0,0,0,0,0,0,1
3,3,2008-12-04,Albury,9.2,28.0,0.0,4.8,8.4,24.0,11.0,...,0,0,0,0,0,0,0,0,0,0
4,4,2008-12-05,Albury,17.5,32.3,1.0,4.8,8.4,41.0,7.0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26552,145455,2017-06-21,Uluru,2.8,23.4,0.0,4.8,8.4,31.0,13.0,...,0,0,0,0,0,0,0,0,0,0
26553,145456,2017-06-22,Uluru,3.6,25.3,0.0,4.8,8.4,22.0,13.0,...,0,0,0,0,0,0,0,0,0,0
26554,145457,2017-06-23,Uluru,5.4,26.9,0.0,4.8,8.4,37.0,9.0,...,0,0,0,0,0,0,0,0,1,0
26555,145458,2017-06-24,Uluru,7.8,27.0,0.0,4.8,8.4,28.0,13.0,...,0,0,0,0,0,0,0,0,0,0
