
### Gradient Boosting with XGboost

In [1]:
from xgboost import XGBClassifier, XGBRegressor


import pandas as pd
import numpy as np


import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

sns.set_style('darkgrid')
sns.set_palette('husl')

In [2]:
data = pd.read_csv('insurance.csv')

In [3]:
data.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [4]:
data.head(4)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061


### Assignment 

Perform an extensive exploratory data analysis on this dataset

In [5]:
data_enc = pd.get_dummies(data, ['sex','region','smoker'])

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

num_cols = scaler.fit_transform(data_enc[['age','bmi','children']])
cat_cols = data_enc.drop(columns=['age','bmi','children','charges']).values

X = np.concatenate((num_cols,cat_cols), axis= 1)
y = data_enc['charges']

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

reg = XGBRegressor(random_state= 23, n_estimators = 50,learning_rate = 0.1,
                  eval_metric = 'rmse')

model = reg.fit(X_train, y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

from sklearn.metrics import mean_squared_error


print(f'test_rmse: {mean_squared_error(y_test, test_pred, squared= False)}')
print(f'train_rmse: {mean_squared_error(y_train, train_pred, squared=False)}')

test_rmse: 4573.38967835942
train_rmse: 2902.7569778724433


# Classification with Xgboost

In [57]:
from xgboost import XGBClassifier


from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score,recall_score,f1_score,confusion_matrix

In [58]:
train_weather = pd.read_csv('weather_train.csv')
test_weather = pd.read_csv('weather_test.csv')

In [59]:
train_weather.columns

Index(['Unnamed: 0', 'Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall',
       'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am',
       'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am',
       'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm',
       'RainToday', 'RainTomorrow', 'WindGustDir_E', 'WindGustDir_ENE',
       'WindGustDir_ESE', 'WindGustDir_N', 'WindGustDir_NE', 'WindGustDir_NNE',
       'WindGustDir_NNW', 'WindGustDir_NW', 'WindGustDir_S', 'WindGustDir_SE',
       'WindGustDir_SSE', 'WindGustDir_SSW', 'WindGustDir_SW', 'WindGustDir_W',
       'WindGustDir_WNW', 'WindGustDir_WSW', 'WindDir9am_E', 'WindDir9am_ENE',
       'WindDir9am_ESE', 'WindDir9am_N', 'WindDir9am_NE', 'WindDir9am_NNE',
       'WindDir9am_NNW', 'WindDir9am_NW', 'WindDir9am_S', 'WindDir9am_SE',
       'WindDir9am_SSE', 'WindDir9am_SSW', 'WindDir9am_SW', 'WindDir9am_W',
       'WindDir9am_WNW', 'WindDir9am_WSW', 'WindDir3pm_E', 'WindDir3pm_ENE',
       'WindDir3pm_ESE', 'Wi

In [60]:
# split the data
xtrain = train_weather.drop(columns=['Unnamed: 0', 'Date', 'Location','RainTomorrow'])
xtest = test_weather.drop(columns=['Unnamed: 0', 'Date', 'Location','RainTomorrow'])
ytrain = np.array([0 if x.lower() == 'yes'else 1 for x in train_weather['RainTomorrow']])
ytest = np.array([0 if x.lower() == 'yes'else 1 for x in test_weather['RainTomorrow']])

In [72]:
classifier = XGBClassifier(random_state = 0, learning_rate = 0.1, n_estimators = 600)

model_classifier = classifier.fit(xtrain, ytrain)

test_prediction = model_classifier.predict(xtest)
train_prediction = model_classifier.predict(xtrain)

In [73]:
print(f'train accuracy: {accuracy_score(ytrain, train_prediction)}')
print(f'test accuracy: {accuracy_score(ytest, test_prediction)}')

train accuracy: 0.9044262970656753
test accuracy: 0.8490416839251421


In [74]:

f1_score(ytest,  test_prediction)

0.906600191039769

In [64]:
confusion_matrix(ytest, test_prediction)

array([[ 2885,  3061],
       [ 1050, 19561]], dtype=int64)

In [75]:
print(precision_score(ytest, test_prediction))
print(recall_score(ytest, test_prediction))

0.8720419505198996
0.9440104798408617


In [None]:
conf