<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Importing-Libraries" data-toc-modified-id="Importing-Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Importing Libraries</a></span></li><li><span><a href="#preprocessing" data-toc-modified-id="preprocessing-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>preprocessing</a></span></li><li><span><a href="#Regression-with-XGBoost" data-toc-modified-id="Regression-with-XGBoost-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Regression with XGBoost</a></span></li><li><span><a href="#XGBoost-Classifier" data-toc-modified-id="XGBoost-Classifier-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>XGBoost Classifier</a></span></li></ul></div>

## Importing Libraries

In [1]:
import pandas as pd 
import numpy as np

## preprocessing

In [2]:
# Load the bike-sharing training set from a path relative to the notebook
# and preview the first rows.
data = pd.read_csv("Datafiles/biketrain.csv")
data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [3]:
# NOTE(review): the stdlib `datetime` module is imported here but not
# referenced by any cell visible in this notebook.
import datetime
# Parse the timestamp strings into pandas datetime values.
data["datetime"] = pd.to_datetime(data["datetime"])

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  count       10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(8)
memory usage: 1020.7 KB


In [5]:
# Count the missing values in each column.
data.isnull().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64

In [6]:
# Remove the timestamp column so only numeric columns are left for modelling.
# NOTE(review): this also throws away hour/weekday/month information; consider
# deriving such features from `datetime` before dropping it -- confirm intent.
data = data.drop("datetime", axis=1)

In [7]:
# Drop the two partial rider counts. In the preview rows they add up to
# `count` (e.g. 3 + 13 = 16), so keeping them would leak the target.
data = data.drop(["casual","registered"],axis=1)

In [8]:
data.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,1,0,0,1,9.84,14.395,81,0.0,16
1,1,0,0,1,9.02,13.635,80,0.0,40
2,1,0,0,1,9.02,13.635,80,0.0,32
3,1,0,0,1,9.84,14.395,75,0.0,13
4,1,0,0,1,9.84,14.395,75,0.0,1


## Regression with XGBoost

In [9]:
# Split features and target by column name rather than by position, so the
# split stays correct even if the column order of `data` ever changes.
# `count` is the last column at this point, so the result is unchanged.
X = data.drop(columns="count")
y = data["count"]

In [10]:
y.head()

0    16
1    40
2    32
3    13
4     1
Name: count, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

# Hold out a test set using the library-default test fraction; the fixed seed
# makes the shuffle reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=33)

# Report the shape of each piece: train features/target, then test features/target.
for part in (xtrain, ytrain, xtest, ytest):
    print(part.shape)

(8164, 8)
(8164,)
(2722, 8)
(2722,)


In [12]:
from xgboost import XGBRegressor
# Fit a gradient-boosted regressor, constructed with no arguments (library
# defaults), on the training split.
xg_reg = XGBRegressor()
xg_reg.fit(xtrain,ytrain)

In [13]:
# Predict ride counts for the held-out test features and display them.
y_pred = xg_reg.predict(xtest)
y_pred

array([222.1306 , 128.99864,  71.72782, ..., 163.48778, 156.50006,
        83.42532], dtype=float32)

In [14]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the hold-out predictions against the true counts.
mse = mean_squared_error(ytest, y_pred)
print(f"MSE : {mse:.2f}")

MSE : 22771.10


In [15]:
# Square root of the hold-out MSE, i.e. the error in the units of `count`.
rmse = np.sqrt(mse)
print(f"RMSE : {rmse:.2f}")

RMSE : 150.90


In [16]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of a fresh regressor on the full feature matrix.
# The scorer is "neg_mean_squared_error", so every fold value is the MSE
# with its sign flipped (all entries are negative).
model = XGBRegressor(objective="reg:squarederror")
scores = cross_val_score(model, X,y,scoring="neg_mean_squared_error",cv=10)
scores

array([-14062.21914973, -16220.34062812, -35346.35043826, -20529.8327947 ,
       -23036.44015503, -18510.06277223, -40926.62158632, -40486.00420289,
       -48493.15466064, -35862.59428839])

In [17]:
# Undo the sign flip and take the root to get one RMSE per fold.
# NOTE: this rebinds `rmse` -- previously the scalar hold-out RMSE -- to an
# array of per-fold values.
rmse = np.sqrt(-scores)
print(rmse)

[118.58422808 127.35910108 188.00625106 143.2823534  151.77760097
 136.05169154 202.30329109 201.21134213 220.21161336 189.37421759]


In [18]:
type(rmse)

numpy.ndarray

In [19]:
# Best and worst per-fold RMSE, taken with the array's own reductions.
print("min: ", rmse.min())
print("max: ", rmse.max())

min:  118.58422808167883
max:  220.21161336461094


## XGBoost Classifier

In [20]:
# Load the census income data set from a path relative to the notebook and
# preview the first rows.
df = pd.read_csv("Datafiles/census.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [21]:
df.shape

(32561, 15)

In [22]:
# Drop every row in which any column holds the '?' placeholder.
# .copy() makes the filtered result an independent frame, so later cells that
# modify `df` do not operate on a slice of the original (this is what pandas'
# SettingWithCopyWarning is about).
keep_rows = (df != '?').all(axis=1)
df = df[keep_rows].copy()
df.shape

(30162, 15)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 1 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             30162 non-null  int64 
 1   workclass       30162 non-null  object
 2   fnlwgt          30162 non-null  int64 
 3   education       30162 non-null  object
 4   education.num   30162 non-null  int64 
 5   marital.status  30162 non-null  object
 6   occupation      30162 non-null  object
 7   relationship    30162 non-null  object
 8   race            30162 non-null  object
 9   sex             30162 non-null  object
 10  capital.gain    30162 non-null  int64 
 11  capital.loss    30162 non-null  int64 
 12  hours.per.week  30162 non-null  int64 
 13  native.country  30162 non-null  object
 14  income          30162 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [24]:
df["workclass"].value_counts()

Private             22286
Self-emp-not-inc     2499
Local-gov            2067
State-gov            1279
Self-emp-inc         1074
Federal-gov           943
Without-pay            14
Name: workclass, dtype: int64

In [25]:
# `education` appears to be the text label of the level already encoded as a
# number in `education.num` (e.g. HS-grad <-> 9 in the preview rows), so only
# the numeric column is kept.
# Reassigning (instead of inplace=True) and tolerating an already-missing
# column lets this cell be re-run without raising KeyError.
df = df.drop(columns=['education'], errors='ignore')

In [26]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,Private,132870,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
3,54,Private,140359,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K


In [27]:
df['income'].value_counts()

<=50K    22654
>50K      7508
Name: income, dtype: int64

In [28]:
df.shape

(30162, 14)

In [29]:
# One-hot encode every categorical column, creating one indicator column per
# label. The target `income` is encoded as well, which produces the
# `income_<=50K` and `income_>50K` columns seen at the end of the frame.
df = pd.get_dummies(df)
df.head()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,...,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia,income_<=50K,income_>50K
1,82,132870,9,0,4356,18,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
3,54,140359,4,0,3900,40,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
4,41,264663,10,0,3900,40,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
5,34,216864,9,0,3770,45,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
6,38,150601,6,0,3770,40,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0


In [30]:
df.columns

Index(['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss',
       'hours.per.week', 'workclass_Federal-gov', 'workclass_Local-gov',
       'workclass_Private', 'workclass_Self-emp-inc',
       'workclass_Self-emp-not-inc', 'workclass_State-gov',
       'workclass_Without-pay', 'marital.status_Divorced',
       'marital.status_Married-AF-spouse', 'marital.status_Married-civ-spouse',
       'marital.status_Married-spouse-absent', 'marital.status_Never-married',
       'marital.status_Separated', 'marital.status_Widowed',
       'occupation_Adm-clerical', 'occupation_Armed-Forces',
       'occupation_Craft-repair', 'occupation_Exec-managerial',
       'occupation_Farming-fishing', 'occupation_Handlers-cleaners',
       'occupation_Machine-op-inspct', 'occupation_Other-service',
       'occupation_Priv-house-serv', 'occupation_Prof-specialty',
       'occupation_Protective-serv', 'occupation_Sales',
       'occupation_Tech-support', 'occupation_Transport-moving',
       'rela

In [31]:
df['income_<=50K'].value_counts()

1    22654
0     7508
Name: income_<=50K, dtype: int64

In [32]:
# `income_<=50K` is the exact complement of `income_>50K`, so a single
# indicator is enough; keep `income_>50K` as the target.
# Reassigning (instead of inplace=True) with errors='ignore' keeps the cell
# re-runnable: a second run no longer raises KeyError.
print(df.shape)
df = df.drop(columns=['income_<=50K'], errors='ignore')
print(df.shape)

(30162, 90)
(30162, 89)


In [33]:
df['income_>50K'].value_counts()

0    22654
1     7508
Name: income_>50K, dtype: int64

In [34]:
# Split features and target by column name rather than by position, so the
# split does not depend on `income_>50K` happening to be the last column.
y = df['income_>50K']
X = df.drop(columns='income_>50K')

from sklearn.model_selection import train_test_split

# Hold-out split with a fixed seed for reproducibility.
# NOTE(review): no later cell visible here uses this split -- the
# cross-validation below works on the full X and y.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=33)
for part in (xtrain, ytrain, xtest, ytest):
    print(part.shape)

(22621, 88)
(22621,)
(7541, 88)
(7541,)


In [35]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
def cross_val(classifier, num_splits=10):
    """Print the per-fold and mean cross-validation scores of ``classifier``.

    Parameters
    ----------
    classifier : estimator
        Unfitted estimator handed straight to ``cross_val_score``.
    num_splits : int, default 10
        Number of cross-validation folds, forwarded as ``cv``. Previously this
        argument was ignored and 10 folds were always used.

    Notes
    -----
    Uses the notebook-level ``X`` and ``y``. No ``scoring`` is passed, so the
    estimator's default scorer is applied. Nothing is returned; the rounded
    per-fold scores and their mean are printed.
    """
    scores = cross_val_score(classifier, X, y, cv=num_splits)
    print(np.round(scores, 2))
    print(scores.mean())

In [36]:
cross_val(XGBClassifier(n_estimators=10))

[0.62 0.9  0.73 0.81 0.82 0.82 0.83 0.83 0.83 0.81]
0.8012755416037678
