In [3]:
#pip install xgboost
#pip install imblearn

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-1.7.5-py3-none-win_amd64.whl (70.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.5
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Program Files\Python310\python.exe -m pip install --upgrade pip' command.


In [1]:
import numpy as np
import pandas as pd

# import libraries for plotting
import matplotlib.pyplot as plt
import matplotlib.style as style
style.use('ggplot')
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer 

# hiding warnings
import warnings

warnings.filterwarnings('ignore')

from patsy import dmatrices
from sklearn.preprocessing import LabelEncoder

# models
from xgboost import XGBClassifier

from sklearn.metrics import classification_report,f1_score


In [2]:
# importing dataset
data = './weatherAUS.csv'

df = pd.read_csv(data)

df.shape

(145460, 23)

In [3]:
df.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

# features

In [4]:
df= df.drop(['Location','Date'],axis=1)

In [5]:
categorical_vars = [var for var in df.columns if df[var].dtype=='O']
print('Number of variables: ',len(categorical_vars))
print('Variables :', categorical_vars)

Number of variables:  5
Variables : ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']


In [6]:
for col in categorical_vars:
    df[col] = df[col].replace(np.nan,df[col].mode()[0])

In [7]:
num_cols = []
for col in df:
    if df[col].dtype == 'float64':
        num_cols.append(col)
print('Numerical Cols:',num_cols)

Numerical Cols: ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']


# Null treatment

In [8]:
df[categorical_vars].isnull().sum()

WindGustDir     0
WindDir9am      0
WindDir3pm      0
RainToday       0
RainTomorrow    0
dtype: int64

In [138]:
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn import set_config
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

set_config(display='diagram')


def numeric_imputer(x):
    # After checking the normalization
    # Replacing NaN values to Mean of respective column

    for col in num_cols:
        if col == 'Rainfall':
            x[col] = x[col].replace(np.nan, df[col].median())
        else:
            x[col] = x[col].replace(np.nan, df[col].mean())

    return x

c_vars = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

def dummies_generator(x):
    for index,row in x.iterrows():
        if df['Rainfall'][index] > 0:
            x['RainToday'][index] = 1
        else: 
            x['RainToday'][index] = 0
    x = pd.get_dummies(data=x, columns=x.columns,drop_first=True)
    return x

def outliers_remover(x):
    for i in num_cols:
        IQR= x[i].quantile(.75)-x[i].quantile(.25)
        lower=x[i].quantile(.25) - (1.5 * IQR)
        upper=x[i].quantile(.75) + (1.5 * IQR)
        x[i]=np.where(x[i]<lower,lower,x[i])
        x[i]=np.where(x[i]>upper,upper,x[i])
    return x


In [163]:
from sklearn.model_selection import train_test_split, GridSearchCV


# Preprocessing Pipelines + column Transformer + Classifier
# numeric_processor = Pipeline(steps=[('imputation-mean', SimpleImputer(strategy='median',fill_value=np.nan)), ('scaler',StandardScaler())])
numeric_processor = Pipeline(steps=
                             [
                                 ('imputation-mean', FunctionTransformer(numeric_imputer)),
                                 ('outlier-remover',FunctionTransformer(outliers_remover)),
                                 ('scaler',StandardScaler())
                            ])
categorical_processor = Pipeline(steps=
                                 [
                                     ('dummies', FunctionTransformer(dummies_generator)),
                                     ('one-hot-encoder',OneHotEncoder(handle_unknown='ignore')),
                                ])

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical',numeric_processor, num_cols),
        ('categorical',categorical_processor,c_vars)
    ],
    remainder='passthrough'
    )

pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=166))
])

x = df.drop(['RainTomorrow'],axis=1)
# y = LabelEncoder().fit_transform(df[['RainTomorrow']])
y2 = LabelEncoder().fit_transform(df[['RainTomorrow']])

X_train,X_test,y_train,y_test= train_test_split(x,y2,random_state=166,test_size=.30)

pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print('Train fit',pipe.score(X_train,y_train))
print('Test fit: ',pipe.score(X_test,y_test))
print('F1 score:',f1_score(y_pred,y_test))

Train fit 0.8841900571585708
Test fit:  0.8559512351620148
F1 score: 0.6199975819127072


# Tunning

In [157]:
# param_grid = {
#     'classifier__n_estimators': [50, 100, 200],
#     'classifier__max_depth': [5,8,10],
#     'classifier__learning_rate': [0.01, 0.05, 0.08, 0.1],
#     'classifier__colsample_bytree':[0.8,1.0],
#     'classifier__subsample':[0.9,0.1]
# }

param_grid = {
    'classifier__colsample_bytree': [0.8],
    'classifier__learning_rate': [0.05],
    'classifier__max_depth': [10],
    'classifier__n_estimators': [200],
    'classifier__subsample': [0.9]
 }

gcv = GridSearchCV(estimator = pipe, param_grid = param_grid, n_jobs=-1, scoring = 'f1_score',cv=3).fit(X_train,y_train)
print('Train fit',gcv.score(X_train,y_train))
print('Test fit:',gcv.score(X_test,y_test))
print('F1 score:',f1_score(gcv.predict(X_test),y_test))

Train fit 0.9704576075903819
Test fit: 0.8872471932543673
F1 score: 0.6217508582638548


In [158]:
# {'classifier__colsample_bytree': 0.8,
#  'classifier__learning_rate': 0.05,
#  'classifier__max_depth': 10,
#  'classifier__n_estimators': 200,
#  'classifier__subsample': 0.9}
gcv.best_params_

{'classifier__colsample_bytree': 0.8,
 'classifier__learning_rate': 0.05,
 'classifier__max_depth': 10,
 'classifier__n_estimators': 200,
 'classifier__subsample': 0.9}

# Oversamplling with SMOTE

In [173]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import imblearn

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical',numeric_processor, num_cols),
        ('categorical',categorical_processor,c_vars)
    ],
    remainder='passthrough'
    )

pipe = imblearn.pipeline.Pipeline(
    [
        ('preprocessor', preprocessor),
        ('sampler', SMOTE()),
        ('classifier', XGBClassifier(random_state=166))
    ]
)

X_train,X_test,y_train,y_test= train_test_split(x,y2,random_state=166,test_size=.30)

param_grid = {
    'classifier__colsample_bytree': [0.8],
    'classifier__learning_rate': [0.05],
    'classifier__max_depth': [10],
    'classifier__n_estimators': [200],
    'classifier__subsample': [0.9]
 }

gcv = GridSearchCV(estimator = pipe, param_grid = param_grid, n_jobs=-1, scoring = 'f1').fit(X_train,y_train)
print('Train fit',gcv.score(X_train,y_train))
print('Test fit:',gcv.score(X_test,y_test))
print('F1 score:',f1_score(gcv.predict(X_test),y_test))

Train fit 0.7362044440084368
Test fit: 0.6295488462863047
F1 score: 0.6295488462863047


# Undersampling with RandomUnderSampler

In [174]:
from imblearn.under_sampling import RandomUnderSampler

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical',numeric_processor, num_cols),
        ('categorical',categorical_processor,c_vars)
    ],
    remainder='passthrough'
    )

pipe = imblearn.pipeline.Pipeline(
    [
        ('preprocessor', preprocessor),
        ('sampler', RandomUnderSampler()),
        ('classifier', XGBClassifier(random_state=166))
    ]
)

X_train,X_test,y_train,y_test= train_test_split(x,y2,random_state=166,test_size=.30)

param_grid = {
    'classifier__colsample_bytree': [0.8],
    'classifier__learning_rate': [0.05],
    'classifier__max_depth': [10],
    'classifier__n_estimators': [200],
    'classifier__subsample': [0.9]
 }

gcv = GridSearchCV(estimator = pipe, param_grid = param_grid, n_jobs=-1, scoring = 'f1').fit(X_train,y_train)
print('Train fit',gcv.score(X_train,y_train))
print('Test fit:',gcv.score(X_test,y_test))
print('F1 score:',f1_score(gcv.predict(X_test),y_test))

Train fit 0.7407486898821342
Test fit: 0.6317333892265773
F1 score: 0.6317333892265773


# Removing all null values and model with undersampling

In [179]:
data = './weatherAUS.csv'

df = pd.read_csv(data)
df = df.dropna(axis=0)
df = df.drop(['Date','Location'],axis=1)
df

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
145456,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
145457,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No
145458,2017-06-24,Uluru,7.8,27.0,0.0,,,SE,28.0,SSE,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,No


Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
6049,17.9,35.2,0.0,12.0,12.3,SSW,48.0,ENE,SW,6.0,...,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,No,No
6050,18.4,28.9,0.0,14.8,13.0,S,37.0,SSE,SSE,19.0,...,30.0,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,No,No
6052,19.4,37.6,0.0,10.8,10.6,NNE,46.0,NNE,NNW,30.0,...,42.0,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,No,No
6053,21.9,38.4,0.0,11.4,12.2,WNW,31.0,WNW,WSW,6.0,...,37.0,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,No,No
6054,24.2,41.0,0.0,11.2,8.4,WNW,35.0,NW,WNW,17.0,...,19.0,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142298,19.3,33.4,0.0,6.0,11.0,ENE,35.0,SE,NE,9.0,...,63.0,32.0,1013.9,1010.5,0.0,1.0,24.5,32.3,No,No
142299,21.2,32.6,0.0,7.6,8.6,E,37.0,SE,SE,13.0,...,56.0,28.0,1014.6,1011.2,7.0,0.0,24.8,32.0,No,No
142300,20.7,32.8,0.0,5.6,11.0,E,33.0,E,W,17.0,...,46.0,23.0,1015.3,1011.8,0.0,0.0,24.8,32.1,No,No
142301,19.5,31.8,0.0,6.2,10.6,ESE,26.0,SE,NNW,9.0,...,62.0,58.0,1014.9,1010.7,1.0,1.0,24.8,29.2,No,No


In [191]:
from imblearn.under_sampling import RandomUnderSampler

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical',numeric_processor, num_cols),
        ('categorical',categorical_processor,c_vars)
    ],
    remainder='passthrough'
    )

pipe = imblearn.pipeline.Pipeline(
    [
        ('preprocessor', preprocessor),
        ('sampler', RandomUnderSampler()),
        ('classifier', XGBClassifier(random_state=166))
    ]
)

x = df.drop(['RainTomorrow'],axis=1)
y2 = LabelEncoder().fit_transform(df[['RainTomorrow']])

X_train,X_test,y_train,y_test= train_test_split(x,y2,random_state=166,test_size=.30)

param_grid = {
    'classifier__colsample_bytree': [0.8],
    'classifier__learning_rate': [0.05],
    'classifier__max_depth': [10],
    'classifier__n_estimators': [200],
    'classifier__subsample': [0.9]
 }

gcv = GridSearchCV(estimator = pipe, param_grid = param_grid, n_jobs=-1, scoring = 'f1',cv=5).fit(X_train,y_train)
print('Train fit',gcv.score(X_train,y_train))
print('Test fit:',gcv.score(X_test,y_test))
print('F1 score:',f1_score(gcv.predict(X_test),y_test))

Train fit 0.7910378668641793
Test fit: 0.6493951397066696
F1 score: 0.6493951397066696


In [192]:
print(classification_report(gcv.predict(X_test),y_test))

              precision    recall  f1-score   support

           0       0.81      0.94      0.87     11334
           1       0.81      0.54      0.65      5592

    accuracy                           0.81     16926
   macro avg       0.81      0.74      0.76     16926
weighted avg       0.81      0.81      0.79     16926

