https://www.analyticsvidhya.com/blog/2020/10/overcoming-class-imbalance-using-smote-techniques/

### Let's load the saved data and build different models on top of it

In [8]:
import pandas as pd

# Read the saved train/test CSV files into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in ('data_train.csv', 'data_test.csv'))

# Show (rows, columns) for both frames.
train.shape, test.shape

((99535, 22), (42658, 22))

In [9]:
# Preview the first five rows of the training frame.
train.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,1.0,0.398585,0.357006,0.024798,0.027586,0.482517,17.0,0.341085,16.0,14.0,...,0.515152,0.55,0.471074,0.542071,0.666667,0.75,0.401728,0.337812,1.0,1
1,0.0,0.53066,0.62572,0.0,0.053517,0.837762,1.0,0.124031,3.0,15.0,...,0.484848,0.3,0.676033,0.676375,0.4,0.55,0.602592,0.62572,0.0,0
2,47.0,0.353774,0.639155,0.000539,0.041103,0.675524,3.0,0.224806,4.0,3.0,...,0.525253,0.27,0.707438,0.695793,0.622222,0.85,0.611231,0.639155,0.0,0
3,43.0,0.683962,0.761996,0.0,0.052414,0.685315,2.0,0.395349,1.0,1.0,...,0.525253,0.33,0.598347,0.600324,0.222222,0.5,0.710583,0.75048,0.0,0
4,40.0,0.341981,0.650672,0.0,0.04331,0.742657,15.0,0.255814,17.0,15.0,...,0.464646,0.25,0.52562,0.514563,0.377778,0.1,0.552916,0.641075,0.0,0


In [2]:
# Every column except the last is a feature; the last column is the target.
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1]
X_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1]

### Selecting the top 6 independent features

In [3]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every feature against the target with the chi-squared statistic.
# NOTE: `k=6` only matters for `transform`/`get_support`, which are not used
# here; the top features are picked from the ranked table built below.
ordered_rank_features=SelectKBest(score_func=chi2, k=6)
ordered_feature=ordered_rank_features.fit(X_train,y_train)

# One row per feature, highest chi-squared score first.
features_rank = pd.DataFrame({
    'features': X_train.columns,
    'score': ordered_feature.scores_,
}).sort_values(by='score', ascending=False)

features_rank

Unnamed: 0,features,score
20,RainToday,7528.216717
13,Humidity3pm,1650.814886
17,Cloud3pm,1507.069814
5,Sunshine,1112.666914
16,Cloud9am,1105.214964
6,WindGustDir,1001.546071
3,Rainfall,460.283766
9,WindDir3pm,373.629406
12,Humidity9am,360.872523
7,WindGustSpeed,195.051651


In [4]:
# Names of the six highest-scoring features, followed by the target column.
features = features_rank['features'].head(6).tolist() + ['RainTomorrow']

In [5]:
# Display the selected column names (six features plus the target).
features

['RainToday',
 'Humidity3pm',
 'Cloud3pm',
 'Sunshine',
 'Cloud9am',
 'WindGustDir',
 'RainTomorrow']

In [6]:
# Restrict both frames to the columns listed in `features`.
train, test = train[features], test[features]

# Show (rows, columns) for both reduced frames.
train.shape, test.shape

((99535, 7), (42658, 7))

In [7]:
# Preview the training frame after the column selection.
train.head()

Unnamed: 0,RainToday,Humidity3pm,Cloud3pm,Sunshine,Cloud9am,WindGustDir,RainTomorrow
0,1.0,0.55,0.75,0.482517,0.666667,17.0,1
1,0.0,0.3,0.55,0.837762,0.4,1.0,0
2,0.0,0.27,0.85,0.675524,0.622222,3.0,0
3,0.0,0.33,0.5,0.685315,0.222222,2.0,0
4,0.0,0.25,0.1,0.742657,0.377778,15.0,0


### Defining a function for convenience to display various metrics

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

def build_model(sampler):
    """Train a baseline decision tree, then grid-search a tuned one, printing metrics.

    Reads the notebook-level ``train`` and ``test`` frames and treats the last
    column of each as the target. Only the training split is resampled; the
    test split is always evaluated as-is.

    Parameters
    ----------
    sampler : object or None
        Resampler exposing ``fit_resample(X, y)`` (e.g. ``SMOTE``), or ``None``
        to train on the original class distribution.

    Returns
    -------
    DecisionTreeClassifier
        The grid search's ``best_estimator_``, fitted on the full (possibly
        resampled) training set.

    Notes
    -----
    NOTE(review): the grid search cross-validates on data that has already
    been resampled, so validation folds contain resampled rows and the CV
    recall used for model selection may be optimistic. Resampling inside each
    fold (e.g. via an imblearn pipeline) would avoid this but would change the
    type of the returned object.
    """
    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

    print('Pre Sampling Class Counts', y_train.value_counts(), '--' * 30, sep='\n')

    # Identity check: `None` means "no resampling".
    if sampler is not None:
        X_train, y_train = sampler.fit_resample(X_train, y_train)

    print('Post Sampling Class Counts', y_train.value_counts(), '--' * 30, sep='\n')

    # Baseline: an untuned tree with default hyper-parameters.
    dt = DecisionTreeClassifier(random_state=10)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    print('Pre Optimization Metrics', classification_report(y_test, y_pred), '--' * 60, sep='\n')

    grid_param = {
        'criterion': ['gini', 'entropy'],
        'max_depth': range(3, 22, 3),
        'min_samples_leaf': range(80, 121, 10),
        'min_samples_split': range(100, 301, 50),
        'splitter': ['best', 'random'],
    }

    # Select hyper-parameters by 5-fold cross-validated recall.
    dt = DecisionTreeClassifier(random_state=10)
    grid_search = GridSearchCV(estimator=dt, param_grid=grid_param, scoring='recall', cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_, '--' * 58, sep='\n')

    # GridSearchCV refits the best estimator on the whole training set by
    # default (refit=True), so no further fit is needed here.
    dt_optimized = grid_search.best_estimator_
    y_pred = dt_optimized.predict(X_test)
    print('Post Optimization Metrics', classification_report(y_test, y_pred), '--' * 60, sep='\n')

    return dt_optimized

In [8]:
# Baseline run: no resampling, the model sees the original class imbalance.
model1 = build_model(None)

Pre Sampling Class Counts
0    77255
1    22280
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Post Sampling Class Counts
0    77255
1    22280
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Pre Optimization Metrics
              precision    recall  f1-score   support

           0       0.84      0.83      0.84     33061
           1       0.45      0.47      0.46      9597

    accuracy                           0.75     42658
   macro avg       0.64      0.65      0.65     42658
weighted avg       0.75      0.75      0.75     42658

------------------------------------------------------------------------------------------------------------------------
Fitting 5 folds for each of 700 candidates, totalling 3500 fits
{'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 80, 'min_samples_split': 100, 'splitter': 'best'}
-------------------------------------------------------------------

In [9]:
from imblearn.over_sampling import SMOTE

# Train on data resampled with SMOTE (seeded for reproducibility).
model2 = build_model(SMOTE(random_state=10))

Pre Sampling Class Counts
0    77255
1    22280
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Post Sampling Class Counts
1    77255
0    77255
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Pre Optimization Metrics
              precision    recall  f1-score   support

           0       0.84      0.81      0.83     33061
           1       0.43      0.48      0.45      9597

    accuracy                           0.74     42658
   macro avg       0.64      0.65      0.64     42658
weighted avg       0.75      0.74      0.74     42658

------------------------------------------------------------------------------------------------------------------------
Fitting 5 folds for each of 700 candidates, totalling 3500 fits
{'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 110, 'min_samples_split': 100, 'splitter': 'best'}
------------------------------------------------------------------

In [10]:
from imblearn.combine import SMOTEENN

# Train on data resampled with SMOTEENN (seeded for reproducibility).
model3=build_model(SMOTEENN(random_state=10))

Pre Sampling Class Counts
0    77255
1    22280
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Post Sampling Class Counts
1    55503
0    46986
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Pre Optimization Metrics
              precision    recall  f1-score   support

           0       0.89      0.77      0.82     33061
           1       0.45      0.66      0.53      9597

    accuracy                           0.74     42658
   macro avg       0.67      0.71      0.68     42658
weighted avg       0.79      0.74      0.76     42658

------------------------------------------------------------------------------------------------------------------------
Fitting 5 folds for each of 700 candidates, totalling 3500 fits
{'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 80, 'min_samples_split': 200, 'splitter': 'best'}
---------------------------------------------------------------------

In [11]:
from imblearn.under_sampling import NearMiss

# Train on data resampled with NearMiss using its default settings.
model4=build_model(NearMiss())

Pre Sampling Class Counts
0    77255
1    22280
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Post Sampling Class Counts
0    22280
1    22280
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Pre Optimization Metrics
              precision    recall  f1-score   support

           0       0.82      0.45      0.58     33061
           1       0.26      0.65      0.37      9597

    accuracy                           0.49     42658
   macro avg       0.54      0.55      0.47     42658
weighted avg       0.69      0.49      0.53     42658

------------------------------------------------------------------------------------------------------------------------
Fitting 5 folds for each of 700 candidates, totalling 3500 fits
{'criterion': 'gini', 'max_depth': 18, 'min_samples_leaf': 90, 'min_samples_split': 300, 'splitter': 'best'}
---------------------------------------------------------------------

### From the above results, the SMOTEENN-resampled data gave the best recall, so let's save model3. We also need to save the column names and the min/max values for input validation.

In [13]:
# NOTE(review): this appears to be the notebook-level `X_test` from the first
# split cell, created before the frames were cut down to the selected features
# (the copies inside `build_model` are local) — hence all columns are shown.
X_test

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,8.0,0.695755,0.577735,0.001078,0.017931,0.055944,10.0,0.341085,10.0,10.0,...,0.218391,0.848485,0.80,0.542149,0.545307,0.777778,0.875,0.639309,0.566219,0.0
1,45.0,0.622642,0.788868,0.000000,0.068966,0.615385,6.0,0.224806,4.0,6.0,...,0.195402,0.676768,0.37,0.652893,0.653722,0.333333,0.875,0.663067,0.773512,0.0
2,34.0,0.334906,0.414587,0.000000,0.024552,0.285315,8.0,0.069767,16.0,11.0,...,0.045977,0.979798,0.75,0.814876,0.773463,0.888889,0.750,0.311015,0.404990,0.0
3,15.0,0.455189,0.404990,0.021563,0.027586,0.489510,11.0,0.511628,14.0,11.0,...,0.103448,0.650505,0.50,0.492562,0.585761,0.311111,0.475,0.403888,0.351248,1.0
4,8.0,0.724057,0.700576,0.000000,0.042759,0.741259,5.0,0.139535,10.0,5.0,...,0.149425,0.585859,0.58,0.581818,0.572816,0.555556,0.250,0.775378,0.685221,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42653,3.0,0.386792,0.566219,0.000000,0.027586,0.692308,0.0,0.240310,0.0,1.0,...,0.252874,0.535354,0.28,0.649587,0.616505,0.111111,0.125,0.464363,0.568138,0.0
42654,3.0,0.476415,0.470250,0.000000,0.022069,0.664336,0.0,0.255814,0.0,1.0,...,0.252874,0.646465,0.44,0.730579,0.711974,0.666667,0.875,0.440605,0.464491,0.0
42655,26.0,0.455189,0.520154,0.005930,0.014069,0.377622,2.0,0.193798,16.0,2.0,...,0.252874,0.676768,0.62,0.517355,0.498382,0.533333,0.750,0.488121,0.470250,1.0
42656,38.0,0.375000,0.364683,0.000000,0.024828,0.377622,11.0,0.387597,16.0,11.0,...,0.402299,0.636364,0.60,0.656198,0.699029,0.444444,0.625,0.406048,0.360845,0.0


In [14]:
# Rebuild the hold-out split from the current `test` frame:
# all columns but the last are features, the last one is the target.
X_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1]

# Score the saved SMOTEENN model on the hold-out data.
y_pred = model3.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.73      0.81     33061
           1       0.45      0.74      0.56      9597

    accuracy                           0.74     42658
   macro avg       0.68      0.74      0.69     42658
weighted avg       0.80      0.74      0.76     42658



In [15]:
# Results reproduced; let's look at the predicted class probabilities.

y_proba = model3.predict_proba(X_test)
y_proba

array([[0.07746479, 0.92253521],
       [0.25842697, 0.74157303],
       [0.32692308, 0.67307692],
       ...,
       [0.27624309, 0.72375691],
       [0.75531915, 0.24468085],
       [0.5923913 , 0.4076087 ]])

In [17]:
import pickle

# Serialize model3 to disk; the confirmation is printed once the file is closed.
model_path = 'prediction_service/model/dtclassifier.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(model3, model_file)
print('model file is saved.')

model file is saved.


In [48]:
# NOTE(review): the saved output below lists six names without 'RainTomorrow',
# unlike the earlier display of this list — the kernel state differed when this
# cell ran, so do not rely on the position of the target inside `features`.
features

['RainToday', 'Humidity3pm', 'Cloud3pm', 'Sunshine', 'Cloud9am', 'WindGustDir']

In [51]:
# Save the model's input column names (everything except the target).
# Filter by name instead of slicing off the last element: `features` only ends
# with 'RainTomorrow' if the cell that appends it ran in this kernel session;
# otherwise `features[:-1]` would silently drop a real input column.
feature_columns = [col for col in features if col != 'RainTomorrow']
with open('prediction_service/model/column_names.pkl', 'wb') as f:
    pickle.dump(feature_columns, f)

### We need to take the min and max from the original .csv file, because X_train and train are scaled dataframes.

In [49]:
import pandas as pd

data = pd.read_csv('weatherAUS.csv')

# Pick the input columns by name; `features[:-1]` is only correct when the
# target happens to be the last element of the list in the current kernel.
feature_columns = [col for col in features if col != 'RainTomorrow']

# min/max only exist for numeric columns, so select them explicitly instead of
# relying on describe() to skip the rest.
# NOTE(review): columns stored as text in the raw file (the raw preview shows
# RainToday and WindGustDir as strings) get no entry in the schema; they need
# their own validation rule if the service must check them.
schema = data[feature_columns].select_dtypes(include='number').describe().T[['min', 'max']]
schema.to_json('prediction_service/schema_in.json', orient='index')

In [10]:
# Reload the raw dataset and preview the selected columns as stored in the CSV.
data = pd.read_csv('weatherAUS.csv')
data[features].head()

Unnamed: 0,RainToday,Humidity3pm,Cloud3pm,Sunshine,Cloud9am,WindGustDir,RainTomorrow
0,No,22.0,,,8.0,W,No
1,No,25.0,,,,WNW,No
2,No,30.0,2.0,,,WSW,No
3,No,16.0,,,,NE,No
4,No,33.0,8.0,,7.0,W,No
