https://www.analyticsvidhya.com/blog/2020/10/overcoming-class-imbalance-using-smote-techniques/

### Let's load the saved data and build different models on top of it

In [1]:
import pandas as pd

# Read the pre-split training and test CSVs in a single pass; order matters
# because the two frames are unpacked positionally.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data_train_10pct.csv', 'data_test_10pct.csv')
)

# Last expression of the cell: shows (rows, columns) for both splits.
train.shape, test.shape

((9953, 22), (4266, 22))

### Defining a convenience function to display various metrics

In [43]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

def build_model(sampler):
    """Fit a decision tree before and after tuning and print its metrics.

    Reads the module-level ``train`` and ``test`` DataFrames, using the last
    column of each as the target and every preceding column as a feature.
    The training split is optionally resampled with ``sampler``; the test
    split is never resampled.

    Parameters
    ----------
    sampler : object or None
        Any object exposing ``fit_resample(X, y)`` (for example an
        imbalanced-learn sampler), or ``None`` to train on the data as-is.

    Returns
    -------
    None
        Everything is reported through ``print``: class counts before and
        after sampling, the classification report of an untuned tree, the
        best grid-search parameters, and the classification report of the
        tuned tree.
    """
    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

    print('Pre Sampling Class Counts', y_train.value_counts(), '--' * 30, sep='\n')

    # Identity check rather than `== None`: equality can be overridden by the
    # object being compared, identity cannot.
    if sampler is not None:
        X_train, y_train = sampler.fit_resample(X_train, y_train)

    print('Post Sampling Class Counts', y_train.value_counts(), '--' * 30, sep='\n')

    # Baseline: an untuned tree trained on the (possibly resampled) data.
    dt = DecisionTreeClassifier(random_state=10)
    dt.fit(X_train, y_train)

    y_pred = dt.predict(X_test)
    print('Pre Optimization Metrics', classification_report(y_test, y_pred), '--' * 60, sep='\n')

    grid_param = {
        'criterion': ['gini', 'entropy'],
        'max_depth': range(3, 22, 3),
        'min_samples_leaf': range(80, 121, 10),
        'min_samples_split': range(100, 301, 50),
        'splitter': ['best', 'random'],
    }

    # NOTE(review): the search cross-validates on data that was resampled
    # *before* splitting into folds, so validation folds contain resampled
    # rows and the CV recall used to pick parameters may be optimistic.
    # Consider moving the sampler into an imblearn Pipeline so resampling
    # happens inside each training fold only -- confirm before changing, as it
    # alters the keys printed in best_params_.
    dt = DecisionTreeClassifier(random_state=10)
    grid_search = GridSearchCV(
        estimator=dt,
        param_grid=grid_param,
        scoring='recall',
        cv=5,
        n_jobs=-1,
        verbose=1,
        refit=True,  # explicit: best_estimator_ is refit on all of X_train
    )
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_, '--' * 58, sep='\n')

    # With refit=True the best estimator is already trained on the full
    # training data, so no additional .fit() call is needed here.
    dt_optimized = grid_search.best_estimator_
    y_pred = dt_optimized.predict(X_test)
    print('Post Optimization Metrics', classification_report(y_test, y_pred), '--' * 60, sep='\n')


## Building the decision tree model for best recall, pre and post balancing the data.

In [41]:
# Baseline run: no sampler, so the tree sees the original class distribution.
build_model(sampler=None)

Pre Sampling Class Counts
0    7668
1    2285
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Post Sampling Class Counts
0    7668
1    2285
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Pre Optimization Metrics
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      3298
           1       0.48      0.52      0.50       968

    accuracy                           0.76      4266
   macro avg       0.67      0.68      0.67      4266
weighted avg       0.77      0.76      0.77      4266

------------------------------------------------------------------------------------------------------------------------
Fitting 5 folds for each of 700 candidates, totalling 3500 fits
{'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 80, 'min_samples_split': 100, 'splitter': 'best'}
-----------------------------------------------------------------------

In [42]:
from imblearn.over_sampling import SMOTE

# Over-sample the training split with SMOTE (seeded) before fitting.
build_model(sampler=SMOTE(random_state=10))

Pre Sampling Class Counts
0    7668
1    2285
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Post Sampling Class Counts
0    7668
1    7668
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Pre Optimization Metrics
              precision    recall  f1-score   support

           0       0.87      0.80      0.83      3298
           1       0.46      0.59      0.52       968

    accuracy                           0.75      4266
   macro avg       0.67      0.69      0.68      4266
weighted avg       0.78      0.75      0.76      4266

------------------------------------------------------------------------------------------------------------------------
Fitting 5 folds for each of 700 candidates, totalling 3500 fits
{'criterion': 'entropy', 'max_depth': 12, 'min_samples_leaf': 80, 'min_samples_split': 100, 'splitter': 'best'}
----------------------------------------------------------------------

In [44]:
from imblearn.combine import SMOTEENN

# Resample the training split with the combined SMOTEENN sampler (seeded).
build_model(sampler=SMOTEENN(random_state=10))

Pre Sampling Class Counts
0    7668
1    2285
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Post Sampling Class Counts
1    5722
0    3965
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Pre Optimization Metrics
              precision    recall  f1-score   support

           0       0.87      0.76      0.81      3298
           1       0.43      0.63      0.51       968

    accuracy                           0.73      4266
   macro avg       0.65      0.69      0.66      4266
weighted avg       0.77      0.73      0.75      4266

------------------------------------------------------------------------------------------------------------------------
Fitting 5 folds for each of 700 candidates, totalling 3500 fits
{'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 80, 'min_samples_split': 100, 'splitter': 'random'}
------------------------------------------------------------------------

In [46]:
from imblearn.under_sampling import NearMiss

# Under-sample the training split with NearMiss (default settings).
build_model(sampler=NearMiss())

Pre Sampling Class Counts
0    7668
1    2285
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Post Sampling Class Counts
0    2285
1    2285
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Pre Optimization Metrics
              precision    recall  f1-score   support

           0       0.87      0.66      0.75      3298
           1       0.37      0.67      0.47       968

    accuracy                           0.66      4266
   macro avg       0.62      0.67      0.61      4266
weighted avg       0.76      0.66      0.69      4266

------------------------------------------------------------------------------------------------------------------------
Fitting 5 folds for each of 700 candidates, totalling 3500 fits
{'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 80, 'min_samples_split': 100, 'splitter': 'best'}
-----------------------------------------------------------------------

### The best sampling strategy seems to be combined over- and under-sampling with SMOTEENN, as the minority-class recall has increased from 0.63 to 0.84