https://www.analyticsvidhya.com/blog/2020/10/overcoming-class-imbalance-using-smote-techniques/

### Let's load the saved data and build different models on top of it

In [1]:
import pandas as pd

# Read the training and test CSVs from the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data_train_10pct.csv', 'data_test_10pct.csv')
)

# Echo the (rows, columns) of both frames as the cell output.
train.shape, test.shape

((9953, 22), (4266, 22))

In [2]:
# Every column except the last is a feature; the last column is the target.
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1]
X_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1]

### Selecting the top 6 independent features

In [3]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every training feature against the target with the chi-squared test.
# The ranking below is built from scores_, which holds a score for every
# input column, so all features are ranked here regardless of k.
ordered_rank_features = SelectKBest(score_func=chi2, k=6)
ordered_feature = ordered_rank_features.fit(X_train, y_train)

# One row per feature, highest chi-squared score first.
features_rank = pd.DataFrame({
    'features': X_train.columns,
    'score': ordered_feature.scores_,
}).sort_values(by='score', ascending=False)

features_rank

Unnamed: 0,features,score
20,RainToday,737.988433
13,Humidity3pm,165.709871
17,Cloud3pm,121.12372
16,Cloud9am,109.212947
5,Sunshine,85.899658
6,WindGustDir,83.72273
3,Rainfall,69.280665
9,WindDir3pm,51.208016
12,Humidity9am,37.998513
7,WindGustSpeed,27.320968


In [4]:
# Take the first six names from the ranking and add the target column name,
# so the same list can be used to subset both frames.
features = features_rank['features'].head(6).tolist() + ['RainTomorrow']

In [5]:
# Display the list of column names that will be kept.
features

['RainToday',
 'Humidity3pm',
 'Cloud3pm',
 'Cloud9am',
 'Sunshine',
 'WindGustDir',
 'RainTomorrow']

In [6]:
# Narrow both frames to the columns named in `features`.
train = train.loc[:, features]
test = test.loc[:, features]

# Echo the new (rows, columns) of both frames as the cell output.
train.shape, test.shape

((9953, 7), (4266, 7))

### Defining a convenience function to display various metrics

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

def build_model(sampler):
    """Fit a decision tree before and after a recall-driven grid search and print metrics.

    Reads the module-level ``train`` and ``test`` frames and treats the last
    column of each as the target. Only the training data is passed through
    ``sampler``; the test data is used as-is for both reports.

    Parameters
    ----------
    sampler : object or None
        Any object exposing ``fit_resample(X, y)``. ``None`` skips resampling
        and trains on the training data unchanged.

    Returns
    -------
    None
        Class counts, the best grid-search parameters and both classification
        reports are written with ``print``.
    """
    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

    print('Pre Sampling Class Counts', y_train.value_counts(), '--' * 30, sep='\n')

    # Identity check rather than `== None`, which would go through __eq__.
    if sampler is not None:
        X_train, y_train = sampler.fit_resample(X_train, y_train)

    # Baseline: default hyperparameters, fixed seed.
    dt = DecisionTreeClassifier(random_state=10)
    dt.fit(X_train, y_train)

    print('Post Sampling Class Counts', y_train.value_counts(), '--' * 30, sep='\n')

    y_pred = dt.predict(X_test)
    print('Pre Optimization Metrics', classification_report(y_test, y_pred), '--' * 60, sep='\n')

    grid_param = {
        'criterion': ['gini', 'entropy'],
        'max_depth': range(3, 22, 3),
        'min_samples_leaf': range(80, 121, 10),
        'min_samples_split': range(100, 301, 50),
        'splitter': ['best', 'random'],
    }

    # NOTE(review): the search cross-validates on the already-resampled data,
    # so validation folds contain resampled rows and the CV recall used to
    # pick parameters may be optimistic. Resampling inside each fold (e.g. via
    # an imblearn Pipeline) would avoid that -- confirm before relying on it.
    dt = DecisionTreeClassifier(random_state=10)
    grid_search = GridSearchCV(estimator=dt, param_grid=grid_param, scoring='recall', cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_, '--' * 58, sep='\n')

    # GridSearchCV defaults to refit=True, so best_estimator_ is already
    # fitted on the full (resampled) training data; no second fit is needed.
    dt_optimized = grid_search.best_estimator_
    y_pred = dt_optimized.predict(X_test)
    print('Post Optimization Metrics', classification_report(y_test, y_pred), '--' * 60, sep='\n')


## Building the decision tree model for best recall, before and after hyperparameter optimization

In [8]:
# Baseline run: passing None makes build_model skip resampling.
build_model(None)

Pre Sampling Class Counts
0    7670
1    2283
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Post Sampling Class Counts
0    7670
1    2283
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Pre Optimization Metrics
              precision    recall  f1-score   support

           0       0.84      0.81      0.82      3308
           1       0.41      0.45      0.43       958

    accuracy                           0.73      4266
   macro avg       0.62      0.63      0.63      4266
weighted avg       0.74      0.73      0.74      4266

------------------------------------------------------------------------------------------------------------------------
Fitting 5 folds for each of 700 candidates, totalling 3500 fits
{'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 120, 'min_samples_split': 100, 'splitter': 'best'}
-------------------------------------------------------------------------

In [9]:
from imblearn.over_sampling import SMOTE

# Resample the training data with SMOTE (seeded for repeatability) before fitting.
build_model(SMOTE(random_state=10))

Pre Sampling Class Counts
0    7670
1    2283
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Post Sampling Class Counts
0    7670
1    7670
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Pre Optimization Metrics
              precision    recall  f1-score   support

           0       0.84      0.79      0.81      3308
           1       0.40      0.49      0.44       958

    accuracy                           0.72      4266
   macro avg       0.62      0.64      0.63      4266
weighted avg       0.74      0.72      0.73      4266

------------------------------------------------------------------------------------------------------------------------
Fitting 5 folds for each of 700 candidates, totalling 3500 fits
{'criterion': 'entropy', 'max_depth': 15, 'min_samples_leaf': 80, 'min_samples_split': 200, 'splitter': 'best'}
----------------------------------------------------------------------

In [10]:
from imblearn.combine import SMOTEENN

# Resample the training data with SMOTEENN (seeded for repeatability) before fitting.
build_model(SMOTEENN(random_state=10))

Pre Sampling Class Counts
0    7670
1    2283
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Post Sampling Class Counts
1    5570
0    4360
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Pre Optimization Metrics
              precision    recall  f1-score   support

           0       0.90      0.73      0.80      3308
           1       0.43      0.71      0.53       958

    accuracy                           0.72      4266
   macro avg       0.66      0.72      0.67      4266
weighted avg       0.79      0.72      0.74      4266

------------------------------------------------------------------------------------------------------------------------
Fitting 5 folds for each of 700 candidates, totalling 3500 fits
{'criterion': 'gini', 'max_depth': 9, 'min_samples_leaf': 80, 'min_samples_split': 100, 'splitter': 'best'}
--------------------------------------------------------------------------

In [11]:
from imblearn.under_sampling import NearMiss

# Resample the training data with NearMiss, using its default settings, before fitting.
build_model(NearMiss())

Pre Sampling Class Counts
0    7670
1    2283
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Post Sampling Class Counts
0    2283
1    2283
Name: RainTomorrow, dtype: int64
------------------------------------------------------------
Pre Optimization Metrics
              precision    recall  f1-score   support

           0       0.82      0.51      0.63      3308
           1       0.27      0.62      0.38       958

    accuracy                           0.54      4266
   macro avg       0.55      0.57      0.50      4266
weighted avg       0.70      0.54      0.57      4266

------------------------------------------------------------------------------------------------------------------------
Fitting 5 folds for each of 700 candidates, totalling 3500 fits
{'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 100, 'min_samples_split': 100, 'splitter': 'random'}
-----------------------------------------------------------------------

## Results
* Best Sampling Technique: SMOTEENN
* Recall (class = 1) for plain decision tree: 0.71
* Recall (class = 1) for optimized decision tree: 0.80