# Handling Imbalanced Stock Data with SMOTE and Near Miss Algorithm

Synthetic Minority Oversampling Technique (SMOTE) is imbalanced classification that use to developing predictive models on classification datasets that have a severe class imbalance. Near-miss is an algorithm that can help in balancing an imbalanced dataset. 

In [1]:
import pandas as pd
import numpy as np
import math

import os
import sys
import platform

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import yfinance as yf
yf.pdr_override()

In [2]:
print("numpy: ", np.__version__)
print("pandas: ", pd.__version__)
print("sklearn: ", sklearn.__version__)
print("yfinance: ", yf.__version__)
print("os system: ", os.name)
print("Python Version:", sys.version)
print("Platform System: ", platform.system())

numpy:  1.19.5
pandas:  1.1.5
sklearn:  0.22.1
yfinance:  0.1.77
os system:  nt
Python Version: 3.6.13 |Anaconda, Inc.| (default, Mar 16 2021, 11:37:27) [MSC v.1916 64 bit (AMD64)]
Platform System:  Windows


In [3]:
stock = 'AAPL'
start = '2016-01-01' 
end = '2022-01-01'
dataset = yf.download(stock, start, end)
dataset.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-04 00:00:00-05:00,25.6525,26.342501,25.5,26.3375,24.151493,270597600
2016-01-05 00:00:00-05:00,26.4375,26.4625,25.602501,25.6775,23.546274,223164000
2016-01-06 00:00:00-05:00,25.139999,25.592501,24.967501,25.174999,23.08548,273829600
2016-01-07 00:00:00-05:00,24.67,25.032499,24.1075,24.112499,22.111166,324377600
2016-01-08 00:00:00-05:00,24.637501,24.7775,24.190001,24.24,22.228085,283192000


In [4]:
dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)
dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,0)
dataset['Buy_Sell'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],1,0)
dataset['Returns'] = dataset['Adj Close'].pct_change()
dataset['Returns_Buy_Sell'] = np.where(dataset['Returns'].shift(-1) > dataset['Returns'],1,0)
dataset = dataset.dropna()

In [5]:
dataset = dataset.reset_index()
dataset

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Returns,Returns_Buy_Sell
0,2016-01-05 00:00:00-05:00,26.437500,26.462500,25.602501,25.677500,23.546274,223164000,1,0,0,-0.025059,1
1,2016-01-06 00:00:00-05:00,25.139999,25.592501,24.967501,25.174999,23.085480,273829600,1,0,0,-0.019570,0
2,2016-01-07 00:00:00-05:00,24.670000,25.032499,24.107500,24.112499,22.111166,324377600,0,0,1,-0.042205,1
3,2016-01-08 00:00:00-05:00,24.637501,24.777500,24.190001,24.240000,22.228085,283192000,0,1,1,0.005288,1
4,2016-01-11 00:00:00-05:00,24.742500,24.764999,24.334999,24.632500,22.588015,198957600,0,1,1,0.016193,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1505,2021-12-27 00:00:00-05:00,177.089996,180.419998,177.070007,180.330002,179.586868,74919600,1,1,0,0.022975,0
1506,2021-12-28 00:00:00-05:00,180.160004,181.330002,178.529999,179.289993,178.551132,79144300,0,0,1,-0.005767,1
1507,2021-12-29 00:00:00-05:00,179.330002,180.630005,178.139999,179.380005,178.640778,62348900,0,1,0,0.000502,0
1508,2021-12-30 00:00:00-05:00,179.470001,180.570007,178.089996,178.199997,177.465637,59773000,1,0,0,-0.006578,1


In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1510 entries, 0 to 1509
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype                           
---  ------             --------------  -----                           
 0   Date               1510 non-null   datetime64[ns, America/New_York]
 1   Open               1510 non-null   float64                         
 2   High               1510 non-null   float64                         
 3   Low                1510 non-null   float64                         
 4   Close              1510 non-null   float64                         
 5   Adj Close          1510 non-null   float64                         
 6   Volume             1510 non-null   int64                           
 7   Increase_Decrease  1510 non-null   int32                           
 8   Buy_Sell_on_Open   1510 non-null   int32                           
 9   Buy_Sell           1510 non-null   int32                           
 10  Returns     

In [7]:
# Normalize the "Buy_Sell" column
dataset['nearClose'] = StandardScaler().fit_transform(np.array(dataset['Adj Close']).reshape(-1, 1))

# drop Date and Close columns as they are not relevant for prediction purpose 
dataset = dataset.drop(['Date', 'Close'], axis = 1)

dataset['Buy_Sell'].value_counts()

1    817
0    693
Name: Buy_Sell, dtype: int64

In [8]:
X = dataset.drop(['Returns_Buy_Sell'], axis=1)
y = dataset['Returns_Buy_Sell']

In [9]:
# split into 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
  
# describes info about train and test set
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

Number transactions X_train dataset:  (1057, 10)
Number transactions y_train dataset:  (1057,)
Number transactions X_test dataset:  (453, 10)
Number transactions y_test dataset:  (453,)


In [10]:
# logistic regression object
lr = LogisticRegression()
  
# train the model on train set
lr.fit(X_train, y_train.ravel())
  
predictions = lr.predict(X_test)
  
# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.49      1.00      0.66       222
           1       0.00      0.00      0.00       231

    accuracy                           0.49       453
   macro avg       0.25      0.50      0.33       453
weighted avg       0.24      0.49      0.32       453



In [11]:
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))
  
# import SMOTE module from imblearn library
# pip install imblearn (if you don't have imblearn in your system)
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())
  
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))
  
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))

Before OverSampling, counts of label '1': 517
Before OverSampling, counts of label '0': 540 

After OverSampling, the shape of train_X: (1080, 10)
After OverSampling, the shape of train_y: (1080,) 

After OverSampling, counts of label '1': 540
After OverSampling, counts of label '0': 540


Using TensorFlow backend.


In [12]:
lr1 = LogisticRegression()
lr1.fit(X_train_res, y_train_res.ravel())
predictions = lr1.predict(X_test)
  
# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       222
           1       0.51      1.00      0.68       231

    accuracy                           0.51       453
   macro avg       0.25      0.50      0.34       453
weighted avg       0.26      0.51      0.34       453



In [13]:
print("Before Undersampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before Undersampling, counts of label '0': {} \n".format(sum(y_train == 0)))
  
# apply near miss
from imblearn.under_sampling import NearMiss
nr = NearMiss()
  
X_train_miss, y_train_miss = nr.fit_sample(X_train, y_train.ravel())
  
print('After Undersampling, the shape of train_X: {}'.format(X_train_miss.shape))
print('After Undersampling, the shape of train_y: {} \n'.format(y_train_miss.shape))
  
print("After Undersampling, counts of label '1': {}".format(sum(y_train_miss == 1)))
print("After Undersampling, counts of label '0': {}".format(sum(y_train_miss == 0)))

Before Undersampling, counts of label '1': 517
Before Undersampling, counts of label '0': 540 

After Undersampling, the shape of train_X: (1034, 10)
After Undersampling, the shape of train_y: (1034,) 

After Undersampling, counts of label '1': 517
After Undersampling, counts of label '0': 517


In [14]:
# train the model on train set
lr2 = LogisticRegression()
lr2.fit(X_train_miss, y_train_miss.ravel())
predictions = lr2.predict(X_test)
  
# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       222
           1       0.51      1.00      0.68       231

    accuracy                           0.51       453
   macro avg       0.25      0.50      0.34       453
weighted avg       0.26      0.51      0.34       453



In [None]:
The first model of the accuracy comes out to be 100% and the second model of the accuracy comes out to be 100%.  They are both biased twoards majority class.