# Ensemble Learning

## Initial Imports

In [30]:
import warnings
warnings.filterwarnings('ignore')

In [31]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [32]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

## Read the CSV and Perform Basic Data Cleaning

In [33]:
# Load the data
file_path = Path('Data/solana_sentiment_df.csv')
df = pd.read_csv(file_path, index_col="date_x")
df.columns

# Preview the data
df=df.drop(columns=["Unnamed: 0","price"])

df

Unnamed: 0_level_0,compound,positive,negative,neutral,binary score,new score,change
date_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-04-07,0.938200,0.280000,0.000000,0.720000,1.000000,1,0.089202
2021-04-09,-0.318200,0.080000,0.089000,0.831000,0.000000,0,0.080658
2021-04-10,0.807400,0.128000,0.000000,0.872000,1.000000,1,0.028867
2021-04-11,0.000000,0.000000,0.000000,1.000000,0.000000,0,-0.034218
2021-04-12,0.479500,0.087000,0.030000,0.883000,0.000000,0,0.036663
...,...,...,...,...,...,...,...
2021-08-28,0.465323,0.101154,0.009923,0.888923,0.461538,0,0.155749
2021-08-29,0.168300,0.058250,0.032625,0.909125,0.125000,0,0.103429
2021-08-30,0.275933,0.078619,0.027333,0.894048,0.476190,0,-0.031493
2021-08-31,0.125087,0.058708,0.033667,0.907625,0.291667,0,0.166333


## Split the Data into Training and Testing

In [34]:
# Create our features
X = df.drop('change', axis = 1)

# Create our target
y=df[["change"]]
y= np.where(y >= 0, 1 , 0)
y=pd.DataFrame(y)
y

Unnamed: 0,0
0,1
1,1
2,1
3,0
4,1
...,...
118,1
119,1
120,0
121,1


In [35]:
X.describe()

Unnamed: 0,compound,positive,negative,neutral,binary score,new score
count,123.0,123.0,123.0,123.0,123.0,123.0
mean,0.285434,0.085939,0.020812,0.893241,0.365731,0.382114
std,0.279733,0.053888,0.024752,0.056037,0.334239,0.487892
min,-0.7645,0.0,0.0,0.69,0.0,0.0
25%,0.119119,0.054,0.0,0.86955,0.0,0.0
50%,0.295218,0.078619,0.015,0.900143,0.333333,0.0
75%,0.462062,0.110119,0.031167,0.926077,0.6,1.0
max,0.9382,0.293,0.114,1.0,1.0,1.0


In [36]:
# Check the balance of our target values
y.value_counts()

1    68
0    55
dtype: int64

In [37]:
# Split the X and y into X_train, X_test, y_train, y_test


from sklearn.model_selection import train_test_split
    

X_train, X_test, y_train, y_test= train_test_split(X, y, random_state=1, stratify=y)

X_train.shape
X_train

Unnamed: 0_level_0,compound,positive,negative,neutral,binary score,new score
date_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-08-28,0.465323,0.101154,0.009923,0.888923,0.461538,0
2021-05-09,0.509350,0.067000,0.000000,0.933000,0.000000,0
2021-07-29,0.340000,0.082000,0.000000,0.918000,0.000000,0
2021-04-22,0.421500,0.097000,0.000000,0.903000,0.000000,0
2021-05-24,-0.764500,0.000000,0.111000,0.889000,0.000000,0
...,...,...,...,...,...,...
2021-07-21,-0.381800,0.000000,0.035000,0.965000,0.000000,0
2021-06-23,0.067133,0.067500,0.044500,0.887833,0.333333,0
2021-06-08,0.515350,0.086000,0.000000,0.914000,0.000000,0
2021-08-13,0.258817,0.069000,0.032000,0.899000,0.333333,0


## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, you only scale the features data (`X_train` and `X_testing`).

In [38]:
# Create the StandardScaler instance

scaler = StandardScaler()

In [39]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset

X_scaler = scaler.fit(X_train)

In [40]:
# Scale the training and testing data

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Balanced Random Forest Classifier

In [41]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(n_estimators=1000, random_state=1)
brf.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(n_estimators=1000, random_state=1)

In [42]:
# Calculated the balanced accuracy score
predictions = brf.predict(X_test_scaled)
bas_brf = balanced_accuracy_score(y_test, predictions)
print(bas_brf)

0.592436974789916


In [43]:
# Display the confusion matrix
pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Positive", "Negative"],
    columns=["Positive Sentiment", "Negative Sentiment"]
)

Unnamed: 0,Positive,Negative
Positive,10,4
Negative,9,8


In [44]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.53      0.71      0.47      0.61      0.58      0.34        14
          1       0.67      0.47      0.71      0.55      0.58      0.33        17

avg / total       0.60      0.58      0.60      0.58      0.58      0.34        31



In [45]:
# List the features sorted in descending order by feature importance
importances = brf.feature_importances_
importances_sorted = sorted(zip(brf.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

[(0.2523504228036731, 'positive'),
 (0.241486844652378, 'neutral'),
 (0.22124564068818398, 'compound'),
 (0.15109386502728325, 'negative'),
 (0.11907216880548373, 'binary score'),
 (0.014751058022997924, 'new score')]

## Easy Ensemble Classifier

In [59]:
# Train the Classifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec = eec.fit(X_train, y_train)
eec

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [60]:
# Calculated the balanced accuracy score
predictions = eec.predict(X_test)
balanced_accuracy_score(y_test, predictions)

0.6743697478991597

In [61]:
# Display the confusion matrix
pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=["Positive", "Negative"],
    columns=["Positive Sentiment", "Negative Sentiment"]
)

Unnamed: 0,Positive Sentiment,Negative Sentiment
Positive,9,5
Negative,5,12


In [62]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.64      0.64      0.71      0.64      0.67      0.45        14
          1       0.71      0.71      0.64      0.71      0.67      0.46        17

avg / total       0.68      0.68      0.67      0.68      0.67      0.45        31



## Stochastic Gradient Descent

In [50]:
# Instantiate and fit model
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
clf.fit(X_train, y_train)

SGDClassifier(max_iter=5)

In [51]:
#Predict y values
predictions=clf.predict(X_test)

In [52]:
#Compute coefficients for SDGClassifier
clf.coef_

array([[ 2.19591627e+01, -1.08591040e-01, -1.21067892e+00,
         1.30751489e+00,  4.33987923e+00, -1.94607866e-14]])

In [53]:
#Compute intercept for SDGClassifier 
clf.intercept_

array([-0.66254069])

In [54]:
# Compute Balance accuracy score
balanced_accuracy_score(y_test, predictions)

0.542016806722689

In [56]:
# Displays confusion matrix
pd.DataFrame(
    confusion_matrix(y_test, predictions),
    index=["Positive", "Negative"],
    columns=["Positive Sentiment", "Negative Sentiment"]
)


Unnamed: 0,Positive,Negative
Positive Sentiment,2,12
Negative Sentiment,1,16


In [29]:
#Displays classification report
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00        14
          1       0.55      1.00      0.00      0.71      0.00      0.00        17

avg / total       0.30      0.55      0.45      0.39      0.00      0.00        31

