# Ensemble Learning

## Initial Imports

In [1]:
import warnings
# Silence every warning for the rest of the session. No category or module
# filter is given, so this is global: any warning emitted by later cells
# (including ones from the libraries imported below) will not be shown.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

## Read the CSV and Perform Basic Data Cleaning

In [11]:
# Load the data
# The CSV path is relative, so it is resolved against the notebook's working
# directory. The "date" column becomes the row index; no parse_dates option
# is requested in this call.
file_path = Path('solana_sentiment_df.csv')
df = pd.read_csv(file_path, index_col="date")

# Preview the data
# A bare DataFrame as the last expression of the cell is rendered by the notebook.
df

Unnamed: 0_level_0,compound,positive,negative,neutral,binary score,new score
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-04-06,0.658450,0.153000,0.000000,0.847000,1.000000,1
2021-04-07,0.938200,0.280000,0.000000,0.720000,1.000000,1
2021-04-09,-0.318200,0.080000,0.089000,0.831000,0.000000,0
2021-04-10,0.807400,0.128000,0.000000,0.872000,1.000000,1
2021-04-11,0.000000,0.000000,0.000000,1.000000,0.000000,0
...,...,...,...,...,...,...
2021-08-28,0.465323,0.101154,0.009923,0.888923,0.461538,0
2021-08-29,0.168300,0.058250,0.032625,0.909125,0.125000,0
2021-08-30,0.275933,0.078619,0.027333,0.894048,0.476190,0
2021-08-31,0.125087,0.058708,0.033667,0.907625,0.291667,0


## Split the Data into Training and Testing

In [12]:
# Features: every column of df except the target label.
# NOTE(review): 'binary score' stays in X. If 'new score' was derived from it,
# the models below are effectively handed the answer -- confirm how the label
# was built before trusting the scores.
X = df.drop(columns='new score')

# Target: the 'new score' class label.
y = df['new score']

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature column
X.describe()

Unnamed: 0,compound,positive,negative,neutral,binary score
count,124.0,124.0,124.0,124.0,124.0
mean,0.288442,0.08648,0.020644,0.892868,0.370846
std,0.280601,0.054005,0.024722,0.055963,0.337715
min,-0.7645,0.0,0.0,0.69,0.0
25%,0.122103,0.054,0.0,0.8677,0.0
50%,0.296459,0.078643,0.0147,0.899571,0.333333
75%,0.466067,0.112111,0.031083,0.926038,0.60625
max,0.9382,0.293,0.114,1.0,1.0


In [14]:
# Check the balance of our target values: number of rows per class label
y.value_counts()

0    76
1    48
Name: new score, dtype: int64

In [15]:
# Split the X and y into X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split

# stratify=y preserves the class proportions of the target in both splits;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# A bare `X_train.shape` in the middle of a cell is evaluated and discarded
# (only the last expression of a cell is rendered), so print it explicitly.
print(X_train.shape)
X_train

Unnamed: 0_level_0,compound,positive,negative,neutral,binary score
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-05-01,0.509833,0.107000,0.031333,0.861667,0.666667
2021-04-27,0.468300,0.073000,0.004333,0.922333,0.000000
2021-06-23,0.067133,0.067500,0.044500,0.887833,0.333333
2021-06-27,0.445050,0.127500,0.012500,0.860000,0.500000
2021-06-16,0.302043,0.091429,0.000000,0.908571,0.142857
...,...,...,...,...,...
2021-06-20,0.659700,0.069000,0.000000,0.931000,0.000000
2021-07-19,-0.095050,0.125000,0.105000,0.770000,0.500000
2021-08-13,0.258817,0.069000,0.032000,0.899000,0.333333
2021-07-13,-0.250000,0.073000,0.114000,0.813000,0.000000


## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, you only scale the feature data (`X_train` and `X_test`), not the target.

In [16]:
# Create the StandardScaler instance
# (unfitted at this point; it is fitted on the training data in the following cell)

scaler = StandardScaler()

In [17]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset,
# so that no statistics from the test rows influence the scaling.
# X_scaler is the value returned by fit(); it is what the transform cell uses.

X_scaler = scaler.fit(X_train)

In [18]:
# Scale the training and testing data
# Both splits are transformed with the scaler that was fitted on X_train only.

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Balanced Random Forest Classifier

In [19]:
# Fit a BalancedRandomForestClassifier on the scaled training split
from imblearn.ensemble import BalancedRandomForestClassifier

# 1000 trees; random_state is fixed so repeated runs give the same forest
brf = BalancedRandomForestClassifier(n_estimators=1000, random_state=1)
brf = brf.fit(X_train_scaled, y_train)
brf

BalancedRandomForestClassifier(n_estimators=1000, random_state=1)

In [20]:
# Calculate the balanced accuracy score on the scaled test split
y_pred = brf.predict(X_test_scaled)
bas_brf = balanced_accuracy_score(y_test, y_pred)
print(bas_brf)

1.0


In [22]:
# Display the confusion matrix
# confusion_matrix puts the actual classes on the rows and the predicted
# classes on the columns, in the order given by `labels`. Name each axis by
# role and class value so rows and columns can be told apart and class 0 is
# not presented as "Positive" (nor class 1 as "Negative").
pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=[0, 1]),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

Unnamed: 0,Positive,Negative
Positive,19,0
Negative,0,12


In [23]:
# Print the imbalanced classification report
# (per-class precision, recall, specificity, f1, geometric mean, IBA and support)
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00        19
          1       1.00      1.00      1.00      1.00      1.00      1.00        12

avg / total       1.00      1.00      1.00      1.00      1.00      1.00        31



In [24]:
# Rank the features from most to least important according to the fitted forest.
# Each tuple is (importance, column name); tuples compare on importance first,
# so reverse=True puts the largest importance at the front.
importances = brf.feature_importances_
importances_sorted = sorted(zip(importances, X.columns), reverse=True)
importances_sorted[:10]

[(0.5485700176805374, 'binary score'),
 (0.25090703788736113, 'positive'),
 (0.11775800825376952, 'neutral'),
 (0.06956692703309153, 'compound'),
 (0.013198009145240516, 'negative')]

## Easy Ensemble Classifier

In [25]:
# Train the Classifier
from imblearn.ensemble import EasyEnsembleClassifier

# NOTE(review): this model is fit on the unscaled X_train, whereas the balanced
# random forest earlier in the notebook was fit on X_train_scaled -- confirm
# that the difference is intended.
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1).fit(X_train, y_train)
eec

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [26]:
# Calculate the balanced accuracy score
# y_pred is reassigned here, replacing the balanced random forest predictions;
# the prediction uses the unscaled X_test, matching the data eec was fitted on.
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

1.0

In [27]:
# Display the confusion matrix
# confusion_matrix puts the actual classes on the rows and the predicted
# classes on the columns, in the order given by `labels`. Name each axis by
# role and class value so rows and columns can be told apart and class 0 is
# not presented as "Positive" (nor class 1 as "Negative").
pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=[0, 1]),
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

Unnamed: 0,Positive,Negative
Positive,19,0
Negative,0,12


In [28]:
# Print the imbalanced classification report
# (per-class precision, recall, specificity, f1, geometric mean, IBA and support)
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00        19
          1       1.00      1.00      1.00      1.00      1.00      1.00        12

avg / total       1.00      1.00      1.00      1.00      1.00      1.00        31



## Support Vector Machine

In [32]:
# Instantiate a linear SVM model
# Only the kernel is set here; every other SVC parameter is left at its default.
from sklearn.svm import SVC
classifier = SVC(kernel='linear')
classifier

SVC(kernel='linear')

In [33]:
# Fit the data
# NOTE(review): the SVM is trained on the unscaled X_train even though
# X_train_scaled was prepared earlier in the notebook -- confirm this is intended.
classifier.fit(X_train, y_train)

SVC(kernel='linear')

In [34]:
# Report the classifier's score() on the training and testing splits.
# Note this is score(), not the balanced_accuracy_score used for the ensemble models.
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.946236559139785
Testing Data Score: 0.967741935483871


In [35]:
# Predict on the test split and line the predictions up against the true labels
predictions = classifier.predict(X_test)

# reset_index(drop=True) discards the date index carried over from y_test
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,0,0
2,0,0
3,1,1
4,1,1


In [36]:
# confusion_matrix is already imported in the initial imports section, so this
# re-import is redundant.
from sklearn.metrics import confusion_matrix
# Raw confusion matrix for the SVM predictions: rows are the actual classes,
# columns are the predicted classes.
confusion_matrix(y_test, predictions)

array([[19,  0],
       [ 1, 11]])

In [37]:
from sklearn.metrics import classification_report
# Per-class precision, recall, f1-score and support for the SVM predictions,
# plus overall accuracy and macro/weighted averages.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       1.00      0.92      0.96        12

    accuracy                           0.97        31
   macro avg       0.97      0.96      0.97        31
weighted avg       0.97      0.97      0.97        31

