In [12]:
# To manipulate and analyze data
import pandas as pd
import numpy as np
import tqdm as notebook_tqdm

# To visualize data
import matplotlib.pyplot as plt
import seaborn as sns

# To used time-related functions
import time

# To parse JSON data
import json

# to load the natural language toolkit
import nltk
nltk.download('stopwords')    # loading the stopwords
# nltk.download('punkt')    # loading the punkt module used in tokenization
# nltk.download('omw-1.4')    # dependency for tokenization
nltk.download('wordnet')   # loading the wordnet module that is used in stemming


# To build, tune, and evaluate ML models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
from xgboost import XGBClassifier


from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

# To load/create word embeddings
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# To work with transformer models
import torch
from sentence_transformers import SentenceTransformer

# To implement progress bar related functionalities
from tqdm import tqdm
tqdm.pandas()

# To ignore unnecessary warnings
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/christophebuffard/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/christophebuffard/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [115]:
stock_news = pd.read_csv("stock_news.csv")
df = stock_news.copy()

In [116]:
df['Date'] = pd.to_datetime(df['Date'])

In [117]:
df['Avg_Price'] = (df['Open'] + df['Close'] + df['High'] + df['Low']) / 4


In [118]:

label = {
    1: 'positive',
    0: 'neutral',
    -1: 'negative'
}
df['Label'] = df['Label'].map(label)

In [119]:
rev_label = {
    'positive': 2,
    'neutral': 1,
    'negative': 0
}
df['Label'] = df['Label'].map(rev_label)

In [120]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from nltk.stem.porter import PorterStemmer
import re
# to remove common stop words
from nltk.corpus import stopwords

# Create a custom transformer for text preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.stopwords = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def remove_special_characters(self, text):
        pattern = '[^A-Za-z0-9]+'
        return re.sub(pattern, ' ', text)

    def to_lowercase(self, text):
        return text.lower()

    def remove_extra_whitespaces(self, text):
        return text.strip()

    def remove_stopwords(self, text):
        return ' '.join([word for word in text.split() if word not in self.stopwords])

    def apply_stemming(self, text):
        return ' '.join([self.stemmer.stem(word) for word in text.split()])

    def preprocess(self, text):
        text = self.remove_special_characters(text)
        text = self.to_lowercase(text)
        text = self.remove_extra_whitespaces(text)
        text = self.remove_stopwords(text)
        return self.apply_stemming(text)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.apply(self.preprocess)

# Create a pipeline with the custom transformer
pipeline = Pipeline(steps=[
    ('text_preprocessing', FunctionTransformer(lambda x: TextPreprocessor().fit_transform(x), validate=False))
])

In [121]:
df['News_clean'] = pipeline.transform(df['News'])

In [122]:
df.head()

Unnamed: 0,Date,News,Open,High,Low,Close,Volume,Label,Avg_Price,News_clean
0,2019-01-02,The tech sector experienced a significant dec...,41.740002,42.244999,41.482498,40.246914,130672400,0,41.428603,tech sector experienc signific declin aftermar...
1,2019-01-02,Apple lowered its fiscal Q1 revenue guidance ...,41.740002,42.244999,41.482498,40.246914,130672400,0,41.428603,appl lower fiscal q1 revenu guidanc 84 billion...
2,2019-01-02,Apple cut its fiscal first quarter revenue fo...,41.740002,42.244999,41.482498,40.246914,130672400,0,41.428603,appl cut fiscal first quarter revenu forecast ...
3,2019-01-02,This news article reports that yields on long...,41.740002,42.244999,41.482498,40.246914,130672400,0,41.428603,news articl report yield long date u treasuri ...
4,2019-01-02,Apple's revenue warning led to a decline in U...,41.740002,42.244999,41.482498,40.246914,130672400,0,41.428603,appl revenu warn led declin usd jpi pair gain ...


In [123]:
print(f"Date set starting date: {df['Date'].min()}")
start_date = df['Date'].min()
print(f"Datea set end date: {df['Date'].max()}")
print(f"Duration: {df['Date'].max() - df['Date'].min()}")

Date set starting date: 2019-01-02 00:00:00
Datea set end date: 2019-04-30 00:00:00
Duration: 118 days 00:00:00


In [124]:
train_point = (df['Date'].max() - df['Date'].min())*0.7 # 60% of data
print(f"Train point: {train_point}")
test_val_split = (df['Date'].max() - df['Date'].min())*0.15 # 50% of the remaining %30
print(f"Test-Val Split: {test_val_split}")

Train point: 82 days 14:24:00
Test-Val Split: 17 days 16:48:00


In [125]:
X_train = df[(df['Date'] < start_date + train_point)].reset_index()    #Complete the code to select all rows where the 'Date' is before '2019-04-01'
X_val = df[(df['Date'] >= start_date + train_point) & (df['Date'] < start_date + train_point + test_val_split)].reset_index()    #Complete the code to select all rows where the 'Date' is from '2019-04-01 to '2019-04-16' (excluded)
X_test = df[df['Date'] >= start_date + train_point + test_val_split].reset_index()    #Complete the code to select all rows where the 'Date' is from '2019-04-16' till the end.

In [126]:
X_train.shape, X_val.shape, X_test.shape

((274, 11), (31, 11), (44, 11))

In [127]:
X_train.drop(columns=['index', 'Date'], inplace=True)
X_val.drop(columns=['index', 'Date'], inplace=True)
X_test.drop(columns=['index', 'Date'], inplace=True)

In [128]:
X_train.shape, X_val.shape, X_test.shape

((274, 9), (31, 9), (44, 9))

In [129]:
y_train = X_train['Label']
y_val = X_val['Label']
y_test = X_test['Label']

In [130]:
y_train.head(20)

0     0
1     0
2     0
3     0
4     0
5     1
6     2
7     0
8     0
9     0
10    1
11    0
12    0
13    1
14    0
15    0
16    2
17    1
18    0
19    1
Name: Label, dtype: int64

In [131]:
X_train.head(20)

Unnamed: 0,News,Open,High,Low,Close,Volume,Label,Avg_Price,News_clean
0,The tech sector experienced a significant dec...,41.740002,42.244999,41.482498,40.246914,130672400,0,41.428603,tech sector experienc signific declin aftermar...
1,Apple lowered its fiscal Q1 revenue guidance ...,41.740002,42.244999,41.482498,40.246914,130672400,0,41.428603,appl lower fiscal q1 revenu guidanc 84 billion...
2,Apple cut its fiscal first quarter revenue fo...,41.740002,42.244999,41.482498,40.246914,130672400,0,41.428603,appl cut fiscal first quarter revenu forecast ...
3,This news article reports that yields on long...,41.740002,42.244999,41.482498,40.246914,130672400,0,41.428603,news articl report yield long date u treasuri ...
4,Apple's revenue warning led to a decline in U...,41.740002,42.244999,41.482498,40.246914,130672400,0,41.428603,appl revenu warn led declin usd jpi pair gain ...
5,Apple CEO Tim Cook discussed the company's Q1 ...,41.740002,42.244999,41.482498,40.246914,130672400,1,41.428603,appl ceo tim cook discuss compani q1 warn cnbc...
6,Roku Inc has announced plans to offer premium...,41.740002,42.244999,41.482498,40.246914,130672400,2,41.428603,roku inc announc plan offer premium video chan...
7,Wall Street saw modest gains on Wednesday but...,41.740002,42.244999,41.482498,40.246914,130672400,0,41.428603,wall street saw modest gain wednesday threaten...
8,Apple's fiscal first quarter revenue came in ...,41.740002,42.244999,41.482498,40.246914,130672400,0,41.428603,appl fiscal first quarter revenu came analyst ...
9,Apple Inc. lowered its quarterly sales foreca...,41.740002,42.244999,41.482498,40.246914,130672400,0,41.428603,appl inc lower quarterli sale forecast fiscal ...


In [132]:
# Creating a list of all words in our data
words_list = [item.split(" ") for item in df['News_clean'].values]

In [133]:
# Creating an instance of Word2Vec
vec_size = 300
model_W2V = Word2Vec(words_list, vector_size = vec_size, min_count = 1, window=5, workers = 6)

In [134]:
# Retrieving the words present in the Word2Vec model's vocabulary
words = list(model_W2V.wv.key_to_index.keys())

# Retrieving word vectors for all the words present in the model's vocabulary
wvs = model_W2V.wv[words].tolist()

# Creating a dictionary of words and their corresponding vectors
word_vector_dict = dict(zip(words, wvs))


In [135]:
# compute the average
def average_vectorizer_Word2Vec(doc, words, word_vector):
    """
    Computes the average vector representation of a sentence using Word2Vec.
    :param doc:
    :param words: list of words in the model vocabulary
    :param word_vector: dictionary of words and their corresponding vectors
    :return: feature vector
    """
    # Initializing a feature vector for the sentence
    feature_vector = np.zeros((vec_size,), dtype="float64")

    # Creating a list of words in the sentence that are present in the model vocabulary
    words_in_vocab = [word for word in doc.split() if word in words]

    # adding the vector representations of the words
    for word in words_in_vocab:
        feature_vector += np.array(word_vector[word])

    # Dividing by the number of words to get the average vector
    if len(words_in_vocab) != 0:
        feature_vector /= len(words_in_vocab)

    return feature_vector

In [136]:
def vectorized_document(df, target, words, word_vector):
    """
    Vectorized a document using Word2Vec.
    :param df: the datafram to vectorize
    :param target: the target column to vectorize
    :param words: words in the model vocabulary
    :param word_vector: vocabulary of words and their corresponding vectors
    :return: a pd.Datafram
    """
    tmp = pd.DataFrame(df[target].apply(lambda x: average_vectorizer_Word2Vec(x, words, word_vector).tolist()))
    return pd.DataFrame(tmp[target].tolist(),columns=['Feature' + str(i) for i in range(vec_size)])


In [137]:
X_train_wv = vectorized_document(X_train, 'News_clean', words, word_vector_dict)
X_train_wv[['Volume', 'Avg_Price']] = X_train[['Volume', 'Avg_Price']].astype(float)

In [138]:
X_val_wv = vectorized_document(X_val, 'News_clean', words, word_vector_dict)
X_val_wv[['Volume', 'Avg_Price']] = X_val[['Volume', 'Avg_Price']].astype(float)

In [139]:
X_test_wv = vectorized_document(X_test, 'News_clean', words, word_vector_dict)
X_test_wv[['Volume', 'Avg_Price']] = X_val[['Volume', 'Avg_Price']].astype(float)

In [158]:
X_train_wv.head()

Unnamed: 0,Feature0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,...,Feature292,Feature293,Feature294,Feature295,Feature296,Feature297,Feature298,Feature299,Volume,Avg_Price
0,0.000216,0.004676,-0.000964,0.002499,0.000301,-0.006099,0.003429,0.009592,0.002219,-0.001178,...,0.005197,0.000752,0.00517,0.006167,0.000384,-0.002627,0.003587,-0.000261,130672400.0,41.428603
1,0.000816,0.005574,-0.001678,0.003457,-0.000452,-0.007578,0.003915,0.011049,0.002374,-0.000585,...,0.005898,0.000718,0.005173,0.006782,0.001071,-0.003221,0.004376,-9.8e-05,130672400.0,41.428603
2,0.000437,0.004267,-0.00106,0.002809,-0.000246,-0.006333,0.002605,0.009335,0.002152,-0.000636,...,0.005287,0.000401,0.004533,0.005981,0.000992,-0.003503,0.003969,-0.000123,130672400.0,41.428603
3,-0.000498,0.005355,-0.001262,0.003202,-0.000261,-0.006521,0.003137,0.009843,0.002009,-0.001162,...,0.004688,0.000438,0.004628,0.005598,0.000988,-0.00263,0.003194,0.000119,130672400.0,41.428603
4,0.00069,0.00339,-0.000706,0.002732,-0.000233,-0.005372,0.002772,0.007674,0.001986,-0.000168,...,0.004659,0.000782,0.004412,0.004935,0.000705,-0.002466,0.003496,-0.000119,130672400.0,41.428603


### GloVe


In [140]:
# Converting the Stanford GloVe model vector format to word2vec
from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)
from gensim.models import KeyedVectors

# load the Stanford GloVe model
filename = 'glove.6B.100d.txt.word2vec'
glove_model = KeyedVectors.load_word2vec_format(filename, binary=False)
# Checking the size of the vocabulary
print("Length of the vocabulary is", len(glove_model.index_to_key))

Length of the vocabulary is 400000


In [141]:
glove_words = glove_model.index_to_key

In [142]:
glove_word_vector_dict = dict(zip(glove_model.index_to_key,list(glove_model.vectors)))

In [143]:
# compute average
def average_vectorizer_GloVe(doc):
    """
    Computes the average vector representation of a sentence using GloVe.
    :param doc:
    :return: vector
    """
    # Initializing a feature vector for the sentence
    feature_vector = np.zeros((vec_size,), dtype="float64")

    # Creating a list of words in the sentence that are present in the model vocabulary
    words_in_vocab = [word for word in doc.split() if word in glove_words]

    # adding the vector representations of the words
    for word in words_in_vocab:
        feature_vector += np.array(glove_word_vector_dict[word])

    # Dividing by the number of words to get the average vector
    if len(words_in_vocab) != 0:
        feature_vector /= len(words_in_vocab)

    return feature_vector

In [144]:
vec_size=100

In [145]:
X_train_gl = vectorized_document(X_train, 'News_clean', glove_words, glove_word_vector_dict)
X_train_gl[['Volume', 'Avg_Price']] = X_train[['Volume', 'Avg_Price']].astype(float)

In [146]:
X_val_gl = vectorized_document(X_val, 'News_clean', glove_words, glove_word_vector_dict)
X_val_gl[['Volume', 'Avg_Price']] = X_train[['Volume', 'Avg_Price']].astype(float)

In [147]:
X_test_gl = vectorized_document(X_test, 'News_clean', glove_words, glove_word_vector_dict)
X_test_gl[['Volume', 'Avg_Price']] = X_train[['Volume', 'Avg_Price']].astype(float)

In [148]:
X_train_gl.head()

Unnamed: 0,Feature0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,...,Feature92,Feature93,Feature94,Feature95,Feature96,Feature97,Feature98,Feature99,Volume,Avg_Price
0,0.021671,0.096785,-0.047464,-0.066726,-0.215078,-0.603108,-0.089054,-0.001979,0.114239,-0.132121,...,-0.146394,-0.217284,-0.13773,0.162514,0.208869,0.056668,0.186156,-0.02554,130672400.0,41.428603
1,0.171825,0.341351,0.23465,-0.042054,-0.08284,-0.600302,-0.060674,-0.101788,-0.159387,0.023604,...,-0.190244,-0.13607,-0.442046,0.225178,0.17598,-0.036701,0.41009,-0.159631,130672400.0,41.428603
2,0.010512,0.270341,0.301482,-0.087113,0.075485,-0.476547,-0.039534,-0.014161,-0.123561,-0.054536,...,-0.095302,-0.220799,-0.559601,0.130351,0.039756,-0.088995,0.484577,-0.204774,130672400.0,41.428603
3,-0.147407,0.22697,0.377158,0.184659,-0.110523,-0.447424,-0.103314,0.033698,-0.021951,-0.041905,...,-0.276519,0.03475,-0.347012,0.131816,0.201519,-0.220721,0.328415,-0.101222,130672400.0,41.428603
4,0.040798,0.198312,0.07146,0.037332,-0.098294,-0.407203,-0.078844,-0.102327,-0.13238,-0.031861,...,-0.051261,-0.364269,-0.216124,0.242409,0.24328,-0.071136,0.173813,-0.131392,130672400.0,41.428603


### Sentence transformer

In [149]:
from sentence_transformers import SentenceTransformer

In [150]:
#Defining the model
model_st = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [151]:
import torch

In [152]:
# setting the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [153]:
# encoding the dataset
X_train_st = model_st.encode(X_train['News'], show_progress_bar=True, device=device)
X_val_st = model_st.encode(X_val['News'], show_progress_bar=True, device=device)
X_test_st = model_st.encode(X_test['News'], show_progress_bar=True, device=device)

Batches: 100%|██████████| 9/9 [00:02<00:00,  4.30it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.03it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  5.65it/s]


In [154]:
size = X_train_st.shape[1]

In [155]:
X_train_st = pd.DataFrame(X_train_st)
X_train_st.columns = ['Feature' + str(i) for i in range(size)]
X_train_st[['Avg_Price', 'Volume']] = X_train[['Avg_Price', 'Volume']].astype(float)

X_val_st = pd.DataFrame(X_val_st)
X_val_st.columns = ['Feature' + str(i) for i in range(size)]
X_val_st[['Avg_Price', 'Volume']] = X_val[['Avg_Price', 'Volume']].astype(float)

X_test_st = pd.DataFrame(X_test_st)
X_test_st.columns = ['Feature' + str(i) for i in range(size)]
X_test_st[['Avg_Price', 'Volume']] = X_test[['Avg_Price', 'Volume']].astype(float)

In [156]:
X_train_st.head()

Unnamed: 0,Feature0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,...,Feature376,Feature377,Feature378,Feature379,Feature380,Feature381,Feature382,Feature383,Avg_Price,Volume
0,-0.002023,-0.036774,0.077354,0.046713,0.032552,0.002102,0.043283,0.039535,0.058228,0.008875,...,0.003257,-0.000594,-0.055096,-0.027134,-0.003575,-0.131502,0.074163,0.05751,41.428603,130672400.0
1,0.013749,0.048934,0.089868,0.047753,-0.016835,0.044599,0.058056,0.064367,0.060613,0.045722,...,-0.002635,-0.020136,-0.077832,-0.023604,0.053693,-0.123782,0.061468,0.003924,41.428603,130672400.0
2,0.030976,0.002636,0.090139,0.065713,0.003364,-0.001101,0.054763,0.017497,0.027577,0.037116,...,-0.028524,-0.053376,-0.107999,-0.035857,0.007203,-0.111301,0.011414,0.078812,41.428603,130672400.0
3,-0.005294,-0.009259,-0.00967,0.057635,-0.025521,-0.037319,-0.060817,0.088207,-0.001648,0.016527,...,-0.047937,0.001252,-0.067532,-0.055194,0.034575,-0.112039,0.041667,0.098242,41.428603,130672400.0
4,-0.002146,0.016797,-0.004066,0.08201,0.000499,0.013296,0.095996,0.113363,0.051394,-0.006462,...,-0.043031,-0.021357,-0.055635,0.054795,0.052306,-0.15425,0.005198,0.008437,41.428603,130672400.0


In [157]:
X_val_st.head()

Unnamed: 0,Feature0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,...,Feature376,Feature377,Feature378,Feature379,Feature380,Feature381,Feature382,Feature383,Avg_Price,Volume
0,0.000158,-0.029377,0.027311,0.009435,0.005225,0.013579,-0.035999,0.061863,0.049671,0.00967,...,-0.086341,0.028364,-0.049615,-0.039821,0.046718,-0.117746,0.015545,0.069136,46.904922,199202000.0
1,-0.045317,-0.045913,0.111688,0.099022,0.053137,0.001026,0.019429,0.02221,-0.019778,-0.026104,...,-0.045251,0.041996,-0.044994,-0.061237,0.072046,-0.127659,-0.022423,0.131794,46.904922,199202000.0
2,-0.022719,-0.031737,0.002823,0.034345,-0.044363,0.025113,-0.005354,0.007245,0.047596,-0.107752,...,-0.044412,-0.037354,0.031592,-0.081242,0.105062,-0.080331,0.048208,-0.031003,46.904922,199202000.0
3,-0.11917,0.008578,-0.011554,-0.01001,-0.008426,0.058581,0.052131,-0.01745,-0.006025,0.063895,...,0.049777,-0.077668,-0.022233,-0.021424,0.015831,-0.123832,0.051015,0.126861,46.904922,199202000.0
4,0.039347,-0.093859,0.048436,0.042893,0.045026,-0.001638,-0.006283,0.077525,0.013749,0.00445,...,-0.071864,0.021245,-0.043629,-0.015793,-0.018176,-0.069422,0.044052,0.065704,46.75312,119393600.0


## Sentiment Analysis

In [161]:
param_grid = {
    'classifier': [
        RandomForestClassifier(),
        AdaBoostClassifier(),
        BaggingClassifier(),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
        XGBClassifier(use_label_encoder=False, eval_metric='logloss')  # XGBoost specific params
    ],
    # For tree-based classifiers
    'classifier__n_estimators': [50, 100, 200],  # Hyperparameter for ensemble classifiers
    'classifier__max_depth': [None, 10, 20, 30],  # Max depth for tree-based models
    # For boosting models
    'classifier__learning_rate': np.logspace(-3, 0, 4),  # Learning rate for boosting algorithms
}

In [165]:
param_grid

{'classifier': [RandomForestClassifier(),
  AdaBoostClassifier(),
  BaggingClassifier(),
  GradientBoostingClassifier(),
  DecisionTreeClassifier(),
  XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=None, device=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric='logloss',
                feature_types=None, gamma=None, grow_policy=None,
                importance_type=None, interaction_constraints=None,
                learning_rate=None, max_bin=None, max_cat_threshold=None,
                max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
                max_leaves=None, min_child_weight=None, missing=nan,
                monotone_constraints=None, multi_strategy=None, n_estimators=None,
                n_jobs=None, num_parallel_tree=None, random_state=None, ...)],
 'classifier__n_estimators': [100, 200, 300],
 'classifier__max_

In [164]:
param_grid = {
    'classifier': [
        RandomForestClassifier(),
        AdaBoostClassifier(),
        BaggingClassifier(),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
        XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    ],
    # Hyperparameters for RandomForestClassifier
    'classifier__n_estimators': [100, 200, 500],  # Number of trees in the forest
    'classifier__max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'classifier__min_samples_split': [2, 5, 10],  # Minimum samples required to split an internal node
    'classifier__min_samples_leaf': [1, 2, 4],  # Minimum number of samples at a leaf node
    'classifier__bootstrap': [True, False],  # Whether bootstrap samples are used

    # Hyperparameters for AdaBoostClassifier
    'classifier__n_estimators': [50, 100, 200],  # Number of boosting rounds
    'classifier__learning_rate': [0.01, 0.1, 0.5, 1.0],  # Learning rate shrinks the contribution of each classifier
    'classifier__algorithm': ['SAMME', 'SAMME.R'],  # Boosting algorithm

    # Hyperparameters for BaggingClassifier
    'classifier__n_estimators': [50, 100, 300],  # Number of base estimators in the ensemble
    'classifier__max_samples': [0.6, 0.8, 1.0],  # Max number of samples to draw from X to train each base estimator
    'classifier__max_features': [0.6, 0.8, 1.0],  # Max features to draw from X to train each base estimator
    'classifier__bootstrap': [True, False],  # Whether bootstrap samples are used
    'classifier__bootstrap_features': [True, False],  # Whether bootstrap samples are used when selecting features

    # Hyperparameters for GradientBoostingClassifier
    'classifier__n_estimators': [100, 200, 300],  # Number of boosting stages to be run
    'classifier__learning_rate': [0.01, 0.1, 0.2],  # Shrinks contribution of each tree
    'classifier__max_depth': [3, 5, 10],  # Maximum depth of the individual regression estimators
    'classifier__min_samples_split': [2, 5, 10],  # Minimum samples required to split an internal node
    'classifier__min_samples_leaf': [1, 2, 4],  # Minimum number of samples at a leaf node
    'classifier__subsample': [0.6, 0.8, 1.0],  # Subsample ratio of the training instances

    # Hyperparameters for DecisionTreeClassifier
    'classifier__criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
    'classifier__splitter': ['best', 'random'],  # Strategy used to split at each node
    'classifier__max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'classifier__min_samples_split': [2, 5, 10],  # Minimum samples required to split an internal node
    'classifier__min_samples_leaf': [1, 2, 4],  # Minimum number of samples at a leaf node

    # Hyperparameters for XGBClassifier
    'classifier__n_estimators': [100, 200, 300],  # Number of boosting rounds
    'classifier__learning_rate': [0.01, 0.1, 0.2],  # Learning rate shrinks the contribution of each classifier
    'classifier__max_depth': [3, 5, 10],  # Maximum depth of the trees
    'classifier__min_child_weight': [1, 3, 5],  # Minimum sum of instance weight needed in a child
    'classifier__subsample': [0.6, 0.8, 1.0],  # Subsample ratio of the training instances
    'classifier__colsample_bytree': [0.6, 0.8, 1.0],  # Subsample ratio of columns when constructing each tree
    'classifier__gamma': [0, 0.1, 0.2],  # Minimum loss reduction required to make a further partition
    'classifier__scale_pos_weight': [1, 10, 25],  # Controls balance of positive and negative weights for imbalanced classes
}

In [162]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import train_test_split

# Assuming your word embeddings are already prepared in X (as dense vectors) and y (target labels)
# Example: Simulating imbalanced dataset
# X, y = make_classification(n_samples=1000, n_features=300, n_classes=2, weights=[0.9, 0.1], random_state=42)  # Imbalanced dataset

# Split the data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with a placeholder for the classifier
pipeline = Pipeline([
    ('classifier', RandomForestClassifier())  # Placeholder for the classifier
])



# Create a custom F1 scorer for imbalanced datasets using 'weighted' average
f1_scorer = make_scorer(f1_score, average='weighted')

# Create RandomizedSearchCV with the pipeline
random_search = RandomizedSearchCV(pipeline, param_distributions=param_grid, n_iter=10, scoring=f1_scorer, cv=5, random_state=42)

# Fit the model
random_search.fit(X_train_wv, y_train)

# Print the best estimator and the best F1 score
print(f"Best estimator: {random_search.best_estimator_}")
print(f"Best F1 score (cross-validated): {random_search.best_score_}")

# Evaluate on the test set
y_pred = random_search.predict(X_test)
test_f1_score = f1_score(y_test, y_pred, average='weighted')
print(f"Test set F1 score: {test_f1_score}")

ValueError: Invalid parameter learning_rate for estimator RandomForestClassifier(max_depth=30, n_estimators=50). Check the list of available parameters with `estimator.get_params().keys()`.