# Import libraries

In [1]:
import pandas as pd
import numpy as np
import sklearn
import xgboost
import matplotlib.pyplot as plt
import boto3

In [2]:
import warnings
warnings.simplefilter("ignore")

# Setting up MLFLOW for tracking

In [3]:
import mlflow
from mlflow.tracking.client import MlflowClient

In [4]:
import os
mlflow_tracking_url = "http://mlflow-server:8089"
minio_s3_url="http://minio:9000"
os.environ['MLFLOW_TRACKING_USERNAME'] = 'lee'
os.environ['MLFLOW_TRACKING_PASSWORD'] = 'leesecret'
os.environ['MLFLOW_TRACKING_URI'] = mlflow_tracking_url
os.environ['MLFLOW_S3_ENDPOINT_URL'] = minio_s3_url
os.environ['MLFLOW_DEFAULT_ARTIFACT_ROOT']="s3://mlflow/"
os.environ['AWS_ACCESS_KEY_ID'] = 'lee'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'leesecret'

In [5]:
# setting mlflow tracking url
mlflow.set_tracking_uri(uri=mlflow_tracking_url)

# set experiment name
mlflow.set_experiment(experiment_name="Stock News Sentiment")

<Experiment: artifact_location='s3://mlflow/1', creation_time=1763235110985, experiment_id='1', last_update_time=1763235110985, lifecycle_stage='active', name='Stock News Sentiment', tags={}>

## Import dataset

In [6]:
dataset_file = "./financial_news_events.csv"

# read dataset file
df = pd.read_csv(dataset_file)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3024 entries, 0 to 3023
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Date                  3024 non-null   object 
 1   Headline              2876 non-null   object 
 2   Source                3024 non-null   object 
 3   Market_Event          3024 non-null   object 
 4   Market_Index          3024 non-null   object 
 5   Index_Change_Percent  2863 non-null   float64
 6   Trading_Volume        3024 non-null   float64
 7   Sentiment             2853 non-null   object 
 8   Sector                3024 non-null   object 
 9   Impact_Level          3024 non-null   object 
 10  Related_Company       3024 non-null   object 
 11  News_Url              2871 non-null   object 
dtypes: float64(2), object(10)
memory usage: 283.6+ KB


## Cleanse the data

In [8]:
### Remove the rows that has blank Headlines or Sentiments
df = df.dropna(subset=["Headline","Sentiment"])

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2714 entries, 2 to 3023
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Date                  2714 non-null   object 
 1   Headline              2714 non-null   object 
 2   Source                2714 non-null   object 
 3   Market_Event          2714 non-null   object 
 4   Market_Index          2714 non-null   object 
 5   Index_Change_Percent  2570 non-null   float64
 6   Trading_Volume        2714 non-null   float64
 7   Sentiment             2714 non-null   object 
 8   Sector                2714 non-null   object 
 9   Impact_Level          2714 non-null   object 
 10  Related_Company       2714 non-null   object 
 11  News_Url              2578 non-null   object 
dtypes: float64(2), object(10)
memory usage: 275.6+ KB


In [10]:
df["Impact_Level"].unique()

array(['Medium', 'Low', 'High'], dtype=object)

In [11]:
### Select only required columns
df = df[["Headline","Sector","Related_Company","Trading_Volume","Impact_Level","Sentiment"]]


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2714 entries, 2 to 3023
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Headline         2714 non-null   object 
 1   Sector           2714 non-null   object 
 2   Related_Company  2714 non-null   object 
 3   Trading_Volume   2714 non-null   float64
 4   Impact_Level     2714 non-null   object 
 5   Sentiment        2714 non-null   object 
dtypes: float64(1), object(5)
memory usage: 148.4+ KB


In [13]:
df["Sentiment"].unique()

array(['Neutral', 'Positive', 'Negative'], dtype=object)

In [14]:
### Assign unique numeric value to each Sentiment category for training purpose
df["Sentiment"] = df["Sentiment"].astype("category").cat.codes
# df["Headline"] = df["Headline"].astype(str)
# df["Sector"] = df["Sector"].astype(str)
# df["Related_Company"] = df["Related_Company"].astype(str)
# df["Impact_Level"]= df["Impact_Level"].astype(str)

from sklearn import preprocessing 
lbl = preprocessing.LabelEncoder() 
df["Headline"] = lbl.fit_transform(df["Headline"].astype(str))
df["Sector"] = lbl.fit_transform(df["Sector"].astype(str))
df["Related_Company"] = lbl.fit_transform(df["Related_Company"].astype(str))
df["Impact_Level"] = lbl.fit_transform(df["Impact_Level"].astype("category"))
# df["Sentiment"] = lbl.fit_transform(df["Sentiment"].astype("category"))

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2714 entries, 2 to 3023
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Headline         2714 non-null   int64  
 1   Sector           2714 non-null   int64  
 2   Related_Company  2714 non-null   int64  
 3   Trading_Volume   2714 non-null   float64
 4   Impact_Level     2714 non-null   int64  
 5   Sentiment        2714 non-null   int8   
dtypes: float64(1), int64(4), int8(1)
memory usage: 129.9 KB


In [16]:
df["Sentiment"].unique()

array([1, 2, 0], dtype=int8)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2714 entries, 2 to 3023
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Headline         2714 non-null   int64  
 1   Sector           2714 non-null   int64  
 2   Related_Company  2714 non-null   int64  
 3   Trading_Volume   2714 non-null   float64
 4   Impact_Level     2714 non-null   int64  
 5   Sentiment        2714 non-null   int8   
dtypes: float64(1), int64(4), int8(1)
memory usage: 129.9 KB


## Train and Test Dataset

In [18]:
from sklearn.model_selection import train_test_split

### Split into training & test set. Test is taken as 30% and 70% data as Training
X_Train, X_Test, Y_Train, Y_Test = train_test_split(df.drop("Sentiment", axis=1),df["Sentiment"],test_size=0.3)

In [19]:
X_Train.count()

Headline           1899
Sector             1899
Related_Company    1899
Trading_Volume     1899
Impact_Level       1899
dtype: int64

In [20]:
Y_Train.info()

<class 'pandas.core.series.Series'>
Index: 1899 entries, 1722 to 909
Series name: Sentiment
Non-Null Count  Dtype
--------------  -----
1899 non-null   int8 
dtypes: int8(1)
memory usage: 16.7 KB


In [21]:
X_Test.count()

Headline           815
Sector             815
Related_Company    815
Trading_Volume     815
Impact_Level       815
dtype: int64

In [22]:
Y_Test.info()

<class 'pandas.core.series.Series'>
Index: 815 entries, 1010 to 1089
Series name: Sentiment
Non-Null Count  Dtype
--------------  -----
815 non-null    int8 
dtypes: int8(1)
memory usage: 7.2 KB


## Train the model

In [23]:
### enable autologging of xgboost model training in mlflow platform
mlflow.xgboost.autolog()

In [24]:
from xgboost import XGBClassifier

### use_label_encoder=False : disables automatic conversion of label by xgboost. Encoding by ourself.
### logloss for binary classification
model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")

In [25]:
## Start capturing run info in mlflow
run =mlflow.start_run()

In [26]:
model.fit(X_Train,Y_Train,eval_set=[(X_Test, Y_Test)], verbose=False)



0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [48]:
y_pred = model.predict(X_Test)
y_pred.shape

(815,)

In [49]:
Y_Test.shape

(815,)

In [53]:
from sklearn.metrics import accuracy_score, recall_score,f1_score
accuracy = f1_score(Y_Test,y_pred,average="weighted")
recall = f1_score(Y_Test,y_pred,average="weighted")

In [58]:
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Recall: %.2f%%" % (recall * 100.0))

Accuracy: 29.60%
Recall: 29.60%


In [59]:
mlflow.end_run()

üèÉ View run invincible-pig-400 at: http://mlflow-server:8089/#/experiments/1/runs/7a3bebf6e87940c58c585dfba2df5275
üß™ View experiment at: http://mlflow-server:8089/#/experiments/1
