In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import os

In [2]:
# Notebook-wide display settings.
# Draw all figures with matplotlib's 'ggplot' style sheet.
plt.style.use('ggplot')

# Lift the column-width limit so long text cells (e.g. headlines) are shown untruncated.
pd.set_option('display.max_colwidth', None)

In [12]:
#Read in rawData
# Project root: defaults to the original development checkout, but can be overridden
# with the FINANCIAL_NEWS_PROJECT_ROOT environment variable so the notebook is not
# tied to a single machine's directory layout.
path= os.environ.get(
    'FINANCIAL_NEWS_PROJECT_ROOT',
    r'/Users/ksharma/Documents/ML Engineer/Machine Learning/Projects/FinancialNewsSentimentAnalysis/',
)
config_name= 'config.yaml'

#read yaml file
# An explicit encoding keeps parsing independent of the platform's default locale.
with open(os.path.join(path, config_name), encoding='utf-8') as file:
    config = yaml.safe_load(file)

# The CSV is read without a header row (header=None), so the two column names
# are assigned explicitly afterwards.
rawData= pd.read_csv(config['paths']['rawData'], header=None)
rawData.columns= ['sentiment', 'newsHeadline']

**Preview Data**

In [None]:
# Summarise the raw data: overall dimensions, class balance, and the first rows.
shape_text = str(rawData.shape)
sentiment_counts = rawData['sentiment'].value_counts()

print('Data Shape: %s\n' % shape_text)
print('Value counts: \n%s\n' % sentiment_counts)
display(rawData.head())

**Preprocess Data**

In [None]:
# Switch to the project root so the local `src` package can be imported.
# Reuses `path` from the config-loading cell instead of repeating the absolute
# path, so the project location is defined in exactly one place.
os.chdir(path)
from src.preprocessing.preprocessing import Preprocessor

#Normalize, tokenize and lemmatize text
tokens= Preprocessor(rawData.loc[:,'newsHeadline'])

# The three steps are run in this fixed order; each is called for its effect on `tokens`.
tokens.normalization()
tokens.tokenization()
tokens.lemmatization()

#Append lemma to raw data (work on a copy so rawData itself stays untouched)
# NOTE(review): presumably lemmaTokens holds one token list per headline — confirm in Preprocessor.
cleanDf= rawData.copy()
cleanDf['lemmaTokens']= tokens.lemmaTokens

#Join lemmaTokens into a single space-separated string per row and append to cleanDf
cleanDf['lemmaTokensSentences']= cleanDf['lemmaTokens'].apply(' '.join)

#Check
display(cleanDf.head())

**Split Data**

In [None]:
# Features: the joined lemma strings (kept as a Series).
X = cleanDf['lemmaTokensSentences']
# Target: the sentiment column, taken via `.values`.
y = cleanDf['sentiment'].values

In [None]:
#Initialize data representation object
# DataRepresentation is a project class; it is constructed here from the feature
# text (X) and the label array (y) built in the previous cell.
from src.preprocessing.preprocessing import DataRepresentation

trial1= DataRepresentation(X, y)
# NOTE(review): presumably creates the train/test partitions (a later cell reads
# trial1.y_train) — confirm the split ratio and seeding in src/preprocessing.
trial1.split_data()

**Evaluate Base Models**

In [None]:
#Create BoW representation of data
# NOTE(review): the result name suggests a bag-of-words matrix for the training
# split only — confirm what DataRepresentation.bag_of_words fits on and returns.
X_train_BoW= trial1.bag_of_words()

In [None]:
#Init Models object and evaluate base models
from src.modeling.modeling import Models

# Models is built from the BoW training features and the training labels held by trial1.
baseModels= Models(X_train_BoW, trial1.y_train)

# NOTE(review): presumably fits and scores a set of default classifiers and reports
# the results — which models and metrics are used must be checked in src/modeling.
baseModels.base_model_evaluation()

**Next, we'll tune hyperparameters for Logistic Regression, Random Forest Classifier, and Linear SVC.**

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules before
# each cell executes, so edits to the local `src` package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Re-load the autoreload extension.
# NOTE(review): looks redundant with the %load_ext cell directly above — confirm whether this cell is still needed.
%reload_ext autoreload

In [14]:
# Inspect the logistic-regression tuning grid exactly as parsed from config.yaml.
# NOTE(review): the captured output suggests the YAML is malformed — 'n_jobs' comes back
# as the string '-1}', 'kernel' as one string "('linear', 'rbf')" rather than a list, and
# 'random_state'/'n_jobs' are scalars inside a grid — verify config.yaml before tuning.
config['tuning']['logisticRegression']['param_grid']

{'penalty': ['l1', 'l2', 'elasticnet', 'none'],
 'C': [0.1, 0.01, 1],
 'kernel': "('linear', 'rbf')",
 'random_state': 24,
 'n_jobs': '-1}'}