# Stock News Sentiment Analysis

In [1]:
import pandas as pd

## Step 1: News headline extraction for a few selected companies from the Finviz website

### company_names = 'qualcomm','american express','morgan stanley','Deutsche Bank','Citigroup','bank of montreal','Barclays','bank of hawaii','Bank of america'

In [None]:
from urllib.request import urlopen,Request
from bs4 import BeautifulSoup


In [None]:
# Base quote URL; the ticker symbol is appended to it.
finviz_url = "https://finviz.com/quote.ashx?t="
tickers = ['QCOM','AXP','MS','BCS','DB','C','BMO','BOH','BAC']

# Maps ticker -> the parsed element with id="news-table" from that ticker's quote page.
news_tables = {}
for ticker in tickers:
    url = finviz_url + ticker

    # A user-agent header is sent explicitly with every request.
    req = Request(url=url, headers={'user-agent': 'my-app'})

    # Parse inside the context manager so the HTTP connection is always closed,
    # even when parsing raises.
    with urlopen(req) as response:
        html = BeautifulSoup(response, features='html.parser')

    news_table = html.find(id='news-table')
    if news_table is None:
        # find() returns None when the page has no such element; storing None
        # would only crash later during parsing, so report it and move on.
        print(f'No news table found for {ticker}; skipping')
        continue
    news_tables[ticker] = news_table

parsed_data = []

In [None]:
# Start from an empty list every time this cell runs; otherwise re-running the
# cell would append the same headlines again and duplicate rows in df3.
parsed_data = []

for ticker, news_table in news_tables.items():
    if news_table is None:
        # No news table was found for this ticker when scraping.
        continue

    for row in news_table.find_all('tr'):
        # Rows without a link or without a cell carry no headline; accessing
        # .text on them would raise AttributeError.
        if row.a is None or row.td is None:
            continue

        title = row.a.text.strip()
        date_time_text = row.td.text.strip()

        if ' ' in date_time_text:  # Both date and time are present
            date, time = date_time_text.split(' ', 1)
        else:  # Only time is present; the date is filled in later from the rows above
            date, time = '', date_time_text

        parsed_data.append([ticker, date, time, title])

df3 = pd.DataFrame(parsed_data, columns=['ticker', 'date', 'time', 'title'])


def convert_date_format(date_str):
    """Re-format a '%b-%d-%y' date string (e.g. 'Jun-17-24') as 'DD-MM-YYYY'.

    Any value for which parsing or formatting raises ValueError is returned
    unchanged, so the caller can deal with it afterwards.
    """
    try:
        parsed = pd.to_datetime(date_str, format='%b-%d-%y')
        reformatted = parsed.strftime('%d-%m-%Y')
    except ValueError:
        return date_str
    return reformatted


# Normalise every scraped date to DD-MM-YYYY; values that cannot be parsed stay as they are.
df3['date'] = df3['date'].map(convert_date_format)





In [None]:
# Rows that only carried a time have an empty date; give them the date of the
# closest dated row above.
df3['date'] = df3['date'].replace('', pd.NA).ffill()

# Same-day headlines carry the literal 'Today'. Stamp them with the date on which
# this notebook is executed (i.e. the scrape date) instead of a hard-coded day,
# so a re-run on another day does not mislabel them.
# NOTE(review): uses the local clock; confirm this matches the site's notion of "today".
scrape_date = pd.Timestamp.today().strftime('%d-%m-%Y')
df3['date'] = df3['date'].replace('Today', scrape_date)

print(df3.head(2))
# The index is written too; the reading cell sees it as an 'Unnamed: 0' column.
df3.to_csv('raw_stock_news.csv')

  ticker        date     time  \
0   QCOM  17-06-2024  09:34PM   
1   QCOM  17-06-2024  05:06PM   

                                               title  
0  Qualcomm to benefit most from slow Samsung 3nm...  
1  Is Qualcomm (NASDAQ:QCOM) the Best AI PC Stock...  


In [63]:
# Reload the scraped headlines from disk. The CSV was saved together with its index,
# so the frame comes back with an extra 'Unnamed: 0' column.
df_n = pd.read_csv('raw_stock_news.csv')

### Selecting the most important news item from multiple news articles for a specific date

In [3]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import matplotlib.pyplot as plt
import nltk

In [4]:
# Download the 'vader_lexicon' NLTK resource; the VADER SentimentIntensityAnalyzer
# imported from nltk.sentiment.vader is instantiated in the next cell.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [6]:
# Analyzer shared by every scoring call below.
vader = SentimentIntensityAnalyzer()


def f(title):
    """Return the 'compound' entry of VADER's polarity scores for one headline."""
    return vader.polarity_scores(title)['compound']


# One compound score per headline.
df_n['compound'] = df_n['title'].apply(f)

print(df_n)

     Unnamed: 0 ticker        date     time  \
0             0   QCOM  17-06-2024  09:34PM   
1             1   QCOM  17-06-2024  05:06PM   
2             2   QCOM  17-06-2024  04:57PM   
3             3   QCOM  17-06-2024  04:51PM   
4             4   QCOM  17-06-2024  02:44PM   
..          ...    ...         ...      ...   
895         895    BAC  04-06-2024  08:02AM   
896         896    BAC  04-06-2024  06:15AM   
897         897    BAC  03-06-2024  05:45PM   
898         898    BAC  03-06-2024  05:06PM   
899         899    BAC  03-06-2024  02:34PM   

                                                 title  compound  
0    Qualcomm to benefit most from slow Samsung 3nm...    0.4588  
1    Is Qualcomm (NASDAQ:QCOM) the Best AI PC Stock...    0.6369  
2    Nvidia Turbocharges Semiconductor ETFs Even As...    0.5994  
3    Qualcomm reaches $75 million settlement over s...    0.0516  
4    Stocks to Watch Tuesday: Qualcomm, Micron Tech...    0.2732  
..                               

In [12]:

# For each (ticker, date) pair, find the row whose compound score has the largest
# absolute value, i.e. the most strongly worded headline of that day.
strongest_idx = df_n.groupby(['ticker', 'date'])['compound'].apply(
    lambda scores: scores.abs().idxmax()
)
df_filtered = df_n.loc[strongest_idx].reset_index(drop=True)

# Original row number (from the saved index column) of the first selected headline.
print(df_filtered['Unnamed: 0'][0])

# The saved index column is not needed any further.
df_filtered = df_filtered.drop(columns='Unnamed: 0')
print(df_filtered)

df_filtered.to_csv('M2_news_data.csv')

138
    ticker        date     time  \
0      AXP  03-06-2024  05:45PM   
1      AXP  04-06-2024  06:45AM   
2      AXP  05-06-2024  07:10AM   
3      AXP  06-06-2024  06:36AM   
4      AXP  07-06-2024  02:04PM   
..     ...         ...      ...   
273   QCOM  26-05-2024  04:03PM   
274   QCOM  28-05-2024  06:22AM   
275   QCOM  29-05-2024  09:11AM   
276   QCOM  30-05-2024  06:43AM   
277   QCOM  31-05-2024  01:38PM   

                                                 title  compound  
0    American Express (AXP) Stock Declines While Ma...    0.4215  
1    3 Blue-Chip Stocks You'll Regret Not Buying in...   -0.4215  
2      The 3 Best Blue-Chip Stocks to Buy in June 2024    0.6369  
3    Hidden Treasures: 3 Dow Stocks That Deserve MU...    0.8062  
4    American Express (AXP) Rises 36% in a Year: Mo...    0.0000  
..                                                 ...       ...  
273  10 Best AI Stocks to Buy for 2024 According to...    0.6369  
274  If You Can Only Buy One Stock in M

## Data extraction of true stock values of selected companies in 2024 

### From Yahoo Finance historical data

In [None]:


# One historical-price CSV per company; the two lists are paired by position.
file_names = [
    'QCOM.csv',
    'AXP.csv',
    'MS.csv',
    'DB.csv',
    'C.csv',
    'BMO.csv',
    'BCS.csv',
    'BOH.csv',
    'BAC.csv',
]
company_names = [
    'qualcomm',
    'american express',
    'morgan stanley',
    'Deutsche Bank',
    'Citigroup',
    'bank of montreal',
    'Barclays',
    'bank of hawaii',
    'Bank of america',
]

def add_bullish_bearish_columns_and_combine(file_names, company_names, output_file, start_date='01-01-2024'):
    """Label every trading day as Bullish/Bearish and write all companies to one CSV.

    Parameters
    ----------
    file_names : list of str
        Paths of the per-company price CSVs. Each file must provide 'Date',
        'Open' and 'Close' columns. The part of the file name before the first
        '.' (directories stripped) is stored as the ticker.
    company_names : list of str
        Company names, paired with ``file_names`` by position.
    output_file : str
        Path of the combined CSV to write (written without the index).
    start_date : str, default '01-01-2024'
        Earliest date to keep, in DD-MM-YYYY format.

    A day is 'Bullish' when Close is strictly greater than Open, otherwise 'Bearish'.
    Dates in the output are formatted as DD-MM-YYYY.
    """
    import os  # local import: only needed to strip directories from the file path

    # The cutoff does not depend on the file, so parse it once.
    cutoff = pd.to_datetime(start_date, format='%d-%m-%Y')

    # Collect the per-company frames and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    frames = []
    for csv_file, company_name in zip(file_names, company_names):
        # dayfirst=True: dates in the input are read as day-first
        df = pd.read_csv(csv_file, parse_dates=['Date'], dayfirst=True)

        # .copy() so the new columns are added to an independent frame rather
        # than to a filtered view of the one read from disk.
        df = df[df['Date'] >= cutoff].copy()
        if df.empty:
            # Nothing on or after the cutoff for this company.
            continue

        # Vectorised comparison instead of a row-by-row apply.
        df['Bullish/Bearish'] = (df['Close'] > df['Open']).map({True: 'Bullish', False: 'Bearish'})
        # basename() keeps the ticker correct when the CSV lives in another
        # directory (e.g. './QCOM.csv' or 'data/QCOM.csv').
        df['ticker'] = os.path.basename(csv_file).split('.')[0]
        df['company_name'] = company_name

        df['Date'] = df['Date'].dt.strftime('%d-%m-%Y')

        frames.append(df)

    combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    combined_df.to_csv(output_file, index=False)


# Build the combined price file, using the function's default start date.
output_file = 'combined_stock_data.csv'
add_bullish_bearish_columns_and_combine(
    file_names=file_names,
    company_names=company_names,
    output_file=output_file,
)


In [None]:
# Read the combined price file back in and preview its first rows.
df4 = pd.read_csv('combined_stock_data.csv')
df4.head()


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Bullish/Bearish,ticker,company_name
0,02-01-2024,142.190002,142.199997,138.779999,140.229996,138.951874,8495800,Bearish,QCOM,qualcomm
1,03-01-2024,138.889999,138.889999,136.990005,137.600006,136.345856,8133300,Bearish,QCOM,qualcomm
2,04-01-2024,135.440002,137.330002,134.940002,136.169998,134.928879,6770300,Bullish,QCOM,qualcomm
3,05-01-2024,136.160004,138.070007,135.850006,136.729996,135.483765,6826500,Bullish,QCOM,qualcomm
4,08-01-2024,136.990005,139.149994,136.639999,139.029999,137.762802,7729400,Bullish,QCOM,qualcomm


## Merge datasets on date and ticker for final dataframe

#### If a date is missing in df4 (i.e. the true price data), the data of the next available date is used

In [69]:
# NOTE(review): repeats the load from the earlier preview cell; presumably kept so the
# merge section can be run on its own. 'Date' is read as plain text here and is
# converted to datetime in the next cell.
df4 = pd.read_csv('combined_stock_data.csv')
df4.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Bullish/Bearish,ticker,company_name
0,02-01-2024,142.190002,142.199997,138.779999,140.229996,138.951874,8495800,Bearish,QCOM,qualcomm
1,03-01-2024,138.889999,138.889999,136.990005,137.600006,136.345856,8133300,Bearish,QCOM,qualcomm
2,04-01-2024,135.440002,137.330002,134.940002,136.169998,134.928879,6770300,Bullish,QCOM,qualcomm
3,05-01-2024,136.160004,138.070007,135.850006,136.729996,135.483765,6826500,Bullish,QCOM,qualcomm
4,08-01-2024,136.990005,139.149994,136.639999,139.029999,137.762802,7729400,Bullish,QCOM,qualcomm


In [94]:
# Parse both date columns so they can be compared chronologically, and sort the
# price history so that "first row on or after a date" is well defined.
df4['Date'] = pd.to_datetime(df4['Date'], format='%d-%m-%Y')
df4 = df4.sort_values(by='Date').reset_index(drop=True)

df_filtered['date'] = pd.to_datetime(df_filtered['date'], format='%d-%m-%Y')

# Split the date-sorted price history once per ticker instead of re-filtering
# the whole frame for every headline.
prices_by_ticker = {name: frame for name, frame in df4.groupby('ticker')}

merged_data = []
for _, row in df_filtered.iterrows():
    ticker = row['ticker']
    date = row['date']

    ticker_prices = prices_by_ticker.get(ticker)
    if ticker_prices is None:
        continue  # No price history at all for this ticker

    # Because the prices are sorted by date, the first row on or after the
    # headline date is the exact match when one exists, and otherwise the next
    # available trading day.
    on_or_after = ticker_prices[ticker_prices['Date'] >= date]
    if on_or_after.empty:
        continue  # If no later date is available, skip this headline
    selected_row = on_or_after.iloc[0]

    # The headline keeps its own date; only the price information is borrowed.
    merged_data.append({
        'ticker': ticker,
        'date': date.strftime('%d-%m-%Y'),
        'title': row['title'],
        'Bullish/Bearish': selected_row['Bullish/Bearish'],
        'company_name': selected_row['company_name'],
        'Close' : selected_row['Close']
    })


df_merged = pd.DataFrame(merged_data)


print(df_merged)
print(df_merged['title'][0])

    ticker        date                                              title  \
0      AXP  03-06-2024  American Express (AXP) Stock Declines While Ma...   
1      AXP  04-06-2024  3 Blue-Chip Stocks You'll Regret Not Buying in...   
2      AXP  05-06-2024    The 3 Best Blue-Chip Stocks to Buy in June 2024   
3      AXP  06-06-2024  Hidden Treasures: 3 Dow Stocks That Deserve MU...   
4      AXP  09-06-2024  Baby boomers favorite credit card Amex is now ...   
..     ...         ...                                                ...   
187   QCOM  25-05-2024  Why Qualcomm Stock Could Still Be a Great Valu...   
188   QCOM  26-05-2024  10 Best AI Stocks to Buy for 2024 According to...   
189   QCOM  28-05-2024  If You Can Only Buy One Stock in May, It Bette...   
190   QCOM  29-05-2024  1 Soaring Artificial Intelligence (AI) Stock t...   
191   QCOM  30-05-2024  Is Taiwan Semiconductor (TSM) The Best AI Semi...   

    Bullish/Bearish      company_name       Close  
0           Bearish  am

In [71]:
def f(title):
    """Return the 'compound' entry of VADER's polarity scores for one headline."""
    return vader.polarity_scores(title)['compound']


# Score the headlines that survived the merge with the price data.
df_merged['compound'] = df_merged['title'].apply(f)

print(df_merged)

    ticker        date                                              title  \
0      AXP  03-06-2024  American Express (AXP) Stock Declines While Ma...   
1      AXP  04-06-2024  3 Blue-Chip Stocks You'll Regret Not Buying in...   
2      AXP  05-06-2024    The 3 Best Blue-Chip Stocks to Buy in June 2024   
3      AXP  06-06-2024  Hidden Treasures: 3 Dow Stocks That Deserve MU...   
4      AXP  09-06-2024  Baby boomers favorite credit card Amex is now ...   
..     ...         ...                                                ...   
187   QCOM  25-05-2024  Why Qualcomm Stock Could Still Be a Great Valu...   
188   QCOM  26-05-2024  10 Best AI Stocks to Buy for 2024 According to...   
189   QCOM  28-05-2024  If You Can Only Buy One Stock in May, It Bette...   
190   QCOM  29-05-2024  1 Soaring Artificial Intelligence (AI) Stock t...   
191   QCOM  30-05-2024  Is Taiwan Semiconductor (TSM) The Best AI Semi...   

    Bullish/Bearish      company_name  compound  
0           Bearish  amer

## Training the model

In [72]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



# Turn the 'Bullish'/'Bearish' strings into integer class labels (stored back in the frame).
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df_merged['Bullish/Bearish'])
df_merged['Bullish/Bearish'] = encoded_labels

# Features are the raw headline texts, targets the encoded labels.
X = df_merged['title'].values
y = df_merged['Bullish/Bearish'].values

# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [73]:

# Length every tokenised headline is padded/truncated to.
maxlen = 100

# The vocabulary is fitted on the training headlines only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Training split: text -> integer sequences -> fixed-length arrays
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_tokens, maxlen=maxlen)

# Test split, encoded with the same fitted tokenizer
X_test_tokens = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_tokens, maxlen=maxlen)


In [80]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout


# Embedding -> LSTM -> single sigmoid unit for the binary Bullish/Bearish target.
model = Sequential([
    # Keras embeddings are task-specific, whereas word2vec embeddings are general in nature.
    # input_dim equals the num_words value given to the tokenizer.
    Embedding(input_dim=5000, output_dim=128, input_length=maxlen),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# 20% of the training data is set aside for validation during fitting.
history = model.fit(
    X_train_pad,
    y_train,
    epochs=6,
    batch_size=32,
    validation_split=0.2,
)


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [81]:

from sklearn.metrics import accuracy_score

# Accuracy as reported by Keras on the held-out test split.
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=2)
print(f'Accuracy: {accuracy}')

# Cross-check with scikit-learn: threshold the sigmoid outputs at 0.5 to get class labels.
probabilities = model.predict(X_test_pad)
y_pred = (probabilities > 0.5).astype("int32")

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')



2/2 - 0s - loss: 0.6263 - accuracy: 0.7179 - 236ms/epoch - 118ms/step
Accuracy: 0.7179487347602844
Accuracy: 0.717948717948718
