## Importing Libraries

In [4]:
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from bs4 import BeautifulSoup
from time import sleep
import requests
import urllib.request
from datetime import datetime
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from datetime import date, timedelta
import ast




## 1. Pulling News Headlines

News headlines are sourced from The Economic Times archives, accessible on their official website.

In [None]:
list_of_dates = []
start_date = datetime(2022, 6, 8)
end_date = datetime(2024, 6, 7)
for i in range(731):
    cur_date = start_date + timedelta(i)
    list_of_dates.append(cur_date)
list_of_dates

Defining a function to extract date and format the url with the date in order to automate the process of pulling

In [None]:
mega_dict = {}
def get_date_news(date, i):
    formatted_date = date.strftime('%Y-%m-%d')
    y = formatted_date[:4]
    m = formatted_date[5:7]
    d = formatted_date[8:]
    ctr = 44720 + i
    url = "https://economictimes.indiatimes.com/archivelist/year-%s,month-%s,starttime-%s.cms" % (y, m, ctr) # edits month, year and artice no. according to a sequence in order to automate the get requests later
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html')
    heads = soup.find_all('li')
    headlines = [headline.text.strip() for headline in heads]
    mega_dict[formatted_date] = headlines[:-11]   # ET also includes trending stories on each archive page, which will be removed by this to prevent overlap
    print("Done with", i, "th date.")


In [None]:
# The following iterates through every date in the past two year
for date in list_of_dates:
    get_date_news(date, list_of_dates.index(date))

news_frame = pd.DataFrame(list(mega_dict.items()), columns=['Date', 'Headlines'])
news_frame.to_csv('news_headlines.csv')

In [None]:
df = pd.read_csv("news_headlines.csv")

In [None]:
#when a list of strings is stored in a CSV file, it is converted into a string. ast.literal_eval converts the string back into a list
df['Headlines'] = df['Headlines'].apply(ast.literal_eval)

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess(fin_head):
    fin_head = re.sub(r'[^a-zA-Z\s]', '', fin_head)
    tokens = word_tokenize(fin_head.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [None]:
string = "Greetings, throughout heaven and earth I alone am the chosen one..." # testing the preprocess() function
preprocess(string)

In [None]:
processed_news = []
for datenews in df['Headlines']:
    news_of_the_day = []
    for headline in datenews:
        news = preprocess(headline)
        news_of_the_day.append(news)
    processed_news.append(news_of_the_day)
processed_news             

In [None]:
df['Processed_headlines'] = processed_news


In [None]:
df.to_csv("Final_news_data.csv")

## 2. Applying Sentiment Analyzer

In [5]:
#importing VADER analyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [6]:
def sentiment_scorer(news):
    analyzer = SentimentIntensityAnalyzer()
    sentiment_dict = analyzer.polarity_scores(news)
    return sentiment_dict['neg'], sentiment_dict['neu'], sentiment_dict['pos'], sentiment_dict['compound']

In [7]:
#read the processed CSV file
news_df = pd.read_csv("Final_news_data.csv")
news_df['Processed_headlines'] = news_df['Processed_headlines'].apply(ast.literal_eval)

In [8]:
news_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Date,Headlines,Processed_headlines
0,0,0,2022-06-08,"['Why waste waste if you can get wasted?', 'Ji...","[waste waste get wasted, jiobp give monthly re..."
1,1,1,2022-06-09,['Centre may express concern over narcotics sm...,[centre may express concern narcotic smuggling...
2,2,2,2022-06-10,"['Tea exporters buoyed by robust demand, expec...",[tea exporter buoyed robust demand expect mill...
3,3,3,2022-06-11,"[""Meta probing Sheryl Sandberg's use of compan...",[meta probing sheryl sandbergs use company res...
4,4,4,2022-06-12,['India’s driving the R&D wheel at Mercedes-Be...,"[india driving rd wheel mercedesbenz, rule wha..."
...,...,...,...,...,...
726,726,726,2024-06-03,"['What drove growth in FY24', 'BJP worker alle...","[drove growth fy, bjp worker allegedly beheade..."
727,727,727,2024-06-04,"['10 Best Mayonnaise of (2024) For quick, tast...","[best mayonnaise quick tasty meal, gen z help ..."
728,728,728,2024-06-05,['How will new moon in Gemini affect you? Peop...,[new moon gemini affect people zodiac sign may...
729,729,729,2024-06-06,['Nisaba Godrej quits VIP Board over differenc...,"[nisaba godrej quits vip board difference, mha..."


In [9]:
# Segregating the sentiment scores into negative, positive, neutral and compound scores
sentiment_scores_lst = []
for day in news_df['Processed_headlines']:
    neg_score_lst = []
    neu_score_lst = []
    pos_score_lst = []
    cpd_score_lst = []
    final_dict = {}
    for news in day:
        neg_score, neu_score, pos_score, cpd_score = sentiment_scorer(news)
        neg_score_lst.append(neg_score)
        neu_score_lst.append(neu_score)
        pos_score_lst.append(pos_score)
        cpd_score_lst.append(cpd_score)
    final_dict['neg'] = neg_score_lst
    final_dict['neu'] = neu_score_lst
    final_dict['pos'] = pos_score_lst
    final_dict['cpd'] = cpd_score_lst
    sentiment_scores_lst.append(final_dict)

In [17]:
# In order to get single value entries, I have taken mean score of each sentiment 
negative_Scores = []
neutral_Scores = []
positive_Scores = []
compound_Scores = []
for score in sentiment_scores_lst:
    neg_mean = sum(score['neg'])/len(score['neg'])
    neu_mean = sum(score['neu'])/len(score['neu'])
    pos_mean = sum(score['pos'])/len(score['pos'])
    cpd_mean = sum(score['cpd'])/len(score['cpd'])
    negative_Scores.append(neg_mean)
    neutral_Scores.append(neu_mean)
    positive_Scores.append(pos_mean)
    compound_Scores.append(cpd_mean)

In [22]:
news_df['Positive Scores'] = positive_Scores
news_df['Negative Scores'] = negative_Scores
news_df['Neutral Scores'] = neutral_Scores
news_df['Compound Scores'] = compound_Scores

In [23]:
news_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Date,Headlines,Processed_headlines,Positive Scores,Negative Scores,Neutral Scores,Compound Scores
0,0,0,2022-06-08,"['Why waste waste if you can get wasted?', 'Ji...","[waste waste get wasted, jiobp give monthly re...",0.110808,0.073270,0.815934,0.044268
1,1,1,2022-06-09,['Centre may express concern over narcotics sm...,[centre may express concern narcotic smuggling...,0.148782,0.076613,0.774605,0.093870
2,2,2,2022-06-10,"['Tea exporters buoyed by robust demand, expec...",[tea exporter buoyed robust demand expect mill...,0.130858,0.098172,0.770968,0.036928
3,3,3,2022-06-11,"[""Meta probing Sheryl Sandberg's use of compan...",[meta probing sheryl sandbergs use company res...,0.105064,0.102094,0.792850,-0.008721
4,4,4,2022-06-12,['India’s driving the R&D wheel at Mercedes-Be...,"[india driving rd wheel mercedesbenz, rule wha...",0.100937,0.124604,0.774458,-0.042125
...,...,...,...,...,...,...,...,...,...
726,726,726,2024-06-03,"['What drove growth in FY24', 'BJP worker alle...","[drove growth fy, bjp worker allegedly beheade...",0.140942,0.073648,0.785410,0.094385
727,727,727,2024-06-04,"['10 Best Mayonnaise of (2024) For quick, tast...","[best mayonnaise quick tasty meal, gen z help ...",0.167072,0.078026,0.754926,0.156232
728,728,728,2024-06-05,['How will new moon in Gemini affect you? Peop...,[new moon gemini affect people zodiac sign may...,0.162856,0.077198,0.759950,0.122053
729,729,729,2024-06-06,['Nisaba Godrej quits VIP Board over differenc...,"[nisaba godrej quits vip board difference, mha...",0.157706,0.077356,0.764926,0.100804


In [24]:
news_df.to_csv("final_sentiment_data.csv")