In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

def scrape_news_by_date(company_name, start_date, end_date, timeout=10):
    """Collect Yahoo Finance headlines for a ticker, one request per calendar day.

    Parameters
    ----------
    company_name : str
        Ticker symbol interpolated into the quote URL (e.g. "TCS.NS").
    start_date, end_date : pandas.Timestamp or date-like
        Inclusive bounds of the range; one HTTP request is issued per day.
        An empty range (start after end) issues no requests.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up on that day.

    Returns
    -------
    list of dict
        One ``{'Date': <day>, 'Headline': <text>}`` record per matching
        ``<h3>`` element. Days whose request fails contribute no records.

    Notes
    -----
    NOTE(review): the saved notebook output shows identical headlines for
    consecutive dates, which suggests the page ignores the ``date`` query
    parameter -- confirm before relying on the per-day attribution.
    """
    base_url = f"https://finance.yahoo.com/quote/{company_name}/news"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    all_news = []
    for current_date in pd.date_range(start_date, end_date, freq="D"):
        # Send a plain YYYY-MM-DD value; interpolating the Timestamp directly
        # would put "2021-12-09 00:00:00" (with a space) into the query string.
        day = current_date.strftime("%Y-%m-%d")
        date_url = f"{base_url}?p={company_name}&date={day}"

        try:
            # Without a timeout a stalled connection would hang the whole run.
            response = requests.get(date_url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Keep what was collected so far instead of aborting a multi-year crawl.
            print(f"Failed to fetch news for {current_date}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch news for {current_date}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        for article in soup.find_all("h3", class_="Mb(5px)"):
            # Only the headline text is recorded; the link was never used, and
            # reading it crashed on headings that have no <a> child.
            all_news.append({'Date': current_date, 'Headline': article.text})

    return all_news


# Ticker whose news page is scraped
company_ticker = "TCS.NS"

# Inclusive date window handed to the scraper
start_date = pd.Timestamp('2021-12-09')
end_date = pd.Timestamp('2023-12-08')

# Fetch the headlines for every day in the window
news_data = scrape_news_by_date(company_ticker, start_date, end_date)

# Tabulate the records and persist them without the positional index
df = pd.DataFrame(news_data)
df.to_csv('tcs_news.csv', index=False)


In [None]:
!pip install demoji
import demoji
import re
import string
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
import attr
import nltk



In [None]:
# Helper for clean_text: supplies the POS tag the lemmatizer is given for each word
def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``.

    The word is tagged on its own with ``pos_tag``; the upper-cased first
    letter of that tag picks adjective (J), verb (V) or adverb (R). Every
    other tag, nouns included, falls back to ``wordnet.NOUN``.
    """
    initial = pos_tag([word])[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN

In [None]:
# Cleaning
def clean_text(text):
    """Normalise one headline into a space-joined string of lemmatised tokens.

    Steps, in order: lower-case, drop links, replace ``#``/``$`` with spaces,
    replace emojis with their descriptions, delete digits and ``-->``,
    tokenise with ``TweetTokenizer``, then keep tokens that are not stop
    words, not punctuation, longer than two characters and contain no ``.``,
    lemmatising each with the POS from ``get_wordnet_pos``.

    Parameters
    ----------
    text : str
        Raw headline. Any non-string value (e.g. ``None`` or the NaN pandas
        produces for an empty CSV cell) is treated as missing.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` for missing input.
    """
    # Missing values arrive as float NaN / None when the column is read from CSV.
    if not isinstance(text, str):
        return ""

    # Initialise the tokenizer
    tk = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    # Initialise the lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Negations are kept out of the stop-word set because dropping them flips the meaning.
    negative_verbs = {"shan't", 'shouldn', "shouldn't", 'wasn', 'weren', 'won', 'wouldn', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', "mustn't", 'needn', "needn't", "wouldn't", "won't", "weren't", "wasn't", "couldn", "not", "nor", "no", "mightn't", "isn't", "haven't", "hadn't", "hasn't", "didn't", "doesn't", "aren't", "don't", "couldn't", "never"}
    candidate_stop_words = stopwords.words('english') + ["i'll", "i'm", "should", "could"]
    # A set gives constant-time membership tests in the token filter below.
    stop_words = {word for word in candidate_stop_words if word not in negative_verbs}

    # Lower-casing the text
    lower_news = text.lower()
    # Removing links first, and only the link itself: the previous greedy
    # pattern also swallowed every word that followed a URL on the same line,
    # and stripping '#' beforehand split URL fragments into stray tokens.
    news = re.sub(r"https?://\S+", " ", lower_news)
    # Removing hashtag and cashtag symbols
    news = re.sub(r"[#$]", " ", news)
    # Translating emojis into their descriptions
    news = demoji.replace_with_desc(news)
    # Removing numerical values
    news = re.sub(r"[0-9]|-->", "", news)
    # Tokenize the text with the Twitter tokenizer
    tokens = tk.tokenize(news)
    # Keep the words that are not stop words or punctuation and are longer than 2 letters, then lemmatize them
    kept = [
        lemmatizer.lemmatize(word, get_wordnet_pos(word))
        for word in tokens
        if word not in stop_words
        and word not in string.punctuation
        and len(word) > 2
        and "." not in word
    ]
    # Return the tokens as one sentence
    return " ".join(kept)

In [None]:
# Read the scraped headlines back from the CSV file and preview the first rows
news = pd.read_csv("tcs_news.csv")
news.head()

Unnamed: 0,Date,Headline
0,2021-12-09,Tata Tech Adds Billions to Market Capitalizati...
1,2021-12-09,India's TCS to take $125 million hit to Q3 ear...
2,2021-12-10,Tata Tech Adds Billions to Market Capitalizati...
3,2021-12-10,India's TCS to take $125 million hit to Q3 ear...
4,2021-12-11,Tata Tech Adds Billions to Market Capitalizati...


In [None]:
import nltk

# Fetch the NLTK resources requested by this notebook: the stop-word list,
# the perceptron POS tagger and WordNet.
for resource in ('stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)
# Kept as the final expression so the cell still displays the download result.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Clean every headline, then write the enriched table to CleanedNews.csv
news['cleaned'] = news["Headline"].apply(clean_text)
news.to_csv("CleanedNews.csv", index=False)

In [None]:
# Reload the cleaned headlines from disk and preview the first rows.
# NOTE(review): with read_csv defaults an empty `cleaned` string is read back
# as NaN -- confirm the later sentiment step copes with missing text.
news = pd.read_csv("CleanedNews.csv")
news.head()

Unnamed: 0,Date,Headline,cleaned
0,2021-12-09,Tata Tech Adds Billions to Market Capitalizati...,tata tech add billion market capitalization in...
1,2021-12-09,India's TCS to take $125 million hit to Q3 ear...,india's tc take million hit earnings lawsuit
2,2021-12-10,Tata Tech Adds Billions to Market Capitalizati...,tata tech add billion market capitalization in...
3,2021-12-10,India's TCS to take $125 million hit to Q3 ear...,india's tc take million hit earnings lawsuit
4,2021-12-11,Tata Tech Adds Billions to Market Capitalizati...,tata tech add billion market capitalization in...


In [None]:
# Keep only the date and the cleaned text; the raw headline is dropped here
news = news[["Date", "cleaned"]]
news.head()

Unnamed: 0,Date,cleaned
0,2021-12-09,tata tech add billion market capitalization in...
1,2021-12-09,india's tc take million hit earnings lawsuit
2,2021-12-10,tata tech add billion market capitalization in...
3,2021-12-10,india's tc take million hit earnings lawsuit
4,2021-12-11,tata tech add billion market capitalization in...


In [None]:
# Add empty placeholder columns to the news table; a later cell assigns these columns
news = news.assign(compound='', negative='', neutral='', positive='')
news.head()

Unnamed: 0,Date,cleaned,compound,negative,neutral,positive
0,2021-12-09,tata tech add billion market capitalization in...,,,,
1,2021-12-09,india's tc take million hit earnings lawsuit,,,,
2,2021-12-10,tata tech add billion market capitalization in...,,,,
3,2021-12-10,india's tc take million hit earnings lawsuit,,,,
4,2021-12-11,tata tech add billion market capitalization in...,,,,


In [None]:
# Download the 'vader_lexicon' NLTK resource; SentimentIntensityAnalyzer from
# nltk.sentiment.vader is instantiated in a later cell.
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import unicodedata

# instantiating the Sentiment Analyzer
sid = SentimentIntensityAnalyzer()

# Score each headline exactly once (the analyzer returns all four values in
# one dict) instead of re-running it separately for every output column.
# Missing text is scored as "" because the analyzer cannot handle NaN floats;
# an empty `cleaned` string comes back from the CSV round-trip as NaN.
vader_scores = pd.DataFrame(
    news['cleaned'].fillna('').astype(str).map(sid.polarity_scores).tolist(),
    index=news.index,
    columns=['compound', 'neg', 'neu', 'pos'],
)

# calculating sentiment scores
news['compound'] = vader_scores['compound']
news['negative'] = vader_scores['neg']
news['neutral'] = vader_scores['neu']
news['positive'] = vader_scores['pos']


In [None]:
# Average the sentiment scores of all headlines that share a calendar day.
news['Date'] = pd.to_datetime(news['Date'])

# Aggregate only the numeric score columns: taking the mean of the whole frame
# also hits the text column `cleaned`, which triggers the warning echoed in
# this cell's output and raises a TypeError on pandas 2.x.
sentiment_cols = ['compound', 'negative', 'neutral', 'positive']

# Group on the date truncated to midnight. This replaces the old
# strftime("%m/%d/%y") round-trip, which dropped the century, and the
# deprecated `infer_datetime_format` argument.
news_df = (
    news.groupby(news['Date'].dt.normalize())[sentiment_cols]
    .mean()
    .reset_index()
)
news_df.head()

  news_df = news.groupby('Date').mean().reset_index()


Unnamed: 0,Date,compound,negative,neutral,positive
0,2021-12-09,-0.11315,0.1205,0.8795,0.0
1,2021-12-10,-0.11315,0.1205,0.8795,0.0
2,2021-12-11,-0.11315,0.1205,0.8795,0.0
3,2021-12-12,-0.11315,0.1205,0.8795,0.0
4,2021-12-13,-0.11315,0.1205,0.8795,0.0


In [None]:
# Make the date the index of the daily sentiment table
news_df = news_df.set_index("Date")

In [None]:
# Load the price table and index it by trading date.
# NOTE(review): "/content/..." is an absolute path -- confirm it exists in the
# environment this notebook is run in.
tcs_df = pd.read_csv("/content/tcs_df.csv")
# `infer_datetime_format` is deprecated in pandas 2.x (format inference is now
# the default behaviour), so the argument is dropped to avoid the warning.
tcs_df['Date'] = pd.to_datetime(tcs_df['Date'])
tcs_df = tcs_df.set_index("Date")
tcs_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-12-09,3613.0,3632.4,3567.0,3603.6,77990
2021-12-10,3585.0,3640.85,3575.1,3636.55,78298
2021-12-13,3647.0,3660.0,3603.0,3610.1,52114
2021-12-14,3601.0,3634.4,3576.15,3622.8,87750
2021-12-15,3610.0,3622.7,3558.6,3569.6,70817
2021-12-16,3580.0,3607.25,3570.5,3581.35,19480
2021-12-17,3601.0,3650.0,3573.15,3587.2,82484
2021-12-20,3570.1,3605.0,3509.85,3557.1,160727
2021-12-21,3585.0,3639.6,3568.05,3608.45,104021
2021-12-22,3612.0,3636.0,3605.4,3630.0,68470


In [None]:
# Inner-join the price table with the daily sentiment table on Date, so only
# dates present in both tables are kept, then preview the first rows.
# Both frames were given a "Date" index in earlier cells.
final_df = tcs_df.join(news_df,on='Date',how="inner")
final_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,compound,negative,neutral,positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-12-09,3613.0,3632.4,3567.0,3603.6,77990,-0.11315,0.1205,0.8795,0.0
2021-12-10,3585.0,3640.85,3575.1,3636.55,78298,-0.11315,0.1205,0.8795,0.0
2021-12-13,3647.0,3660.0,3603.0,3610.1,52114,-0.11315,0.1205,0.8795,0.0
2021-12-14,3601.0,3634.4,3576.15,3622.8,87750,-0.11315,0.1205,0.8795,0.0
2021-12-15,3610.0,3622.7,3558.6,3569.6,70817,-0.11315,0.1205,0.8795,0.0


In [None]:
# Move the index back into the table as an ordinary column
final_df = final_df.reset_index()

In [None]:
# Write the merged table to final_df.csv without the positional index
final_df.to_csv('final_df.csv', index=False)