# Web Scrapping the dataset

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from urllib.request import urlopen
from urllib.request import Request
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import numpy as np
from textblob import TextBlob
from nltk.corpus import stopwords
from collections import Counter
import warnings; warnings.simplefilter('ignore')
import nltk
import string
from nltk import ngrams
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer

In [2]:
n = 1000

In [3]:
tickers = ['TCS']

In [4]:
finviz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}

for ticker in tickers:
    url = finviz_url + ticker
    req = Request(url=url,headers={'user-agent': 'my-app/0.0.1'}) 
    resp = urlopen(req)    
    html = BeautifulSoup(resp, features="lxml")
    news_table = html.find(id='news-table')
    news_tables[ticker] = news_table

try:
    for ticker in tickers:
        df = news_tables[ticker]
        df_tr = df.findAll('tr')
    
        print ('\n')
        print ('Recent News Headlines for {}: '.format(ticker))
        
        for i, table_row in enumerate(df_tr):
            a_text = table_row.a.text
            td_text = table_row.td.text
            td_text = td_text.strip()
            print(a_text,'(',td_text,')')
            if i == n-1:
                break
except KeyError:
    pass



Recent News Headlines for TCS: 
The Container Store Group, Inc. Announces Fourth Quarter and Full Fiscal 2021 Earnings Conference Call ( May-03-22 04:05PM )
The Container Store Group, Inc.'s (NYSE:TCS) Stock Has Been Sliding But Fundamentals Look Strong: Is The Market Wrong? ( Apr-12-22 12:12PM )
Tata Consultancy Earnings Trail Estimates After Labor Crunch Boosts Costs ( Apr-11-22 08:18AM )
Payments Canada Partners with TCS to Evolve Payments with RTR ( Apr-05-22 02:45AM )
The Container Store Introduces New Loyalty Program ( Mar-31-22 08:30AM )
FAX Capital Corp. Reports Fourth Quarter and 2021 Results ( Mar-29-22 05:37PM )
Tecsys Takes Store Fulfillment to the Next Level with Omnichannel Store-as-Warehouse Functionality ( Mar-28-22 08:00AM )
Automation Takes Center Stage in Tecsys' New Omni WMS for Digital Commerce ( 08:00AM )
Automation Takes Center Stage in Tecsys' New Omni WMS for Digital Commerce ( 08:00AM )
3 Growing Retailers Trading for Less Than 10 Times Earnings ( Mar-15-22 

In [5]:
parsed_news = []
for file_name, news_table in news_tables.items():
    for x in news_table.findAll('tr'):
        text = x.a.get_text() 
        date_scrape = x.td.text.split()

        if len(date_scrape) == 1:
            time = date_scrape[0]
            
        else:
            date = date_scrape[0]
            time = date_scrape[1]

        ticker = file_name.split('_')[0]
        
        parsed_news.append([ticker, date, time, text])

In [6]:
analyzer = SentimentIntensityAnalyzer()

columns = ['Ticker', 'Date', 'Time', 'Headline']
news = pd.DataFrame(parsed_news, columns=columns)
#scores = news['Headline'].apply(analyzer.polarity_scores).tolist()

#df_scores = pd.DataFrame(scores)
#news = news.join(df_scores, rsuffix='_right')

In [7]:
news.head()

Unnamed: 0,Ticker,Date,Time,Headline
0,TCS,May-03-22,04:05PM,"The Container Store Group, Inc. Announces Four..."
1,TCS,Apr-12-22,12:12PM,"The Container Store Group, Inc.'s (NYSE:TCS) S..."
2,TCS,Apr-11-22,08:18AM,Tata Consultancy Earnings Trail Estimates Afte...
3,TCS,Apr-05-22,02:45AM,Payments Canada Partners with TCS to Evolve Pa...
4,TCS,Mar-31-22,08:30AM,The Container Store Introduces New Loyalty Pro...


In [8]:
news.to_csv('data.csv')

In [9]:
df = pd.read_csv('data.csv')

In [10]:
def review_clean(review): 
    # changing to lower case
    lower = review.str.lower()
    
    # Replacing the repeating pattern of &#039;
    pattern_remove = lower.str.replace("&#039;", "")
    
    # Removing all the special Characters
    special_remove = pattern_remove.str.replace(r'[^\w\d\s]',' ')
    
    # Removing all the non ASCII characters
    ascii_remove = special_remove.str.replace(r'[^\x00-\x7F]+',' ')
    
    # Removing the leading and trailing Whitespaces
    whitespace_remove = ascii_remove.str.replace(r'^\s+|\s+?$','')
    
    # Replacing multiple Spaces with Single Space
    multiw_remove = whitespace_remove.str.replace(r'\s+',' ')
    
    # Replacing Two or more dots with one
    dataframe = multiw_remove.str.replace(r'\.{2,}', ' ')
    
    return dataframe

In [11]:
df['Text'] = review_clean(df['Headline'])

In [12]:
stop_words = set(stopwords.words('english'))
df['Text'] = df['Text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

In [13]:
Snow_ball = SnowballStemmer("english")
df['Text'] = df['Text'].apply(lambda x: " ".join(Snow_ball.stem(word) for word in x.split()))

In [14]:
df.head()

Unnamed: 0.1,Unnamed: 0,Ticker,Date,Time,Headline,Text
0,0,TCS,May-03-22,04:05PM,"The Container Store Group, Inc. Announces Four...",contain store group inc announc fourth quarter...
1,1,TCS,Apr-12-22,12:12PM,"The Container Store Group, Inc.'s (NYSE:TCS) S...",contain store group inc nyse tcs stock slide f...
2,2,TCS,Apr-11-22,08:18AM,Tata Consultancy Earnings Trail Estimates Afte...,tata consult earn trail estim labor crunch boo...
3,3,TCS,Apr-05-22,02:45AM,Payments Canada Partners with TCS to Evolve Pa...,payment canada partner tcs evolv payment rtr
4,4,TCS,Mar-31-22,08:30AM,The Container Store Introduces New Loyalty Pro...,contain store introduc new loyalti program


In [15]:
def sentiment(review):
    # Sentiment polarity of the reviews
    pol = []
    for i in review:
        analysis = TextBlob(i)
        pol.append(analysis.sentiment.polarity)
    return pol

In [16]:
df['sentiment'] = sentiment(df['Headline'])

In [17]:
df['sentiment_clean'] = sentiment(df['Text'])

In [18]:
df.head()

Unnamed: 0.1,Unnamed: 0,Ticker,Date,Time,Headline,Text,sentiment,sentiment_clean
0,0,TCS,May-03-22,04:05PM,"The Container Store Group, Inc. Announces Four...",contain store group inc announc fourth quarter...,0.175,0.175
1,1,TCS,Apr-12-22,12:12PM,"The Container Store Group, Inc.'s (NYSE:TCS) S...",contain store group inc nyse tcs stock slide f...,-0.033333,-0.033333
2,2,TCS,Apr-11-22,08:18AM,Tata Consultancy Earnings Trail Estimates Afte...,tata consult earn trail estim labor crunch boo...,0.0,0.0
3,3,TCS,Apr-05-22,02:45AM,Payments Canada Partners with TCS to Evolve Pa...,payment canada partner tcs evolv payment rtr,0.0,0.0
4,4,TCS,Mar-31-22,08:30AM,The Container Store Introduces New Loyalty Pro...,contain store introduc new loyalti program,0.136364,0.136364


In [19]:
df.loc[(df['sentiment_clean'] >= 0.05), 'real_sentiment'] = 1
df.loc[(df['sentiment_clean'] < 0.05), 'real_sentiment'] = 0

In [20]:
df.head()

Unnamed: 0.1,Unnamed: 0,Ticker,Date,Time,Headline,Text,sentiment,sentiment_clean,real_sentiment
0,0,TCS,May-03-22,04:05PM,"The Container Store Group, Inc. Announces Four...",contain store group inc announc fourth quarter...,0.175,0.175,1.0
1,1,TCS,Apr-12-22,12:12PM,"The Container Store Group, Inc.'s (NYSE:TCS) S...",contain store group inc nyse tcs stock slide f...,-0.033333,-0.033333,0.0
2,2,TCS,Apr-11-22,08:18AM,Tata Consultancy Earnings Trail Estimates Afte...,tata consult earn trail estim labor crunch boo...,0.0,0.0,0.0
3,3,TCS,Apr-05-22,02:45AM,Payments Canada Partners with TCS to Evolve Pa...,payment canada partner tcs evolv payment rtr,0.0,0.0,0.0
4,4,TCS,Mar-31-22,08:30AM,The Container Store Introduces New Loyalty Pro...,contain store introduc new loyalti program,0.136364,0.136364,1.0


In [32]:
df.to_csv('data1.csv')

In [21]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size = 0.25, random_state = 0)

# checking the shape 
print("Shape of train:", df_train.shape)
print("Shape of test: ", df_test.shape)

Shape of train: (75, 9)
Shape of test:  (25, 9)


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

cv = CountVectorizer(max_features = 20000, ngram_range = (4, 4))
pipeline = Pipeline([('vect',cv)])

df_train_features = pipeline.fit_transform(df_train['Text'])
df_test_features = pipeline.fit_transform(df_test['Text'])

print("df_train_features :", df_train_features.shape)
print("df_test_features :", df_test_features.shape)

df_train_features : (75, 389)
df_test_features : (25, 138)


In [23]:
y_train = df_train['real_sentiment']
y_test = df_test['real_sentiment']
solution = y_test.copy()

## Normal CNN

In [24]:
import keras 
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Bidirectional
from keras.layers import BatchNormalization
from keras.layers import LSTM
from keras.preprocessing.sequence import pad_sequences

In [28]:
model = keras.models.Sequential()

model.add(keras.layers.Dense(200, input_shape=(389,)))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation('relu'))
model.add(keras.layers.Dropout(0.5))

model.add(keras.layers.Dense(300))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation('relu'))
model.add(keras.layers.Dropout(0.5))

model.add(keras.layers.Dense(100, activation = 'relu'))
model.add(keras.layers.Dense(1, activation = 'sigmoid'))

#  Model compile
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [26]:
df_train_features = df_train_features.toarray()

In [30]:
hist = model.fit(df_train_features, y_train, epochs=15, batch_size=64)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
