In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from urllib.request import urlopen
from urllib.request import Request
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import numpy as np
from textblob import TextBlob
from nltk.corpus import stopwords
from collections import Counter
import warnings; warnings.simplefilter('ignore')
import nltk
import string
from nltk import ngrams
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer

In [4]:
df = pd.read_csv('data.csv')

In [5]:
def review_clean(review): 
    # changing to lower case
    lower = review.str.lower()
    
    # Replacing the repeating pattern of &#039;
    pattern_remove = lower.str.replace("&#039;", "")
    
    # Removing all the special Characters
    special_remove = pattern_remove.str.replace(r'[^\w\d\s]',' ')
    
    # Removing all the non ASCII characters
    ascii_remove = special_remove.str.replace(r'[^\x00-\x7F]+',' ')
    
    # Removing the leading and trailing Whitespaces
    whitespace_remove = ascii_remove.str.replace(r'^\s+|\s+?$','')
    
    # Replacing multiple Spaces with Single Space
    multiw_remove = whitespace_remove.str.replace(r'\s+',' ')
    
    # Replacing Two or more dots with one
    dataframe = multiw_remove.str.replace(r'\.{2,}', ' ')
    
    return dataframe

In [6]:
df['Text'] = review_clean(df['Headline'])

In [7]:
stop_words = set(stopwords.words('english'))
df['Text'] = df['Text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

In [8]:
Snow_ball = SnowballStemmer("english")
df['Text'] = df['Text'].apply(lambda x: " ".join(Snow_ball.stem(word) for word in x.split()))

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,Ticker,Date,Time,Headline,Text
0,0,TCS,May-03-22,04:05PM,"The Container Store Group, Inc. Announces Four...",contain store group inc announc fourth quarter...
1,1,TCS,Apr-12-22,12:12PM,"The Container Store Group, Inc.'s (NYSE:TCS) S...",contain store group inc nyse tcs stock slide f...
2,2,TCS,Apr-11-22,08:18AM,Tata Consultancy Earnings Trail Estimates Afte...,tata consult earn trail estim labor crunch boo...
3,3,TCS,Apr-05-22,02:45AM,Payments Canada Partners with TCS to Evolve Pa...,payment canada partner tcs evolv payment rtr
4,4,TCS,Mar-31-22,08:30AM,The Container Store Introduces New Loyalty Pro...,contain store introduc new loyalti program


In [10]:
def sentiment(review):
    # Sentiment polarity of the reviews
    pol = []
    for i in review:
        analysis = TextBlob(i)
        pol.append(analysis.sentiment.polarity)
    return pol

In [11]:
df['sentiment'] = sentiment(df['Headline'])

In [12]:
df['sentiment_clean'] = sentiment(df['Text'])

In [13]:
df.head()

Unnamed: 0.1,Unnamed: 0,Ticker,Date,Time,Headline,Text,sentiment,sentiment_clean
0,0,TCS,May-03-22,04:05PM,"The Container Store Group, Inc. Announces Four...",contain store group inc announc fourth quarter...,0.175,0.175
1,1,TCS,Apr-12-22,12:12PM,"The Container Store Group, Inc.'s (NYSE:TCS) S...",contain store group inc nyse tcs stock slide f...,-0.033333,-0.033333
2,2,TCS,Apr-11-22,08:18AM,Tata Consultancy Earnings Trail Estimates Afte...,tata consult earn trail estim labor crunch boo...,0.0,0.0
3,3,TCS,Apr-05-22,02:45AM,Payments Canada Partners with TCS to Evolve Pa...,payment canada partner tcs evolv payment rtr,0.0,0.0
4,4,TCS,Mar-31-22,08:30AM,The Container Store Introduces New Loyalty Pro...,contain store introduc new loyalti program,0.136364,0.136364


In [14]:
df.loc[(df['sentiment_clean'] >= 0.05), 'real_sentiment'] = 1
df.loc[(df['sentiment_clean'] < 0.05), 'real_sentiment'] = 0

In [15]:
df.head()

Unnamed: 0.1,Unnamed: 0,Ticker,Date,Time,Headline,Text,sentiment,sentiment_clean,real_sentiment
0,0,TCS,May-03-22,04:05PM,"The Container Store Group, Inc. Announces Four...",contain store group inc announc fourth quarter...,0.175,0.175,1.0
1,1,TCS,Apr-12-22,12:12PM,"The Container Store Group, Inc.'s (NYSE:TCS) S...",contain store group inc nyse tcs stock slide f...,-0.033333,-0.033333,0.0
2,2,TCS,Apr-11-22,08:18AM,Tata Consultancy Earnings Trail Estimates Afte...,tata consult earn trail estim labor crunch boo...,0.0,0.0,0.0
3,3,TCS,Apr-05-22,02:45AM,Payments Canada Partners with TCS to Evolve Pa...,payment canada partner tcs evolv payment rtr,0.0,0.0,0.0
4,4,TCS,Mar-31-22,08:30AM,The Container Store Introduces New Loyalty Pro...,contain store introduc new loyalti program,0.136364,0.136364,1.0


In [32]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D ,GRU, Dropout
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt

In [17]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['Text'].values)
X = tokenizer.texts_to_sequences(df['Text'].values)
X = pad_sequences(X)
print("X tokenized data = ", X[:5])

X tokenized data =  [[  0   0   0   0   0   2   1   3   4   8  31  11  38  12  13   7  14  17]
 [  0   0   0   0   0   2   1   3   4   9   5   6 126 127  62  39  23  63]
 [  0   0   0   0   0   0   0   0   0  64  65   7 128  24 129 130 131  66]
 [  0   0   0   0   0   0   0   0   0   0   0  40 132  32   5 133  40 134]
 [  0   0   0   0   0   0   0   0   0   0   0   0   2   1 135  15 136 137]]


In [23]:
 y = pd.get_dummies(df['real_sentiment'])

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# LSTM

In [25]:
lstm = Sequential()
lstm.add(Embedding(5000, 256, input_length=X.shape[1]))
lstm.add(SpatialDropout1D(0.4))
lstm.add(LSTM(256, return_sequences=True, dropout=0.3, recurrent_dropout=0.2))
lstm.add(LSTM(256, dropout=0.3, recurrent_dropout=0.2))
lstm.add(Dense(2, activation='softmax'))
lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 18, 256)           1280000   
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 18, 256)          0         
 lDropout1D)                                                     
                                                                 
 lstm_2 (LSTM)               (None, 18, 256)           525312    
                                                                 
 lstm_3 (LSTM)               (None, 256)               525312    
                                                                 
 dense_1 (Dense)             (None, 2)                 514       
                                                                 
Total params: 2,331,138
Trainable params: 2,331,138
Non-trainable params: 0
____________________________________________

In [28]:
batch_size=32
lstm.fit(X_train, y_train, epochs = 30,batch_size=batch_size, verbose = 'auto')

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x21dbf5454c0>

In [29]:
lstm.evaluate(X_test,y_test)



[1.7759454250335693, 0.75]

In [33]:
gru = Sequential()
gru.add(Embedding(5000, 256, input_length=X.shape[1]))
gru.add(Dropout(0.3))
gru.add(GRU(256, return_sequences=True, dropout=0.3, recurrent_dropout=0.2))
gru.add(GRU(256, dropout=0.3, recurrent_dropout=0.2))
gru.add(Dense(2, activation='softmax'))
gru.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
gru.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 18, 256)           1280000   
                                                                 
 dropout (Dropout)           (None, 18, 256)           0         
                                                                 
 gru (GRU)                   (None, 18, 256)           394752    
                                                                 
 gru_1 (GRU)                 (None, 256)               394752    
                                                                 
 dense_2 (Dense)             (None, 2)                 514       
                                                                 
Total params: 2,070,018
Trainable params: 2,070,018
Non-trainable params: 0
_________________________________________________________________


In [34]:
batch_size=32
gru.fit(X_train, y_train, epochs = 30,batch_size=batch_size, verbose = 'auto')

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x21dbf7677c0>

In [37]:
gru.evaluate(X_test,y_test)



[2.2905304431915283, 0.699999988079071]

In [45]:
predictions = gru.predict(X_test)

In [46]:
avg_neg = np.mean([prediction[0] for prediction in predictions])
avg_pos = np.mean([prediction[1] for prediction in predictions])
print(f"Average negative sentiment score = {avg_neg}\nAverage positive sentiment score = {avg_pos}")

Average negative sentiment score = 0.6489429473876953
Average positive sentiment score = 0.3510570526123047


In [47]:
text = "tata Consultancy Earnings Trail Estimates After Labor Crunch Boosts Costs "

In [48]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(text)
X1 = tokenizer.texts_to_sequences(text)
X1 = pad_sequences(X1)

In [49]:
predictions = gru.predict(X1)

In [50]:
avg_neg = np.mean([prediction[0] for prediction in predictions])
avg_pos = np.mean([prediction[1] for prediction in predictions])
print(f"Average negative sentiment score = {avg_neg}\nAverage positive sentiment score = {avg_pos}")

Average negative sentiment score = 0.5716809034347534
Average positive sentiment score = 0.42831921577453613


In [51]:
gru.save('gru.h5')