### Importing required libraries

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_curve

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer #word stemmer class
lemma = WordNetLemmatizer()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Bidirectional, LSTM, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding

In [5]:
# Directory that holds the train/test CSVs and the GloVe vectors.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
my_data = r"C:\Users\Danish\Desktop\ZS_associates\dataset"

In [11]:
# Load the labelled training data.
# NOTE(review): the directory is hard-coded again here; `my_data` already holds it.
train = pd.read_csv('C:/Users/Danish/Desktop/ZS_associates/dataset/train_file.csv')

In [12]:
# Load the unlabelled test data (same columns as train minus the two sentiment targets).
# NOTE(review): the directory is hard-coded again here; `my_data` already holds it.
test = pd.read_csv('C:/Users/Danish/Desktop/ZS_associates/dataset/test_file.csv')

In [13]:
# Preview the first five rows of the training data
train.head()

Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline
0,Tr3CMgRv1N,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,USA TODAY,obama,2002-04-02 00:00:00,-1,-1,-1,0.0,-0.0533
1,Wc81vGp8qZ,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit...",Bloomberg,economy,2008-09-20 00:00:00,-1,-1,-1,0.208333,-0.156386
2,zNGH03CrZH,Nouriel Roubini: Global Economy Not Back to 2008,"Nouriel Roubini, NYU professor and chairman at...",Bloomberg,economy,2012-01-28 00:00:00,-1,-1,-1,-0.42521,0.139754
3,3sM1H0W8ts,Finland GDP Expands In Q4,Finland's economy expanded marginally in the t...,RTT News,economy,2015-03-01 00:06:00,-1,-1,-1,0.0,0.026064
4,wUbnxgvqaZ,"Tourism, govt spending buoys Thai economy in J...",Tourism and public spending continued to boost...,The Nation - Thailand&#39;s English news,economy,2015-03-01 00:11:00,-1,-1,-1,0.0,0.141084


### Checking for missing values

In [14]:
# Null count per training column, reshaped into a two-column frame:
# 'index' carries the column name and 0 carries the number of nulls
null_counts = train.isnull().sum()
missing_val = pd.DataFrame(null_counts).reset_index()
missing_val

Unnamed: 0,index,0
0,IDLink,0
1,Title,0
2,Headline,0
3,Source,175
4,Topic,0
5,PublishDate,0
6,Facebook,0
7,GooglePlus,0
8,LinkedIn,0
9,SentimentTitle,0


No column has missing values except `Source`, which has 175 missing values.

In [19]:
# Null count per column of the test data
test.isnull().sum()

IDLink           0
Title            0
Headline         0
Source         101
Topic            0
PublishDate      0
Facebook         0
GooglePlus       0
LinkedIn         0
dtype: int64

In [16]:
# Dtypes, non-null counts and memory usage of the training data
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55932 entries, 0 to 55931
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   IDLink             55932 non-null  object 
 1   Title              55932 non-null  object 
 2   Headline           55932 non-null  object 
 3   Source             55757 non-null  object 
 4   Topic              55932 non-null  object 
 5   PublishDate        55932 non-null  object 
 6   Facebook           55932 non-null  int64  
 7   GooglePlus         55932 non-null  int64  
 8   LinkedIn           55932 non-null  int64  
 9   SentimentTitle     55932 non-null  float64
 10  SentimentHeadline  55932 non-null  float64
dtypes: float64(2), int64(3), object(6)
memory usage: 4.7+ MB


In [17]:
# Dtypes, non-null counts and memory usage of the test data
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37288 entries, 0 to 37287
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   IDLink       37288 non-null  object
 1   Title        37288 non-null  object
 2   Headline     37288 non-null  object
 3   Source       37187 non-null  object
 4   Topic        37288 non-null  object
 5   PublishDate  37288 non-null  object
 6   Facebook     37288 non-null  int64 
 7   GooglePlus   37288 non-null  int64 
 8   LinkedIn     37288 non-null  int64 
dtypes: int64(3), object(6)
memory usage: 2.6+ MB


In [18]:
# (rows, columns) of the training data
train.shape

(55932, 11)

## Splitting the training data into inputs (X) and targets (y)

In [21]:
# Title texts and their sentiment targets; the list-style column selection
# keeps each target as a 2-D, single-column array
X_train_title = train['Title'].values
y_train_title = train[['SentimentTitle']].values

# Headline texts and their sentiment targets
X_train_headline = train['Headline'].values
y_train_headline = train[['SentimentHeadline']].values

In [22]:
# Raw title and headline texts of the test data
X_test_title = test['Title'].values
X_test_headline = test['Headline'].values

In [23]:
# Title frame: raw text plus its sentiment target
title_df = pd.DataFrame({'X_train_title': X_train_title})
title_df['y_train_title'] = y_train_title

# Headline frame: raw text plus its sentiment target
headline_df = pd.DataFrame({'X_train_headline': X_train_headline})
headline_df['y_train_headline'] = y_train_headline

# Test frame: both raw text columns, no targets
test_df = pd.DataFrame({
    'X_test_title': X_test_title,
    'X_test_headline': X_test_headline,
})

In [24]:
# Cache so the NLTK stop-word list is read and turned into a set only once
# instead of once per word.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a set, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = set(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


# defining a function for preprocessing
def preprocess_text(texts, stop_words=None):
    """Normalise one text string for tokenisation.

    Steps, in order: lower-case the text, replace every run of non-ASCII
    characters with a single space, split on whitespace, drop tokens that
    start with '@', drop stop words, and re-join the rest with single spaces.

    Parameters
    ----------
    texts : str
        The raw text of one title or headline.
    stop_words : collection of str, optional
        Words to remove. Defaults to the NLTK English stop-word list, which
        is loaded lazily on the first call and then reused.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing is left.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    texts = texts.lower()
    texts = re.sub(r'[^\x00-\x7F]+', ' ', texts)
    # str.split() never yields empty tokens, so startswith is safe here
    words = [
        word for word in texts.split()
        if not word.startswith('@') and word not in stop_words
    ]
    return " ".join(words)

In [26]:
# Fetch the NLTK stop-word corpus that preprocess_text reads
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Danish\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [27]:
# Replace the raw training titles with their cleaned form, then preview three rows
cleaned_titles = title_df['X_train_title'].apply(preprocess_text)
title_df['X_train_title'] = cleaned_titles
display(title_df.head(3))

Unnamed: 0,X_train_title,y_train_title
0,obama lays wreath arlington national cemetery,0.0
1,look health chinese economy,0.208333
2,nouriel roubini: global economy back 2008,-0.42521


In [28]:
# Replace the raw training headlines with their cleaned form, then preview
cleaned_headlines = headline_df['X_train_headline'].apply(preprocess_text)
headline_df['X_train_headline'] = cleaned_headlines
display(headline_df.head())

Unnamed: 0,X_train_headline,y_train_headline
0,obama lays wreath arlington national cemetery....,-0.0533
1,"tim haywood, investment director business-unit...",-0.156386
2,"nouriel roubini, nyu professor chairman roubin...",0.139754
3,finland's economy expanded marginally three mo...,0.026064
4,tourism public spending continued boost econom...,0.141084


In [29]:
# Clean both test text columns with the same preprocessing as the training data
for text_column in ('X_test_title', 'X_test_headline'):
    test_df[text_column] = test_df[text_column].apply(preprocess_text)
display(test_df.head())

Unnamed: 0,X_test_title,X_test_headline
0,sliding economy: fg fights back n3trn tsa funds,2016 budget passed national assembly n3trillio...
1,microsoft shows hololens bring distant family ...,recent microsoft research video shows $3000 au...
2,"microsoft twitter robot praises hitler, trump ...","* microsoft teamed bing create taytweets, acco..."
3,flood central bank moves can't get world econo...,central bankers managed steer world economy cl...
4,usd/jpy: bears lining mixed u.s. economy outlook,"however, streak seven-day gains might end mark..."


In [42]:
# Load the pre-trained GloVe vectors into {word: float32 vector}.
# Each line of the file is "<word> <v1> <v2> ...".
embeddings_index = dict()
# `with` guarantees the file is closed even if a line fails to parse
with open('C:/Users/Danish/Desktop/ZS_associates/dataset/glove.6b.50d.txt','r',encoding = 'utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [43]:
# Tokenise the cleaned training titles.
# The padding length is the largest whitespace-separated word count.
# NOTE(review): Tokenizer applies its own character filtering, so an encoded
# sequence may not have the same length as this whitespace count — confirm
# pad_sequences is not truncating any title.
max_len_title = title_df['X_train_title'].apply(lambda text: len(text.split())).max()

tok_title = Tokenizer()
tok_title.fit_on_texts(title_df['X_train_title'])
encoded_title = tok_title.texts_to_sequences(title_df['X_train_title'])
padded_title = pad_sequences(encoded_title, maxlen=max_len_title, padding='post')

# One more than the number of known words
vocab_size_title = len(tok_title.word_index) + 1

In [44]:
# Row i receives the GloVe vector of the title word whose tokenizer index is i;
# rows of words that GloVe does not know stay all-zero
title_embedding_matrix = np.zeros((vocab_size_title, 50))
for token, row in tok_title.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    title_embedding_matrix[row] = glove_vector

In [45]:
# Tokenise the cleaned training headlines.
# The padding length is the largest whitespace-separated word count.
# NOTE(review): Tokenizer applies its own character filtering, so an encoded
# sequence may not have the same length as this whitespace count — confirm
# pad_sequences is not truncating any headline.
max_len_headline = headline_df['X_train_headline'].apply(lambda text: len(text.split())).max()

tok_headline = Tokenizer()
tok_headline.fit_on_texts(headline_df['X_train_headline'])
encoded_headline = tok_headline.texts_to_sequences(headline_df['X_train_headline'])
padded_headline = pad_sequences(encoded_headline, maxlen=max_len_headline, padding='post')

# One more than the number of known words
vocab_size_headline = len(tok_headline.word_index) + 1

In [46]:
# Row i receives the GloVe vector of the headline word whose tokenizer index is i;
# rows of words that GloVe does not know stay all-zero
headline_embedding_matrix = np.zeros((vocab_size_headline, 50))
for token, row in tok_headline.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    headline_embedding_matrix[row] = glove_vector

In [103]:
# Encode the test texts with the tokenizers fitted on the TRAINING data.
# The embedding matrices and both models are indexed by the word ids of
# tok_title / tok_headline. A tokenizer fitted on the test set would assign
# unrelated ids to the same words and could produce ids beyond the embedding
# vocabulary. Test words the training tokenizer has never seen are dropped
# by texts_to_sequences.

# tokenizing test_title
test_max_len_title = test_df.X_test_title.apply(lambda x: len(x.split())).max()

test_tok_title = tok_title
test_vocab_size_title = vocab_size_title
test_encoded_title = test_tok_title.texts_to_sequences(test_df.X_test_title)
# Pad/truncate to the sequence length the title model was built with;
# truncating='post' keeps the leading words of an over-long title
test_padded_title = pad_sequences(test_encoded_title, maxlen=max_len_title,
                                  padding='post', truncating='post')

test_max_len_headline = test_df.X_test_headline.apply(lambda x: len(x.split())).max()

# tokenizing test_headline
test_tok_headline = tok_headline
test_vocab_size_headline = vocab_size_headline
test_encoded_headline = test_tok_headline.texts_to_sequences(test_df.X_test_headline)
# The width is kept at the test maximum here; a later cell brings this array
# to the headline model's input width before predicting
test_padded_headline = pad_sequences(test_encoded_headline, maxlen=test_max_len_headline, padding='post')

In [104]:
# (rows, padded sequence length) of the encoded test headlines
test_padded_headline.shape

(37288, 53)

## Randomly splitting the training data into training and validation sets

In [48]:
# Hold out 10% of the rows as a validation set for each model.
# A fixed random_state makes the split, and therefore the validation score,
# reproducible across kernel restarts.
x_train_title, x_valid_title, Y_train_title, y_valid_title = train_test_split(padded_title, y_train_title,
                                                                              shuffle = True, test_size = 0.1,
                                                                              random_state = 42)
x_train_headline, x_valid_headline, Y_train_headline, y_valid_headline = train_test_split(padded_headline, y_train_headline,
                                                                                          shuffle = True, test_size = 0.1,
                                                                                          random_state = 42)

In [49]:
# Importing libraries
import math
from math import exp
from keras import backend as K

In [50]:
# Scaled hyperbolic tangent, used as the output activation of both models
def mod_tanh(x):
    """Return tanh(0.6 * x), computed with the Keras backend."""
    scaled = 0.6 * x
    return K.tanh(scaled)

 **Title Model**

In [51]:
# Title sentiment regressor: GloVe-initialised (trainable) embedding, three
# bidirectional LSTM stages each followed by dropout and batch normalisation,
# then a dense head ending in a single mod_tanh output unit
title_model = Sequential([
    Embedding(vocab_size_title, 50, input_length=max_len_title,
              weights=[title_embedding_matrix], trainable=True),
    Bidirectional(LSTM(20, return_sequences=True)),
    Dropout(0.3),
    BatchNormalization(),
    Bidirectional(LSTM(20, return_sequences=True)),
    Dropout(0.3),
    BatchNormalization(),
    Bidirectional(LSTM(20)),
    Dropout(0.3),
    BatchNormalization(),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation=mod_tanh),
])
title_model.compile(loss='mse', optimizer='adam', metrics=['mse', 'mae'])

**Headline Model**

In [52]:
# Headline sentiment regressor: GloVe-initialised (trainable) embedding, three
# bidirectional LSTM stages each followed by dropout and batch normalisation,
# then a dense head ending in a single mod_tanh output unit
headline_model = Sequential([
    Embedding(vocab_size_headline, 50, input_length=max_len_headline,
              weights=[headline_embedding_matrix], trainable=True),
    Bidirectional(LSTM(20, return_sequences=True)),
    Dropout(0.3),
    BatchNormalization(),
    Bidirectional(LSTM(20, return_sequences=True)),
    Dropout(0.3),
    BatchNormalization(),
    Bidirectional(LSTM(20)),
    Dropout(0.3),
    BatchNormalization(),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation=mod_tanh),
])
headline_model.compile(loss='mse', optimizer='adam', metrics=['mse', 'mae'])

In [53]:
import tensorflow as tf

In [54]:
# Train the title model for 9 epochs inside a GPU:0 device scope.
# NOTE(review): no validation data is passed to fit, so the held-out split is
# only evaluated after training.
with tf.device('/device:GPU:0'):
    title_model.fit(x_train_title, Y_train_title, epochs = 9)

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


In [72]:
# Train the headline model for 10 epochs inside a GPU:0 device scope.
# NOTE(review): no validation data is passed to fit, so the held-out split is
# only evaluated after training.
with tf.device('/device:GPU:0'):
    headline_model.fit(x_train_headline, Y_train_headline, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [73]:
# Title-model predictions on the held-out validation split
title_valid_pred = title_model.predict(x_valid_title)

# Headline-model predictions on the held-out validation split
headline_valid_pred = headline_model.predict(x_valid_headline)

In [74]:
# Save both trained models to 'title.h5' and 'headline.h5' (relative paths)
title_model.save('title.h5')

headline_model.save('headline.h5')

In [75]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Mean absolute error of each model on its validation split
mae_title = mean_absolute_error(y_valid_title, title_valid_pred)
mae_headline = mean_absolute_error(y_valid_headline, headline_valid_pred)

# Combined score: 1 minus the weighted MAE (40% title, 60% headline)
weighted_mae = (0.4 * mae_title) + (0.6 * mae_headline)
score = 1 - weighted_mae
print(f'The Evaluation metrics is: {round(score,3)}')

The Evaluation metrics is: 0.932


In [76]:
# Title sentiment predictions for the encoded test titles
pred_title=title_model.predict(test_padded_title)

In [77]:
import warnings                                                                                
# Silence all Python warnings from here on (both calls install an 'ignore' filter)
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [96]:
# NOTE(review): in top-to-bottom order pred_headline is only assigned in a later
# cell, so this line raises NameError on a fresh Restart & Run All.
pred_headline.shape

(37288, 1)

In [97]:
# (rows, outputs) of the title predictions
pred_title.shape

(37288, 1)

In [106]:
# A is a second name for the same array object (no copy is made)
A = test_padded_headline

In [107]:
# Bring the encoded test headlines to the exact width the headline model was
# built with (max_len_headline). Each row is handled on its own: surplus
# trailing columns are dropped and short rows are zero-padded, so every
# headline keeps its own leading tokens.
# The earlier approach overwrote column 0 of every row and then used the
# in-place ndarray.resize, which re-flows the flat buffer and therefore shifts
# tokens from one headline into the next.
D1, D2 = test_padded_headline.shape
test_padded_headline = pad_sequences(test_padded_headline, maxlen=max_len_headline,
                                     padding='post', truncating='post')
# keep A pointing at the array that is passed to the model
A = test_padded_headline
A.shape

(37288, 50)

In [108]:
# Headline sentiment predictions for the encoded test headlines
pred_headline=headline_model.predict(test_padded_headline)

In [120]:
# Assemble the submission frame: article id plus the two predicted sentiments
submission1 = pd.DataFrame({'IDLink': test['IDLink'].to_list()})
submission1['SentimentTitle'] = pred_title
submission1['SentimentHeadline'] = pred_headline

In [111]:
# Write the submission to the working directory. index=False keeps the file
# to the three submission columns instead of adding an unnamed index column.
submission1.to_csv('Submissions.csv', index=False)

In [116]:
# Write a copy of the submission next to the dataset. index=False keeps the
# file to the three submission columns instead of adding an unnamed index column.
submission1.to_csv(os.path.join('C:/Users/Danish/Desktop/ZS_associates/dataset','Submission.csv'), index=False)