In [200]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import nltk
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import Word2Vec
from keras.preprocessing.text import one_hot,Tokenizer
from keras.utils import pad_sequences


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix

from tensorflow import keras
from tensorflow.keras import layers
from keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, Activation, Flatten


In [201]:
df=pd.read_csv('train_data.csv')

In [202]:
df.head()

Unnamed: 0,text,label
0,Here are Thursday's biggest analyst calls: App...,0
1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,"Analysts react to Tesla's latest earnings, bre...",0
4,Netflix and its peers are set for a ‘return to...,0


In [203]:
df['label'].value_counts()

2     3545
18    2118
14    1822
9     1557
5      987
16     985
1      837
19     823
7      624
6      524
15     501
17     495
12     487
13     471
4      359
3      321
0      255
8      166
10      69
11      44
Name: label, dtype: int64

In [204]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16990 entries, 0 to 16989
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16990 non-null  object
 1   label   16990 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 265.6+ KB


In [205]:
def cleansing(df):
    df_clean=df.str.lower()
    df_clean=[re.sub(r"\d+","",i )for i in df_clean]
    df_clean=[re.sub(r'[^\w]', ' ', i)for i in df_clean]
    df_clean=[re.sub(r'\s+',' ',i)for i in df_clean]
    return df_clean


In [206]:
df['label'].value_counts().sort_index()

0      255
1      837
2     3545
3      321
4      359
5      987
6      524
7      624
8      166
9     1557
10      69
11      44
12     487
13     471
14    1822
15     501
16     985
17     495
18    2118
19     823
Name: label, dtype: int64

In [207]:
df['clean_text']=cleansing(df['text'])

In [208]:
#check maximum length of word in sentence
max_sen = df['clean_text'].str.split().str.len().max()

In [209]:
max_sen

58

**Split the Data**

In [210]:
x_train, x_test, y_train, y_test = train_test_split(df['clean_text'], df['label'], test_size = 0.2, random_state = 42,stratify=df['label'])

In [211]:
print("Total size: ", len(df))
print("Train size: ", len(X_train))
print("Test size: ", len(X_test))

Total size:  16990
Train size:  13592
Test size:  3398


In [212]:
len(x_train)

13592

In [213]:
df.sample

<bound method NDFrame.sample of                                                     text  label  \
0      Here are Thursday's biggest analyst calls: App...      0   
1      Buy Las Vegas Sands as travel to Singapore bui...      0   
2      Piper Sandler downgrades DocuSign to sell, cit...      0   
3      Analysts react to Tesla's latest earnings, bre...      0   
4      Netflix and its peers are set for a ‘return to...      0   
...                                                  ...    ...   
16985  KfW credit line for Uniper could be raised to ...      3   
16986  KfW credit line for Uniper could be raised to ...      3   
16987  Russian  https://t.co/R0iPhyo5p7 sells 1 bln r...      3   
16988  Global ESG bond issuance posts H1 dip as supra...      3   
16989  Brazil's Petrobras says it signed a $1.25 bill...      3   

                                              clean_text  
0      here are thursday s biggest analyst calls appl...  
1      buy las vegas sands as travel to singa

###Preprocessing

###Tokenization

In [214]:
nltk.download('punkt')
word_token=[word_tokenize(i) for i in X_train]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Text Representation

In [215]:
model_skipgram = gensim.models.Word2Vec(word_token, min_count = 1,vector_size = 100, window = 5, sg=1)
model_skipgram.train(word_token,epochs=10,total_examples=len(word_token))



(2223662, 2903200)

In [216]:
vocabulary_skipgram = model_skipgram.wv.index_to_key
len(vocabulary_skipgram)

34590

In [217]:
word_vec_dict={}
for word in vocabulary_skipgram:
    word_vec_dict[word]=model_skipgram.wv.get_vector(word)
print("The no of key-value pairs : ",len(word_vec_dict)) # should come equal to vocab size


The no of key-value pairs :  34590


In [218]:
from keras.preprocessing.text import one_hot,Tokenizer
tok = Tokenizer()
tok.fit_on_texts(x_train)
vocab_size = len(tok.word_index) + 1
encd_rev = tok.texts_to_sequences(x_train)

In [219]:
max_sen_len= max_sen # max lenght of word in a sentence
vocab_size = len(tok.word_index) + 1 #ideally it should be len(tok.word_index) + 1  or total no of words in data in this case = 337, but to handle number of data which not appear in train, for example in test, make the size higher
embed_dim=100 # embedding dimension as choosen in word2vec constructor

In [220]:
from keras.preprocessing.sequence import pad_sequences
# now padding to have a maximum length of 48
pad_rev= pad_sequences(encd_rev, maxlen=max_sen_len, padding='post')
pad_rev.shape   # note that we had 40 data and we have padded each review to have  a lenght of 48 words.

(13592, 58)

In [221]:
# now creating the embedding matrix
embed_matrix=np.zeros(shape=(vocab_size,embed_dim))
for word,i in tok.word_index.items():
    embed_vector=word_vec_dict.get(word) #mapping the vector to word in our skipgram dictionary
    if embed_vector is not None:  # word is in the vocabulary learned by the w2v model
        embed_matrix[i]=embed_vector
  # if word is not found then embed_vector corressponding to that vector will stay zero.

### Modeling

In [222]:
model = keras.Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=embed_dim,input_length=max_sen_len,embeddings_initializer=Constant(embed_matrix)))
model.add(layers.LSTM(64, input_shape=(None, 28)))
model.add(layers.BatchNormalization())
model.add(layers.Dense(10))
print(model.summary())

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 58, 100)           3455400   
                                                                 
 lstm_6 (LSTM)               (None, 64)                42240     
                                                                 
 batch_normalization_6 (Bat  (None, 64)                256       
 chNormalization)                                                
                                                                 
 dense_6 (Dense)             (None, 10)                650       
                                                                 
Total params: 3498546 (13.35 MB)
Trainable params: 3498418 (13.35 MB)
Non-trainable params: 128 (512.00 Byte)
_________________________________________________________________
None


In [223]:
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer="sgd",
    metrics=["accuracy"],
)

In [224]:
model.fit(pad_rev, y_train, batch_size=2, epochs=2)

Epoch 1/2


InvalidArgumentError: ignored

### Tuned Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, BatchNormalization, Dropout, Dense
from tensorflow.keras.initializers import Constant

backend.clear_session()

modified_model = Sequential()
modified_model.add(Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_sen_len, embeddings_initializer=Constant(embed_matrix)))
modified_model.add(LSTM(128, dropout=0.3, recurrent_dropout=0.3))
modified_model.add(BatchNormalization())
modified_model.add(Dropout(0.3))
modified_model.add(Dense(20, activation='softmax'))

print(modified_model.summary())
