In [38]:
import pandas as pd
from tensorflow.keras.layers import Dense , Bidirectional , LSTM , Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re #regular expression
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
df =pd.read_csv('FA-KES-Dataset.csv' , encoding="latin1")
df.head()

Unnamed: 0,unit_id,article_title,article_content,source,date,location,labels
0,1914947530,Syria attack symptoms consistent with nerve ag...,Wed 05 Apr 2017 Syria attack symptoms consiste...,nna,4/5/2017,idlib,0
1,1914947532,Homs governor says U.S. attack caused deaths b...,Fri 07 Apr 2017 at 0914 Homs governor says U.S...,nna,4/7/2017,homs,0
2,1914947533,Death toll from Aleppo bomb attack at least 112,Sun 16 Apr 2017 Death toll from Aleppo bomb at...,nna,4/16/2017,aleppo,0
3,1914947534,Aleppo bomb blast kills six Syrian state TV,Wed 19 Apr 2017 Aleppo bomb blast kills six Sy...,nna,4/19/2017,aleppo,0
4,1914947535,29 Syria Rebels Dead in Fighting for Key Alepp...,Sun 10 Jul 2016 29 Syria Rebels Dead in Fighti...,nna,7/10/2016,aleppo,0


In [3]:
X=df['article_title']
y=df['labels']

In [4]:
nltk.download('stopwords')
nltk.download('wordnet') # for Lemmenitization purpose

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stopwords = set(stopwords.words('english'))  #  get English stopwords
lemi = WordNetLemmatizer()

In [7]:
corpus=[]
for text in X:
    review=re.sub('[^a-zA-Z]', ' ', text) # keep only alphabets
    review=review.lower().split()
    review=[lemi.lemmatize(word) for word in review if word not in stopwords]
    review=' '.join(review)
    corpus.append(review)
print(corpus)

['syria attack symptom consistent nerve agent use', 'homs governor say u attack caused death doesnt see big human loss', 'death toll aleppo bomb attack least', 'aleppo bomb blast kill six syrian state tv', 'syria rebel dead fighting key aleppo road', 'suicide bombing kill least northeast syria', 'dead heavy u raid syria stronghold', 'suicide bomber kill assad clan hometown', 'explosion rock town damascus', 'damascus explosion due rocket bomb', 'syrian regime step aerial assault douma', 'hizballah lead regime offensive southern syria', 'syrian opposition remains divided', 'video show murder syrian activist', 'syria nusra front stage deadly suicide bombing aleppo', 'regime troop thwart rebel attack syria aleppo', 'ahrar al sham leader killed syria', 'barrel bomb kill town syria', 'rebel advance north western syria', 'israeli strike syrian town kill pro regime fighter', 'syria army plane crash rebel held town', 'syrian regime revenge attack kill score qalamoun', 'chemical massacre idlib d

In [9]:
#Tokenization & Padding
voc_size = 10000  # vocabulary size
max_len = 100     # max sequence length
token=Tokenizer(num_words=voc_size , oov_token='<OOV>')
token.fit_on_texts(corpus)
sequences = token.texts_to_sequences(corpus)
X_embedded = pad_sequences(sequences , maxlen=max_len , padding='post')

In [10]:
#Train test split
X_train, X_temp, y_train, y_temp = train_test_split(X_embedded, y, test_size=0.3, random_state=23)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.3, random_state=23)


In [16]:
#Creating model
dim=100 #per words 100 features
model=Sequential()
model.add(Embedding(voc_size , dim , input_length=max_len)) #(max_len x dimension)
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1 , activation='sigmoid'))

model.compile(loss='binary_crossentropy' , optimizer='adam' , metrics=['accuracy'])
print(model.summary())


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 100)          1000000   
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               34048     
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,034,113
Trainable params: 1,034,113
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
early_stop = EarlyStopping(
    patience=2,
    monitor='val_accuracy',
    restore_best_weights=True
)

In [18]:
#Train model
model.fit(X_train , y_train ,validation_data=(X_val , y_val) , epochs=10 , batch_size=32 , callbacks=[early_stop])

Epoch 1/10
Epoch 2/10
Epoch 3/10


<keras.callbacks.History at 0x1f0cfb63a30>

In [19]:
#evaluate on test set
test_acc=model.evaluate(X_test , y_test)
print(test_acc)

[0.6867783665657043, 0.5890411138534546]


In [20]:
y.value_counts()

labels
1    426
0    378
Name: count, dtype: int64

In [28]:
y_pred_prob= model.predict(X_test)
y_pred=(y_pred_prob>0.5).astype(int)



In [32]:
df_compare=pd.DataFrame(
    {
        'Actual Data': y_test,
        'Predicted Data' : y_pred.flatten()
    }
)
print(df_compare.head(20))

     Actual Data  Predicted Data
674            1               1
153            1               1
248            1               1
596            0               1
197            0               1
564            0               1
661            0               1
387            1               1
364            0               1
167            0               1
284            0               1
131            1               1
134            0               1
650            1               1
213            0               1
135            1               1
574            1               1
743            1               1
794            0               1
679            0               1
