### Importing libraries

In [55]:
import numpy as np
import pandas as pd
import re
import nltk
import itertools
from sklearn import metrics
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Loyumba\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Preprocessing data

In [7]:
df = pd.read_csv("data.csv")

In [8]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \r\nAn Iranian woman has been sentenced ...,1


In [9]:
df.shape

(20800, 5)

In [10]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

As these are text data, I've decided to fill the null values with empty strings

In [11]:
df = df.fillna("")
df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

I've planned to combine the author and title for predictions

In [12]:
# merging the author name and news title
df['content'] = df['author']+' '+df['title']

In [13]:
df['content'].head()

0    Darrell Lucus House Dem Aide: We Didn’t Even S...
1    Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2    Consortiumnews.com Why the Truth Might Get You...
3    Jessica Purkiss 15 Civilians Killed In Single ...
4    Howard Portnoy Iranian woman jailed for fictio...
Name: content, dtype: object

### Seperating data and label

In [14]:
# getting the independent features
X = df.drop('label', axis=1)
# getting the dependent feature
y = df['label']

print(X.shape)
print(y.shape)

(20800, 5)
(20800,)


In [15]:
messages=X.copy()
messages['content'][1]

'Daniel J. Flynn FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'

In [16]:
messages.head()

Unnamed: 0,id,title,author,text,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \r\nAn Iranian woman has been sentenced ...,Howard Portnoy Iranian woman jailed for fictio...


### Preprocessing and stemming

Stemming is an algorithm in linguistic normalization where each words are reduced to its root word.

Example:
(i) History, Historical => Histori
(ii) Eat, eating, eaten => Eat

Note: Some of the root word loses its meaning as seen in example (i). This shouldn't be an issue for news classification, but in cases where the word meaning needs to be retained use lemmetization

In [17]:
# Dataset Preprocessing
ps = PorterStemmer()
corpus = []
for i in range(len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['content'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [18]:
# previewing the first 10 sentences reduced to its root word
corpus[1:10]

['daniel j flynn flynn hillari clinton big woman campu breitbart',
 'consortiumnew com truth might get fire',
 'jessica purkiss civilian kill singl us airstrik identifi',
 'howard portnoy iranian woman jail fiction unpublish stori woman stone death adulteri',
 'daniel nussbaum jacki mason hollywood would love trump bomb north korea lack tran bathroom exclus video breitbart',
 'life life luxuri elton john favorit shark pictur stare long transcontinent flight',
 'alissa j rubin beno hamon win french socialist parti presidenti nomin new york time',
 'excerpt draft script donald trump q ampa black church pastor new york time',
 'megan twohey scott shane back channel plan ukrain russia courtesi trump associ new york time']

### One hot encodding
I've decided to try bidirectional LSTM by one hot encoding and other machine learning algorithms by TF-IDF vectorising


In [19]:
## one hot representation
voc_size=10000
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr[1:3]

[[1790, 4720, 6909, 6909, 3986, 8722, 7467, 7010, 448, 9865],
 [1803, 372, 5288, 1382, 5324, 7391]]

In [22]:
print(f"The first sentence after stemming looks like: {corpus[0]}")
print(f"The same sentence after one hot encoding looks like: {onehot_repr[0]}")

The first sentence after stemming looks like: darrel lucu hous dem aid even see comey letter jason chaffetz tweet
The same sentence after one hot encoding looks like: [2270, 4972, 6139, 1888, 2950, 12, 4590, 1752, 7934, 3968, 7197, 6072]


### Embedding representation

Padding sentence to make them of the same size. We can use 'pre' or 'post' padding, but using one instead of the other didn't make any significant improvements to the model

In [23]:
sent_length=25
embedded_docs=pad_sequences(onehot_repr,padding='post',maxlen=sent_length)
print(embedded_docs)

[[2270 4972 6139 ...    0    0    0]
 [1790 4720 6909 ...    0    0    0]
 [1803  372 5288 ...    0    0    0]
 ...
 [9221 4720 5772 ...    0    0    0]
 [7394 1745 4721 ...    0    0    0]
 [ 131 7813 8586 ...    0    0    0]]


### Creating model

In [24]:
## creating model
embedded_vector_features=40#number of features
model=Sequential()
model.add(Embedding(voc_size,embedded_vector_features,input_length=sent_length))
model.add(Bidirectional(LSTM(100)))#number of nuerons
model.add(Dense(1,activation='sigmoid'))# as it is binary classifier
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 40)            400000    
                                                                 
 bidirectional (Bidirectiona  (None, 200)              112800    
 l)                                                              
                                                                 
 dense (Dense)               (None, 1)                 201       
                                                                 
Total params: 513,001
Trainable params: 513,001
Non-trainable params: 0
_________________________________________________________________
None


In [25]:
# converting into numpy array
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [26]:
X_final.shape,y_final.shape

((20800, 25), (20800,))

Splitting into training and test data

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.3, random_state=33)

In [32]:
##model training
model.fit(X_train, y_train, validation_data=(X_test, y_test),epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f2087f9330>

In [33]:
y_pred=model.predict(X_test)



In [34]:
y_pred=np.where(y_pred>=0.5,1,0)

In [35]:
conf_mat = confusion_matrix(y_test,y_pred)
acc = accuracy_score(y_test,y_pred)
class_rep = classification_report(y_test,y_pred)

print(conf_mat)
print(f'Accuracy is {acc}')
print(class_rep)

[[3096   14]
 [  88 3042]]
Accuracy is 0.9836538461538461
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      3110
           1       1.00      0.97      0.98      3130

    accuracy                           0.98      6240
   macro avg       0.98      0.98      0.98      6240
weighted avg       0.98      0.98      0.98      6240



### Vectorizing the data

Now we try vectorizing the data and using machine learning models

TF-IDF measures the importance of words in a document
- Term Frequency is the number of times the word appears in the document.
- Inverse Document Frequency measures the rarity of the word in the document.

In [37]:
corpus[1:10]

['daniel j flynn flynn hillari clinton big woman campu breitbart',
 'consortiumnew com truth might get fire',
 'jessica purkiss civilian kill singl us airstrik identifi',
 'howard portnoy iranian woman jail fiction unpublish stori woman stone death adulteri',
 'daniel nussbaum jacki mason hollywood would love trump bomb north korea lack tran bathroom exclus video breitbart',
 'life life luxuri elton john favorit shark pictur stare long transcontinent flight',
 'alissa j rubin beno hamon win french socialist parti presidenti nomin new york time',
 'excerpt draft script donald trump q ampa black church pastor new york time',
 'megan twohey scott shane back channel plan ukrain russia courtesi trump associ new york time']

In [39]:
len(corpus)

20800

In [38]:
y

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64

In [40]:
# converting the textual data into numerical data 
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
X = vectorizer.transform(corpus)

In [44]:
print(X[1:3])

  (0, 16799)	0.30071745655510157
  (0, 6816)	0.1904660198296849
  (0, 5503)	0.7143299355715573
  (0, 3568)	0.26373768806048464
  (0, 2813)	0.19094574062359204
  (0, 2223)	0.3827320386859759
  (0, 1894)	0.15521974226349364
  (0, 1497)	0.2939891562094648
  (1, 15611)	0.41544962664721613
  (1, 9620)	0.49351492943649944
  (1, 5968)	0.3474613386728292
  (1, 5389)	0.3866530551182615
  (1, 3103)	0.46097489583229645
  (1, 2943)	0.3179886800654691


### Splitting the vectorized data into train and test set

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify=y, random_state=33)

### Logistic Regression

In [52]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [53]:
conf_mat = confusion_matrix(y_test,y_pred)
acc = accuracy_score(y_test,y_pred)
class_rep = classification_report(y_test,y_pred)

print(conf_mat)
print(f'Accuracy score of the training data : {acc}')
print(class_rep)


[[2989  127]
 [  24 3100]]
Accuracy score of the training data : 0.975801282051282
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      3116
           1       0.96      0.99      0.98      3124

    accuracy                           0.98      6240
   macro avg       0.98      0.98      0.98      6240
weighted avg       0.98      0.98      0.98      6240



### Multinomial Naive Bayes

Multinomial Naïve Bayes uses term frequency i.e. the number of times a given term appears in a document. Term frequency is often normalized by dividing the raw term frequency by the document length

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=33)

In [57]:
classifier=MultinomialNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [58]:
conf_mat = confusion_matrix(y_test,y_pred)
acc = accuracy_score(y_test,y_pred)
class_rep = classification_report(y_test,y_pred)

print(conf_mat)
print(f'Accuracy score of the training data : {acc}')
print(class_rep)

[[3090   20]
 [ 268 2862]]
Accuracy score of the training data : 0.9538461538461539
              precision    recall  f1-score   support

           0       0.92      0.99      0.96      3110
           1       0.99      0.91      0.95      3130

    accuracy                           0.95      6240
   macro avg       0.96      0.95      0.95      6240
weighted avg       0.96      0.95      0.95      6240



### MODEL COMPARISON

In this dataset we can observe that:

(i) With one hot encodiding:

- Bidirectional LSTM: Accuracy is: 98.36 %

(ii) Using TF-IDF to vectorize the data:

- Logistic Regression : Accuracy is 97.58 %

- Multinomial Naive Bayes : Accuracy is 95.38 %

### CONCLUSION 

- Bidirectional LSTM with one hot encoding and embedding works great.
- We can also try vectorizing the data using word2vec