In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

`Deactivating JEDI for autocomplete to work`

In [None]:
%config Completer.use_jedi = False

# Feature Engineering - Describing, preprocessing, cleaning

`Reading train.csv`

In [None]:
# Load the labelled training split; the path is relative to the notebook's working directory
train_data = pd.read_csv("../input/fake-news/train.csv")

`Generating descriptive statistics about the dataframe`

In [None]:
# Last expression of the cell, so the summary table is rendered as the cell output
train_data.describe()

`Counting the null values in each column of the dataframe`

In [None]:
# Per-column count of missing values in the training frame
train_data.isnull().sum()

`Repeating the above steps for test.csv`

In [None]:
# Load the test split; the path is relative to the notebook's working directory
test_data = pd.read_csv("../input/fake-news/test.csv")

In [None]:
# Last expression of the cell, so the summary table is rendered as the cell output
test_data.describe()

In [None]:
# Per-column count of missing values in the test frame
test_data.isnull().sum()

In [None]:
# Preview the first rows of the training frame
train_data.head()

In [None]:
# Preview the first rows of the test frame
test_data.head()

`Counting the rows for each unique value of the "label" column`

In [None]:
# Row count per distinct value of the target column
train_data['label'].value_counts()

In [None]:
# Class-balance figure with two side-by-side panels: a bar chart and a pie chart.
# The target takes the values {0, 1}, described in this notebook as reliable/unreliable sources.
fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(12,4), dpi=100)

# Bar chart of the row count per label.
# The Series is passed as `x=`: recent seaborn releases bind the first positional
# argument to `data`, which no longer draws one bar per label value.
sns.countplot(x=train_data['label'], ax=axes[0])

# value_counts() orders its result by frequency, not by label value. Sorting by the
# label keeps the counts in 0, 1 order so each slice gets the matching caption.
# NOTE(review): assumes 0 = reliable and 1 = unreliable -- confirm against the dataset description.
label_counts = train_data['label'].value_counts().sort_index()

# Pie chart of the same counts.
## 'autopct' formats the percentage shown on each slice.
## 'explode' offsets the first slice slightly from the centre.
## 'startangle' rotates the start of the first slice.
axes[1].pie(
    label_counts,
    labels=['reliable source', 'unreliable source'],
    autopct='%1.2f%%',
    shadow=True,
    explode=(0.05, 0),
    startangle=90,
)

# Centered title shared by both panels
fig.suptitle('Fake News', fontsize=24)
plt.show()

## Managing Null values

`Since there are null values in the categorical (text) columns, we fill them with empty strings`

In [None]:
# Replace every missing value with an empty string (applies to all columns of the frame)
train_data = train_data.fillna('')

In [None]:
# Replace every missing value with an empty string (applies to all columns of the frame)
test_data = test_data.fillna('')

`Merging two columns {title,author} for predicting fake/real news`

In [None]:
# Combined text feature: title and author separated by a single space
train_data = train_data.assign(
    total=train_data['title'] + ' ' + train_data['author']
)

In [None]:
# Combined text feature: title and author separated by a single space
test_data = test_data.assign(
    total=test_data['title'] + ' ' + test_data['author']
)

`Separating the {label} column from the features before training`

In [None]:
# Target vector, then the feature frame with the target column removed
y = train_data['label']
X = train_data.drop(columns='label')

In [None]:
# (rows, columns) of the feature frame
X.shape

In [None]:
# Length of the target vector
y.shape

## Building Vocab

In [None]:
import nltk
import re
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads during text cleaning
nltk.download('stopwords')

In [None]:
# Vocabulary size: passed to one_hot() as its `n` argument and to Embedding() as its first argument
vocab = 8000
# Independent copies of the feature frames; the text-cleaning step reads their 'total' column
news = X.copy()
news_test = test_data.copy()

`Using stemming to reduce each word to its root form`

In [None]:
from nltk.stem.porter import PorterStemmer
# Single stemmer instance shared by the train and test text cleaning
ps = PorterStemmer()
# Holds one cleaned string per training row
corpus = []

In [None]:
# Read the stop-word list once and keep it as a set. The previous code called
# stopwords.words('english') for every token, re-reading the list each time.
stop_words = set(stopwords.words('english'))


def clean_text(text):
    """Turn one raw 'total' string into a space-joined string of stemmed tokens.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space
         (this drops digits, punctuation, emojis and URL symbols),
      2. lower-case the text and split it on whitespace,
      3. drop English stop words and stem the remaining tokens with `ps`,
      4. join the stems back together with single spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text; empty if no token survives the filtering.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(ps.stem(word) for word in tokens if word not in stop_words)


# Both lists are rebuilt from scratch, so re-running this cell cannot append
# duplicate rows. Iterating over the column itself (rather than indexing by
# position) also keeps this correct if the frame's index is not 0..n-1.
corpus = [clean_text(text) for text in news['total']]
corpus_test = [clean_text(text) for text in news_test['total']]

`Encoding each word as an integer index bounded by the vocab size using Keras one_hot (word hashing, not one-hot vectors)`

In [None]:
from tensorflow.keras.preprocessing.text import one_hot

In [None]:
# Encode every cleaned document as a list of integer word indices, using `vocab` as the index range
onehot = [one_hot(document, n=vocab) for document in corpus]
onehot_test = [one_hot(document, n=vocab) for document in corpus_test]

`Padding each encoded sentence so that all of them have the same length`

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Same padding settings for both splits: zeros are added at the front and every row has length 25
pad_options = dict(padding='pre', maxlen=25)
embedded_docs = pad_sequences(onehot, **pad_options)
embedded_docs_test = pad_sequences(onehot_test, **pad_options)

# Training the Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout

`Using LSTM`

In [None]:
# Binary classifier over the padded word-index sequences:
# Embedding -> LSTM -> Dense(relu) -> Dense(sigmoid), with dropout after each stage.
model = Sequential()
# `vocab` is the same value handed to one_hot(); input_length=25 matches the maxlen used for padding
model.add(Embedding(vocab,40,input_length=25))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(64,activation='relu'))
model.add(Dropout(0.3))
# Single sigmoid unit: the output is a score in (0, 1) for the positive label
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so it is called directly;
# wrapping it in print() emitted an extra "None" line after the table.
model.summary()

`Converting the padded sequences and the labels to numpy arrays`

In [None]:
# Materialise the training inputs, training labels and test inputs as numpy arrays
X_final, y_final, test_final = (
    np.array(values) for values in (embedded_docs, y, embedded_docs_test)
)
# Show the three shapes as the cell output
tuple(arr.shape for arr in (X_final, y_final, test_final))

In [None]:
# Train on the full training set; the returned History object keeps the per-epoch metrics
history = model.fit(
    x=X_final,
    y=y_final,
    batch_size=64,
    epochs=20,
)

In [None]:
# Training accuracy per epoch, drawn on an explicit Axes instead of the implicit pyplot state.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'])
ax.set(title='model accuracy', ylabel='accuracy', xlabel='epoch')
# Ending with plt.show() keeps the cell from echoing the repr of the last call
plt.show()

In [None]:
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a single sigmoid
# output the equivalent is to threshold the predicted score at 0.5 and cast to int32,
# which keeps the same (n_samples, 1) shape and 0/1 values.
y_pred = (model.predict(test_final) > 0.5).astype("int32")

In [None]:
# Submission frame: the test ids next to their predicted labels, written without the index column
res = test_data[['id']].assign(label=y_pred)
res.to_csv('result.csv', index=False)