In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

data_train = pd.read_csv("/content/drive/MyDrive/fakenews/fakenews/train.tsv", sep="\t", header=None)
data_valid = pd.read_csv("/content/drive/MyDrive/fakenews/fakenews/valid.tsv", sep="\t", header=None)
data_test = pd.read_csv("/content/drive/MyDrive/fakenews/fakenews/test.tsv", sep="\t", header=None)

In [None]:
#Below function performs all the required data cleaning and preprocessing steps

def data_preprocessing(dataset):
  #Creating new column called 'label' with 1 for true and mostly-true values, else 0 i.e. 1=real, 0=fake
  dataset['label']=[1 if x=="true"or x=="mostly-true" else 0 for x in dataset[1]] 
  #Dropping unwanted columns
  dataset = dataset.drop(labels=[0,1,8,9,10,11,12] ,axis=1)
  #Dealing with empty datapoints for metadata columns - subject, speaker, job, state,affiliation, context
  meta = []
  for i in range(len(dataset)):
      subject = dataset[3][i]
      if subject == 0:
          subject = 'None'

      speaker =  dataset[4][i]
      if speaker == 0:
          speaker = 'None'

      job =  dataset[5][i]
      if job == 0:
          job = 'None'

      state =  dataset[6][i]
      if state == 0:
          state = 'None'

      affiliation =  dataset[7][i]
      if affiliation == 0:
          affiliation = 'None'

      context =  dataset[13][i]
      if context == 0 :
          context = 'None'

      meta.append(str(subject) + ' ' + str(speaker) + ' ' + str(job) + ' ' + str(state) + ' ' + str(affiliation) + ' ' + str(context)) #combining all the meta data columns into a single column
  
  #Adding cleaned and combined metadata column to the dataset
  dataset[14] = meta
  dataset["sentence"] = dataset[14].astype('str')+" "+dataset[2] #Combining metadata and the text columns into single columns

  dataset = dataset.drop([2,3,4,5,6,7,13,14], axis=1) #dropping metadata columns, as we have merged them into a single column
  dataset.dropna() #Dropping if there are still any null values

  return dataset

In [None]:
#Applying pre-processing to the raw data - train, valid and test sets
data_train = data_preprocessing(data_train)
data_valid = data_preprocessing(data_valid)
data_test = data_preprocessing(data_test)

In [None]:
#Sample data after preprocessing
data_train.head(5)

Unnamed: 0,label,sentence
0,0,abortion dwayne-bohac State representative Tex...
1,0,"energy,history,job-accomplishments scott-surov..."
2,1,foreign-policy barack-obama President Illinois...
3,0,health-care blog-posting nan nan none a news r...
4,0,"economy,jobs charlie-crist nan Florida democra..."


In [None]:
# maximun length of a sentence in the corpus
max_len_train = (data_train['sentence'].apply(lambda x: len(x.split(' ')))).max()
print('Max length of a sentence in the corpus: ' + str(max_len_train))

Max length of a sentence in the corpus: 313


In [None]:
# maximun length of a sentence in the corpus
max_len_test = (data_test['sentence'].apply(lambda x: len(x.split(' ')))).max()
print('Max length of a sentence in the corpus: ' + str(max_len_test))

Max length of a sentence in the corpus: 291


In [None]:
max_len_valid = (data_valid['sentence'].apply(lambda x: len(x.split(' ')))).max()
print('Max length of a sentence in the corpus: ' + str(max_len_test))

Max length of a sentence in the corpus: 291


In [None]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras import regularizers
from keras.models import Model
from keras.layers import *
from keras.utils.np_utils import to_categorical
import re
from keras.optimizers import Adam
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from gensim.models import KeyedVectors

import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# tokenizing the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_train.sentence)
X1 = tokenizer.texts_to_sequences(data_train.sentence.values)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# padding sequences with zeroes
X1 = pad_sequences(X1, padding = 'post', maxlen = max_len_train)

# one hot encoding the labels
y1 = to_categorical(data_train.label)

In [None]:
# tokenizing the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_test.sentence)
X2 = tokenizer.texts_to_sequences(data_test.sentence.values)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# padding sequences with zeroes
X2 = pad_sequences(X2, padding = 'post', maxlen = max_len_test)

# one hot encoding the labels
y2 = to_categorical(data_test.label)

In [None]:
# tokenizing the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_valid.sentence)
X3 = tokenizer.texts_to_sequences(data_valid.sentence.values)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# padding sequences with zeroes
X3 = pad_sequences(X2, padding = 'post', maxlen = max_len_valid)

# one hot encoding the labels
y3 = to_categorical(data_valid.label)

In [None]:
X_train = X1
y_train = y1
X_test = X2
y_test = y2
X_valid = X3
y_valid = y3
print("Size of Training data: " + str(len(X_train)))
print("Size of Testing data: " + str(len(X_test)))

Size of Training data: 10240
Size of Testing data: 1267


In [None]:
X_train.shape, X_test.shape, X_valid.shape

((10240, 313), (1267, 291), (1267, 69))

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import re
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [None]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, Bidirectional, Input, Concatenate, TimeDistributed
from tensorflow.keras.utils import plot_model
     

In [None]:
def plot_history(history):
    # summarize history for accuracy
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
vocab_size = 14957

In [None]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, input_length=32, output_dim=4))
model.add(Dropout(rate=0.4))
model.add(LSTM(units=4))
model.add(Dropout(rate=0.4))
model.add(Dense(units=100,  activation='relu'))
model.add(Dropout(rate=0.5))
model.add(Dense(units=2, activation='sigmoid'))

model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 32, 4)             59828     
                                                                 
 dropout_21 (Dropout)        (None, 32, 4)             0         
                                                                 
 lstm_7 (LSTM)               (None, 4)                 144       
                                                                 
 dropout_22 (Dropout)        (None, 4)                 0         
                                                                 
 dense_14 (Dense)            (None, 100)               500       
                                                                 
 dropout_23 (Dropout)        (None, 100)               0         
                                                                 
 dense_15 (Dense)            (None, 2)               

In [None]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics = ['accuracy'])

In [None]:
history = model.fit(
    X_train, 
    y_train, 
    validation_data=[X_valid, y_valid],
    epochs = 5
)

Epoch 1/5


ValueError: ignored