# Importing Data

In [2]:
import pandas as pd
true = pd.read_csv('../raw_data/True.csv')
fake = pd.read_csv('../raw_data/Fake.csv')

# Dropping rows not needed


In [3]:
true.drop(columns = ['date', 'subject'], inplace = True)
fake.drop(columns = ['date', 'subject'], inplace = True)

# Creating new datasets so we dont mess with the real ones

In [4]:
true_copy = true.copy()
fake_copy = fake.copy()

# Brief Data Cleaning

In [5]:
stop_words = ['/Getty Images']
pat = '|'.join(r"\b{}\b".format(x) for x in stop_words)
fake_copy['text'] = fake_copy['text'].str.replace(pat, '')
true_copy['text'] = true_copy['text'].str.replace(pat, '')

# Creating Target

In [6]:
true_copy['score'] = 1
fake_copy['score'] = 0

# Merging DataSets

In [7]:
data = pd.concat([true_copy, fake_copy], ignore_index=True)

# Concat Text/Title

In [8]:
data['title_text'] = data['title'] + data['text']
data.drop(columns = ['title', 'text'], inplace = True)

# Cleaning Data

In [9]:
import string

punc = string.punctuation + '“' + '”' + '’' + '‘'

def remove_punctuation(text):
    for punctuation in punc:
        text = text.replace(punctuation, '')
    return text

def lower_case(text):
    text = text.lower()
    return text

def remove_numbers(text):
    text = ''.join(word for word in text if not word.isdigit())
    return text


In [10]:
data['title_text'] = data['title_text'].apply(remove_punctuation)

In [11]:
data['title_text'] = data['title_text'].apply(lower_case)

In [12]:
data['title_text'] = data['title_text'].apply(remove_numbers)

# Splitting the data

In [13]:
x = data['title_text']
y = data['score']

In [14]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0,test_size=0.3)

# Tokenizing Data / Word2vec

from gensim.models import Word2Vec
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def convert_sentences(X):
    return [sentence.split(' ') for sentence in X]

def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        print(word)
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)

def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

def embedding_pipeline(word2vec, X):
    # Step #2: List of words to list of vectors
    X = convert_sentences(X)
    
    # Step #3: Sentences to list of words
    X = embedding(word2vec, X)
    
    # Step #4: Pad the inputs
    X = pad_sequences(X, dtype='float32', padding='post')
    
    return X
    

# Step #1: Train a word2vec - with possible hyperparameters
word2vec = Word2Vec(sentences=x_train, size=100, min_count=10, window=10)


x_train_pad = embedding_pipeline(word2vec, x_train)
x_test_pad = embedding_pipeline(word2vec, x_test)

In [None]:
#embedding(word2vec,convert_sentences(x_train[0:1]))

In [33]:
#word2vec.wv['man']

KeyError: "word 'man' not in vocabulary"

# New embedding attempt

In [27]:
from tensorflow import keras
from keras.preprocessing.text import Tokenizer


tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(x_train)

X_train = tokenizer.texts_to_sequences(x_train)
X_test = tokenizer.texts_to_sequences(x_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

x_train[0]

'as us budget fight looms republicans flip their fiscal scriptwashington reuters  the head of a conservative republican faction in the us congress who voted this month for a huge expansion of the national debt to pay for tax cuts called himself a fiscal conservative on sunday and urged budget restraint in  in keeping with a sharp pivot under way among republicans us representative mark meadows speaking on cbs face the nation drew a hard line on federal spending which lawmakers are bracing to do battle over in january when they return from the holidays on wednesday lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues such as immigration policy even as the november congressional election campaigns approach in which republicans will seek to keep control of congress president donald trump and his republicans want a big budget increase in military spending while democrats also want proportional increases for nondefense discretionary spending on

In [30]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

x_train_pad = pad_sequences(x_train)
tokenized_test = tokenizer.texts_to_sequences(x_test)
X_test_pad = pad_sequences(x_test)

ValueError: invalid literal for int() with base 10: 'breaking finally new wikileaks email…we are going to have to dump all those emailsthis latest wikileaks email is evidence of more than just  smoke where hillary s concerned it places her in the middl

# Modelling

In [20]:
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequenes

def init_model():
    model = Sequential()
    model.add(layers.Masking())
    model.add(layers.LSTM(10, activation='tanh'))
    model.add(layers.Dense(20, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    
    return model

model = init_model()

In [21]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=5, restore_best_weights=True)

model.fit(x_train_pad, y_train, 
          batch_size = 16,
          epochs=250,
          validation_split=0.3,
          callbacks=[es]
         )

NameError: name 'x_train_pad' is not defined

In [None]:
model.evaluate(x_test,y_test)