In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import plot_confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report,roc_curve,auc
from sklearn.utils import shuffle
import nltk
import nltk as nlp
import string
import re
import pickle
from textblob import TextBlob
from nltk.tokenize import word_tokenize
import re
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from tqdm import tqdm, tqdm_notebook
# Register the tqdm-backed `progress_map` / `progress_apply` methods on pandas
# objects. `tqdm.pandas` only takes keyword options for the progress bar; the
# `tqdm_notebook` class that used to be passed positionally is not such an
# option. NOTE(review): recent tqdm releases appear to reject positional
# arguments here outright - confirm against the installed version.
tqdm.pandas()

# Tokenizer / embedding configuration shared by the cells below.
MAX_NB_WORDS = 100000      # max no. of words for tokenizer
MAX_SEQUENCE_LENGTH = 200  # max length of each entry (sentence), including padding
VALIDATION_SPLIT = 0.2     # data for validation (not used in training)
EMBEDDING_DIM = 100        # width of the GloVe vectors; also selects the file below
# Despite the name this is the path of a single GloVe text file, not a directory.
GLOVE_DIR = "glove/glove.6B."+str(EMBEDDING_DIM)+"d.txt"

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Preparing the dataset: load both CSVs, label them, merge and shuffle.
SEED = 42  # fixed seed so the shuffle below is repeatable across kernel restarts

true = pd.read_csv(r'data/True.csv')
fake = pd.read_csv(r'data/Fake.csv')

# Binary label: 0 = true news, 1 = fake news.
true['target'] = 0
fake['target'] = 1

# Both CSVs are read with their own 0..N index. Without `ignore_index` the
# merged frame carries every small label twice, and any later index-based
# alignment (e.g. `DataFrame.join`) would pair a true row with a fake row.
df = pd.concat([true, fake], ignore_index=True)
# Shuffle reproducibly, then renumber so the index matches the new row order.
df = shuffle(df, random_state=SEED).reset_index(drop=True)

df.head()

In [None]:

wordnet = WordNetLemmatizer()
# Character class matching any single ASCII punctuation character.
regex = re.compile('[%s]' % re.escape(string.punctuation))
# Built once: `stopwords.words('english')` returns a list, so evaluating it
# per token (as before) paid a fresh call plus a linear scan for every word.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def basic_text_cleaning(line_from_column):
    """Tokenize, strip punctuation, drop English stop words and lemmatize.

    Parameters
    ----------
    line_from_column : str
        One raw text value (a cell of the `text` or `title` column).
        Missing values (None / NaN) yield an empty string; any other
        non-string value is converted with `str()` first.

    Returns
    -------
    str
        The surviving tokens, lemmatized with the module-level WordNet
        lemmatizer and joined by single spaces. Original casing is kept;
        only the stop-word comparison is case-insensitive.
    """
    if not isinstance(line_from_column, str):
        if pd.isna(line_from_column):
            return ''
        line_from_column = str(line_from_column)

    cleaned_tokens = []
    for token in word_tokenize(line_from_column):
        # Remove punctuation inside the token; pure-punctuation tokens vanish.
        stripped = regex.sub('', token)
        if stripped and stripped.lower() not in ENGLISH_STOPWORDS:
            cleaned_tokens.append(wordnet.lemmatize(stripped))

    return ' '.join(cleaned_tokens)


In [None]:

# (Re-)register `progress_map` on pandas objects so this cell is self-contained.
tqdm.pandas()

# Clean the article body first, then the headline; each gets its own column.
for raw_column, clean_column in (('text', 'clean_text'), ('title', 'clean_title')):
    df[clean_column] = df[raw_column].progress_map(basic_text_cleaning)

# Persist the cleaned frame (without the index) for later reuse.
df.to_csv("models/clean_news.csv", index=False)


SyntaxError: invalid character '“' (U+201C) (1369050451.py, line 2)

In [None]:
def _polarity(text):
    """TextBlob sentiment polarity of `text` (coerced to str)."""
    return TextBlob(str(text)).sentiment.polarity


def _word_count(text):
    """Number of whitespace-separated tokens in `text` (coerced to str)."""
    return len(str(text).split())


# Sentiment is computed for the article body only.
df['polarity'] = df['clean_text'].progress_map(_polarity)

# Character and word counts for the cleaned body, then the cleaned title.
for prefix in ('text', 'title'):
    cleaned = df['clean_' + prefix]
    df[prefix + '_len'] = cleaned.astype(str).progress_map(len)
    df[prefix + '_word_count'] = cleaned.progress_map(_word_count)


SyntaxError: invalid character '‘' (U+2018) (3640869389.py, line 1)

In [None]:
def ecdf(data):
    """Return the points of the empirical CDF of `data`.

    Parameters
    ----------
    data : sequence or array-like of sortable values

    Returns
    -------
    (x, y) : tuple of numpy arrays
        `x` is `data` sorted ascending; `y[i]` is `(i + 1) / len(data)`,
        i.e. the fraction of observations at or before position `i`.
    """
    n_points = len(data)
    cumulative_fraction = np.arange(1, n_points + 1) / n_points
    return np.sort(data), cumulative_fraction

def generate_hist_ECDF(data, x_label, title):
    """Plot the ECDF and a density histogram of `data`, split by class.

    The split uses the notebook-level `df['target']` column (1 is drawn as
    'Fake', 0 as 'True'), so `data` must be aligned with `df`.

    Parameters
    ----------
    data : pandas.Series
        Values to plot, one per row of `df`.
    x_label : str
        Label for the x axis of both panels.
    title : str
        Column name inserted into the figure title.
    """
    # Plotting order (fake first, then true) fixes the colour assignment.
    groups = [
        ('Fake', data[df['target'] == 1]),
        ('True', data[df['target'] == 0]),
    ]

    fig, (cdf_ax, hist_ax) = plt.subplots(1, 2, figsize=(15, 5))
    fig.suptitle('ECDF and histogram plots for col — {} comparison between true and false'.format(title))

    # Left panel: empirical CDF, one marker per observation.
    for group_name, values in groups:
        xs, ys = ecdf(data=values)
        cdf_ax.plot(xs, ys, marker='.', linestyle='none', label=group_name)
    cdf_ax.set(xlabel=x_label, ylabel='CDF')

    # Right panel: overlaid, normalised histograms.
    for group_name, values in groups:
        hist_ax.hist(values, density=True, bins=50, alpha=0.6, label=group_name)
    hist_ax.set(xlabel=x_label, ylabel='Probability')

    cdf_ax.legend()
    hist_ax.legend()
    plt.show()

def _top_ngrams(texts, n, **vectorizer_kwargs):
    """Return the `n` most frequent n-grams of `texts` as a text/count frame.

    `vectorizer_kwargs` are forwarded to `CountVectorizer`. Terms are ordered
    by total count, highest first; `n=None` keeps the whole vocabulary.
    """
    vec = CountVectorizer(**vectorizer_kwargs)
    bag_of_words = vec.fit_transform(texts)
    # Column sums of the document-term matrix = corpus-wide count per term.
    totals = bag_of_words.sum(axis=0)
    frequencies = sorted(
        ((term, totals[0, idx]) for term, idx in vec.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return pd.DataFrame(frequencies[:n], columns=['text', 'count'])


def _plot_fake_vs_true(corpus, n, xlabel_template, suptitle, **vectorizer_kwargs):
    """Draw side-by-side bar charts of the top n-grams for fake and true news.

    Rows are split with the notebook-level `df['target']` column (1 = fake,
    0 = true), so `corpus` must be aligned with `df`. `xlabel_template` is
    formatted with `(n, 'fake' | 'true')`.
    """
    subsets = (
        ('fake', corpus[df['target'] == 1].astype(str)),
        ('true', corpus[df['target'] == 0].astype(str)),
    )

    fig, axes = plt.subplots(1, 2, figsize=(20, 8))
    for ax, (kind, texts) in zip(axes, subsets):
        top = _top_ngrams(texts, n, **vectorizer_kwargs)
        ax.bar(top['text'], top['count'])
        # Rotate the labels the bar chart already placed instead of calling
        # set_xticklabels, which matplotlib documents as only safe after the
        # tick positions have been fixed.
        ax.tick_params(axis='x', labelrotation=45)
        ax.set(xlabel=xlabel_template.format(n, kind), ylabel='Count')

    plt.suptitle(suptitle)
    plt.tight_layout()
    plt.show()


def get_top_n_words(corpus, n=None):
    """Plot the `n` most frequent single words for fake vs true news.

    English stop words are excluded by the vectorizer.

    Parameters
    ----------
    corpus : pandas.Series
        Text column aligned with the notebook-level `df`.
    n : int or None
        How many terms to show per class; None shows every term.
    """
    _plot_fake_vs_true(
        corpus,
        n,
        xlabel_template='Top {} most frequent terms for {} news',
        suptitle='Comparison of most frequent terms (fake vs true)',
        stop_words='english',
    )


def get_top_n_bigram(corpus, n=None):
    """Plot the `n` most frequent bigrams for fake vs true news.

    No stop-word filtering is applied here.

    Parameters
    ----------
    corpus : pandas.Series
        Text column aligned with the notebook-level `df`.
    n : int or None
        How many bigrams to show per class; None shows every bigram.
    """
    _plot_fake_vs_true(
        corpus,
        n,
        xlabel_template='Top {} bigrams for {} news',
        suptitle='Comparison of bigrams (fake vs true)',
        ngram_range=(2, 2),
    )


In [None]:
# One box plot per engineered feature, each split by target class.
boxplot_panels = [
    ('polarity', 'Polarity'),
    ('text_len', 'Length of News characters'),
    ('text_word_count', 'Length of News words'),
    ('title_len', 'Length of News title characters'),
    ('title_word_count', 'Length of News title words'),
]

fig, axes = plt.subplots(1, len(boxplot_panels), figsize=(24, 8))

for ax, (column, panel_title) in zip(axes, boxplot_panels):
    # Outliers are hidden so the boxes stay readable.
    sns.boxplot(y=column, x='target', data=df, orient='v', ax=ax, showfliers=False)
    ax.set_title(panel_title, fontsize=17)

plt.show()


In [9]:
# ECDF + histogram for the character-length features (body, then title).
for length_column in ('text_len', 'title_len'):
    generate_hist_ECDF(data=df[length_column], x_label=length_column, title=length_column)

NameError: name 'generate_hist_ECDF' is not defined

In [None]:
# Sentiment distribution per class.
generate_hist_ECDF(data=df["polarity"], x_label="polarity", title="polarity")

# Ten most frequent words per class, for the body and then the title.
for text_column in ("clean_text", "clean_title"):
    get_top_n_words(corpus=df[text_column], n=10)


In [None]:
import numpy as np
import pandas as pd
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_dummies(df, col_target):
    """Replace `col_target` with one indicator column per distinct value.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col_target : hashable
        Name of the column to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` without `col_target`, followed by one integer 0/1
        column per distinct target value (the column name is the value).
        Row count and row order always match `df`.
    """
    # dtype=int keeps the indicators numeric regardless of the pandas default.
    indicators = pd.get_dummies(df[col_target], dtype=int)
    df_new = df.drop(columns=col_target)
    # Attach by position, not by index label: an index-based join multiplies
    # rows whenever the index has duplicates (e.g. after a plain pd.concat),
    # pairing rows with other rows' labels.
    for label in indicators.columns:
        df_new[label] = indicators[label].to_numpy()
    return df_new

def prep_features(df, labels, text):
    """Split a frame into a list of texts and an array of label columns.

    Parameters
    ----------
    df : pandas.DataFrame
    labels : list
        Names of the label columns to extract.
    text : hashable
        Name of the text column.

    Returns
    -------
    (texts, y) : tuple
        `texts` is a plain list of the `text` column's values; `y` is the
        `.values` array of the selected label columns.
    """
    label_matrix = df[labels].values
    texts = list(df[text])
    return texts, label_matrix

def prep_tokenizer(texts, MAX_NB_WORDS):
    """Fit a Keras `Tokenizer` on `texts` and pickle it to disk.

    Parameters
    ----------
    texts : list of str
        Documents used to build the vocabulary.
    MAX_NB_WORDS : int
        Passed to the tokenizer as `num_words`.

    Returns
    -------
    None
        The fitted tokenizer is written to `tokenizer.pickle` in the current
        working directory; callers load it back from there.
    """
    fitted = Tokenizer(num_words=MAX_NB_WORDS)
    fitted.fit_on_texts(texts)

    print("Tokenizer created — Saving Tokenizer")

    serialized = pickle.dumps(fitted, protocol=pickle.HIGHEST_PROTOCOL)
    with open('tokenizer.pickle', 'wb') as out_file:
        out_file.write(serialized)

    print("Tokenizer saved")

def prepare_training_test_data(texts, tokenizer, y):
    """Tokenize, pad, shuffle and split the data into train / validation sets.

    Reads the notebook-level `MAX_SEQUENCE_LENGTH` (padding length) and
    `VALIDATION_SPLIT` (fraction held out).

    Parameters
    ----------
    texts : list of str
        Documents to convert to integer sequences.
    tokenizer : fitted tokenizer exposing `texts_to_sequences` / `word_index`
    y : array-like
        Labels, one row per document, in the same order as `texts`.

    Returns
    -------
    (x_train, y_train, x_val, y_val)
        The last `int(VALIDATION_SPLIT * n)` shuffled rows form the
        validation set; everything before them is the training set.

    Raises
    ------
    ValueError
        If `y` does not have exactly one entry per document.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Vocabulary size:', len(word_index))

    print("Padding sequences")

    data = pad_sequences(sequences, padding='post', maxlen=MAX_SEQUENCE_LENGTH)
    print('Shape of data tensor:', data.shape)

    # Fancy indexing below needs an array; a plain list of labels would fail.
    y = np.asarray(y)
    n_samples = data.shape[0]
    if len(y) != n_samples:
        raise ValueError(
            'texts and y must have the same length, got {} and {}'.format(n_samples, len(y))
        )

    # Shuffle documents and labels with the same permutation.
    indices = np.arange(n_samples)
    np.random.shuffle(indices)
    data = data[indices]
    labels = y[indices]

    num_validation_samples = int(VALIDATION_SPLIT * n_samples)
    # Slice by an explicit boundary: with a validation count of 0 the negative
    # slices `[:-0]` / `[-0:]` would give an empty training set and put every
    # row into validation.
    split_at = n_samples - num_validation_samples
    x_train, x_val = data[:split_at], data[split_at:]
    y_train, y_val = labels[:split_at], labels[split_at:]

    print('Number of entries in each category:')
    print('training:', y_train.sum(axis=0))
    print('validation:', y_val.sum(axis=0))

    print('Tokenized sentences:\n', data[0])
    print('One hot label:\n', labels[0])

    return x_train, y_train, x_val, y_val


In [None]:
# One-hot encode the target: the `target` column becomes columns 0 and 1.
df_new = get_dummies(df=df, col_target="target")
df_new.head()  # not the cell's last expression, so nothing is rendered here

# Build the inputs for training: a list of texts and the label matrix.
labels = [0, 1]
x_train, y_train = prep_features(df=df_new, labels=labels, text="clean_text")

# Peek at the first document and its label row.
for first_sample in (x_train[0], y_train[0]):
    print(first_sample)

# Fit the tokenizer on every document and persist it to tokenizer.pickle.
prep_tokenizer(texts=x_train, MAX_NB_WORDS=MAX_NB_WORDS)


In [None]:
# Load the tokenizer from the pickle file in the working directory.
# Only unpickle files you created yourself: pickle can execute arbitrary code.
with open('tokenizer.pickle', 'rb') as pickle_file:
    tokenizer = pickle.loads(pickle_file.read())

# NOTE: x_train / y_train are both the inputs and the outputs of this call,
# so re-running this cell alone feeds already-padded data back in as `texts`.
x_train, y_train, x_test, y_test = prepare_training_test_data(x_train, tokenizer, y_train)


In [None]:
from tensorflow.keras.layers import Input, Embedding
import numpy as np

# word -> GloVe vector, filled from the text file at GLOVE_DIR.
embeddings_index = {}
word_index = tokenizer.word_index

# The context manager closes the file even if a line fails to parse.
with open(GLOVE_DIR, encoding="utf8") as glove_file:
    print('Loading GloVe from:', GLOVE_DIR, '…', end='')

    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # Each line is "<word> <v1> <v2> ...".
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')

print("Done.\nProceeding with Embedding Matrix…", end='')

# Row i holds the vector for the word with tokenizer index i. Rows start out
# random, so words that are missing from GloVe keep a random vector.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print("Completed!")

# Create embedding input layer
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Frozen embedding layer initialised from the matrix built above.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
    name='embeddings'
)

embedded_sequences = embedding_layer(sequence_input)


In [None]:
from tensorflow.keras.layers import LSTM, GlobalMaxPool1D, Dropout, Dense
from tensorflow.keras.models import Model

# Classifier head on top of the embedded sequences:
# LSTM over the tokens -> max over time -> small dense layer -> 2-way output.
x = LSTM(60, return_sequences=True, name='lstm_layer')(embedded_sequences)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
# The labels are one-hot over two mutually exclusive classes, so the output is
# a softmax distribution. Two independent sigmoids would not sum to 1.
preds = Dense(2, activation="softmax")(x)

model = Model(sequence_input, preds)

# categorical_crossentropy is the loss that pairs with one-hot targets and a
# softmax output.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()


In [None]:
# Training hyper-parameters for this run.
EPOCHS = 10
BATCH_SIZE = 32

# Train on the training split and evaluate on the held-out split each epoch.
held_out = (x_test, y_test)
history = model.fit(x_train, y_train, validation_data=held_out, epochs=EPOCHS, batch_size=BATCH_SIZE)


In [None]:
# Per-epoch metrics recorded by model.fit.
training_log = history.history

# The accuracy entry is named after the metric as compiled: 'accuracy' when
# the model is compiled with metrics=['accuracy'], 'acc' with the older short
# alias. Resolve it instead of hard-coding 'acc', which raises KeyError here.
acc_key = 'accuracy' if 'accuracy' in training_log else 'acc'

loss = training_log['loss']
val_loss = training_log['val_loss']
acc = training_log[acc_key]
val_acc = training_log['val_' + acc_key]
epochs = range(1, len(loss) + 1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# (axis, training series, validation series, metric name, y label)
panels = (
    (ax1, loss, val_loss, 'loss', 'Loss'),
    (ax2, acc, val_acc, 'accuracy', 'Accuracy'),
)
for ax, train_values, val_values, metric_name, y_label in panels:
    ax.plot(epochs, train_values, label='Training ' + metric_name)
    ax.plot(epochs, val_values, label='Validation ' + metric_name)
    ax.set_title('Training and validation ' + metric_name)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(y_label)
    ax.legend()

plt.show()

In [None]:
# Persist the trained model in HDF5 format under models/.
MODEL_PATH = 'models/fake_news_lstm_model.h5'
model.save(MODEL_PATH)
print("Model saved as fake_news_lstm_model.h5")
