# Loading and Displaying

In [None]:
import pandas as pd

# Read the real-news articles into a DataFrame
real_news = pd.read_csv('True.csv')

# Printing a large DataFrame shows a truncated view: the first and last rows
print(real_news)

# Number of rows in the frame
print(f"Length: {len(real_news)}")


                                                   title  \
0      As U.S. budget fight looms, Republicans flip t...   
1      U.S. military to accept transgender recruits o...   
2      Senior U.S. Republican senator: 'Let Mr. Muell...   
3      FBI Russia probe helped by Australian diplomat...   
4      Trump wants Postal Service to charge 'much mor...   
...                                                  ...   
21412  'Fully committed' NATO backs new U.S. approach...   
21413  LexisNexis withdrew two products from Chinese ...   
21414  Minsk cultural hub becomes haven from authorities   
21415  Vatican upbeat on possibility of Pope Francis ...   
21416  Indonesia to buy $1.14 billion worth of Russia...   

                                                    text       subject  \
0      WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1      WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2      WASHINGTON (Reuters) - The special counsel inv... 

In [None]:
import pandas as pd

# Read the fake-news articles into a DataFrame
fake_news = pd.read_csv('Fake.csv')

# Printing a large DataFrame shows a truncated view: the first and last rows
print(fake_news)

# Number of rows in the frame
print(f"Length: {len(fake_news)}")


                                                   title  \
0       Donald Trump Sends Out Embarrassing New Year’...   
1       Drunk Bragging Trump Staffer Started Russian ...   
2       Sheriff David Clarke Becomes An Internet Joke...   
3       Trump Is So Obsessed He Even Has Obama’s Name...   
4       Pope Francis Just Called Out Donald Trump Dur...   
...                                                  ...   
23476  McPain: John McCain Furious That Iran Treated ...   
23477  JUSTICE? Yahoo Settles E-mail Privacy Class-ac...   
23478  Sunnistan: US and Allied ‘Safe Zone’ Plan to T...   
23479  How to Blow $700 Million: Al Jazeera America F...   
23480  10 U.S. Navy Sailors Held by Iranian Military ...   

                                                    text      subject  \
0      Donald Trump just couldn t wish all Americans ...         News   
1      House Intelligence Committee Chairman Devin Nu...         News   
2      On Friday, it was revealed that former Milwauk...    

# Combining

In [None]:
# Read both source files
fake_news = pd.read_csv('Fake.csv')
real_news = pd.read_csv('True.csv')

# Tag every article with its class: 0 = fake, 1 = real
fake_news = fake_news.assign(label=0)
real_news = real_news.assign(label=1)

# Stack the two frames, then shuffle the rows
# (frac=1.0 samples every row, i.e. a full random permutation)
stacked = pd.concat([fake_news, real_news])
combined = stacked.sample(frac=1.0)

combined

Unnamed: 0,title,text,subject,date,label
8759,Supreme Court Justice Ginsburg 'regrets' Trump...,WASHINGTON (Reuters) - U.S. Supreme Court Just...,politicsNews,"July 14, 2016",1
20677,WHY GROWN MAN WAS ARRESTED In Democrats New “S...,"As a mother of three young girls, if anything ...",left-news,"Apr 26, 2016",0
22254,Kellyanne Conway: ‘Presidents aren’t judged by...,21st Century Wire says Kellyanne Conway thrash...,US_News,"January 23, 2017",0
14115,BADASS CAMPUS COPS Cite Students For Wearing E...,Trigger warning If liberal schools and badass ...,politics,"Apr 14, 2016",0
4634,NYPD So Delighted With Cop Who Killed Eric Ga...,Daniel Pantaleo was caught on video using an i...,News,"September 14, 2016",0
...,...,...,...,...,...
20336,Russian Islamic State fighter sentenced to han...,BAGHDAD (Reuters) - A Russian Islamic State fi...,worldnews,"September 12, 2017",1
14898,MEET THE TRUMPS: In Case You Missed It….Here’s...,"Even Barbara Walters, a master manipulator of ...",politics,"Nov 23, 2015",0
3316,Americans should hear both from Comey and Trum...,WASHINGTON (Reuters) - The head of the Senate ...,politicsNews,"June 8, 2017",1
19707,CNN FIRES BLACK DEM Party Chair: New Wikileaks...,You just can t make this up! CNN fires black D...,left-news,"Oct 31, 2016",0


# Training

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
from nltk.stem import PorterStemmer

# Pick the device used for training: the first GPU when one is visible to
# TensorFlow, otherwise the CPU.
# tf.test.is_gpu_available() is deprecated; tf.config.list_physical_devices
# is the documented replacement for checking whether a GPU is present.
gpu_devices = tf.config.list_physical_devices('GPU')
device_name = tf.test.gpu_device_name() if gpu_devices else 'CPU:0'
print('Using device: ', device_name)

print("Loading data...")
# Fixed seed so the shuffle -- and therefore the row order of Combined.csv and
# the membership of the train/validate/test splits -- is the same on every run.
SEED = 42

# Load the dataset
fake = pd.read_csv('Fake.csv')
real = pd.read_csv('True.csv')

# Add labels to the datasets: 0 = fake, 1 = real
fake['label'] = 0
real['label'] = 1

# Combine the datasets and shuffle.
# ignore_index=True gives every row a unique index label; without it the two
# source frames contribute overlapping labels (both start at 0).
data = pd.concat([fake, real], ignore_index=True).sample(frac=1, random_state=SEED)

# Write the combined data to a new CSV file
data.to_csv('Combined.csv', index=False)

print("Data loaded successfully and combined CSV generated!")

# Split the dataset 60% / 20% / 20% by position.
# `data` is already shuffled, so contiguous positional slices are random
# splits; a second shuffle is not needed. `.copy()` makes each split an
# independent frame, so the later in-place column assignments
# (train['text'] = ...) do not trigger SettingWithCopyWarning. Plain iloc
# slicing also avoids np.split on a DataFrame, which depends on the
# deprecated DataFrame.swapaxes.
print("Splitting data...")
train_end = int(.6*len(data))
validate_end = int(.8*len(data))
train = data.iloc[:train_end].copy()
validate = data.iloc[train_end:validate_end].copy()
test = data.iloc[validate_end:].copy()
print("Data split successfully!")

# Preprocessing
print("Preprocessing data...")
# Single stemmer instance created once at module level; preprocess_text
# (defined right after this) looks it up as a global on every call.
stemmer = PorterStemmer()
def preprocess_text(text):
    """Normalise one document for tokenisation.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased, split on whitespace, each word is passed through
    the module-level ``stemmer``, and the stems are re-joined with single
    spaces.

    Args:
        text: The raw document. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell, or None) are treated as
            an empty document instead of raising ``TypeError`` in ``re.sub``.

    Returns:
        str: The space-separated stemmed words; ``''`` when the input holds
        no letters or is not a string.
    """
    if not isinstance(text, str):
        return ''
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()
    return ' '.join(stemmer.stem(word) for word in letters_only.split())

# Clean the `text` column of each split in place with preprocess_text
for split_frame in (train, validate, test):
    split_frame['text'] = split_frame['text'].apply(preprocess_text)
print("Preprocessing done!")

# Tokenization and padding
print("Tokenizing and padding data...")

# The vocabulary is fitted on the training split only; the validation and
# test splits are encoded with that same vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['text'])

train_sequences, validate_sequences, test_sequences = (
    tokenizer.texts_to_sequences(frame['text'])
    for frame in (train, validate, test)
)

# Every split is padded to the length of the longest training sequence
max_len = max(map(len, train_sequences))

train_sequences, validate_sequences, test_sequences = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (train_sequences, validate_sequences, test_sequences)
)
print("Tokenization and padding done!")

with tf.device(device_name):
    print("Training model...")

    # Embedding -> LSTM -> single sigmoid unit (binary fake/real output).
    # The +1 on the vocabulary size presumably leaves room for index 0,
    # which the padded sequences use as filler -- confirm against the
    # Tokenizer documentation.
    vocab_size = len(tokenizer.word_index) + 1
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=128),
        LSTM(128),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Callbacks: a full-model snapshot per epoch, early stopping on the
    # validation loss, and a per-epoch metrics log appended to training.log.
    checkpoint = ModelCheckpoint('my_model_{epoch}.h5', save_weights_only=False)
    early_stop = EarlyStopping(monitor='val_loss', patience=2)
    csv_logger = CSVLogger('training.log', append=True)

    model.fit(
        train_sequences,
        train['label'],
        validation_data=(validate_sequences, validate['label']),
        epochs=3,
        callbacks=[checkpoint, early_stop, csv_logger],
    )

    # Persist the final model as well
    model.save("my_model.h5")

    print("Model training completed and model saved!")

print("Evaluating model...")

# The model outputs one probability per test row; threshold at 0.5 to get
# hard 0/1 labels and flatten them to a 1-D array for the metric functions.
predictions = model.predict(test_sequences)
predicted_classes = (predictions > 0.5).astype('int').flatten()

# Score the predictions against the held-out labels
true_labels = test['label']
accuracy = accuracy_score(true_labels, predicted_classes)
precision = precision_score(true_labels, predicted_classes)
recall = recall_score(true_labels, predicted_classes)
f1 = f1_score(true_labels, predicted_classes)

for metric_label, metric_value in (
    ('Accuracy: ', accuracy),
    ('Precision: ', precision),
    ('Recall: ', recall),
    ('F1 score: ', f1),
):
    print(metric_label, metric_value)
print("Model evaluation completed!")



Using device:  /device:GPU:0
Loading data...
Data loaded successfully and combined CSV generated!
Splitting data...
Data split successfully!
Preprocessing data...
Preprocessing done!
Tokenizing and padding data...
Tokenization and padding done!
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Model training completed and model saved!
Evaluating model...
Accuracy:  0.978619153674833
Precision:  0.9842352941176471
Recall:  0.9709842154131848
F1 score:  0.9775648516008414
Model evaluation completed!


# Using Model to Predict

In [None]:
from keras.models import load_model

# Load the saved model from disk ("my_model.h5" is the file written by
# model.save in the training cell). Only the model is reloaded here:
# `tokenizer`, `max_len`, `pad_sequences` and `preprocess_text`, which the
# code below relies on, still come from the live kernel, so the training cell
# must have been run in this session first.
model = load_model("my_model.h5")

def preprocess_input(text):
    """Turn one raw string into a padded sequence ready for ``model.predict``.

    The text is cleaned with ``preprocess_text``, encoded with the global
    ``tokenizer`` as a one-element batch, and padded to the global
    ``max_len``.

    Args:
        text: The raw input string.

    Returns:
        Whatever ``pad_sequences`` returns for the one-element batch.
    """
    cleaned = preprocess_text(text)
    token_ids = tokenizer.texts_to_sequences([cleaned])
    return pad_sequences(token_ids, maxlen=max_len)

# Ask for one piece of text and classify it with the loaded model.
# NOTE(review): the training cell fits the tokenizer and the model on the
# `text` column (article bodies), while this prompt asks for a headline --
# confirm that headline-only input is what is intended here.
user_input = input("Enter a news headline : ")
preprocessed_input = preprocess_input(user_input)

if not preprocessed_input.any():
    # An all-zero sequence means none of the words are in the tokenizer's
    # vocabulary (or the input was empty). The model would still emit a
    # number, but it would not reflect the input, so no verdict is reported.
    prediction = None
    predicted_class = None
    print("The input contains no words known to the model; no prediction can be made.")
else:
    prediction = model.predict(preprocessed_input)
    # Single row, single sigmoid output: threshold at 0.5 (0 = fake, 1 = real)
    predicted_class = (prediction > 0.5).astype('int')[0][0]

    if predicted_class == 0:
        print("The input is predicted as FAKE news.")
    else:
        print("The input is predicted as REAL news.")
