# 1. Importing Libraries

In [1]:
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot

import warnings
warnings.filterwarnings('ignore')

# 2. Import and Combine Data

In [50]:
# Load the four source datasets and bring each one to the same three columns
# (title, text, label) with a shared label encoding: 1 = fake, 0 = real.

# Rows from Fake.csv get label 1, rows from True.csv get label 0; the
# 'subject' and 'date' columns are not used and are dropped.
df1 = pd.read_csv("Dataset/Fake.csv").assign(label=1).drop(columns=['subject', 'date'])
df2 = pd.read_csv("Dataset/True.csv").assign(label=0).drop(columns=['subject', 'date'])

# news.csv stores its labels as the strings 'FAKE' / 'REAL': map them to 1 / 0,
# then discard the file's first column.
df3 = pd.read_csv("Dataset/news.csv")
df3 = df3.assign(
    label=df3['label'].replace({'FAKE': 1, 'REAL': 0})
).drop(columns=df3.columns[0])

# WELFake: discard the first column, then swap the 0/1 label values.
# NOTE(review): presumably this file uses the opposite encoding (1 = real) --
# confirm against the dataset documentation.
df4 = pd.read_csv("Dataset/WELFake_Dataset.csv")
df4 = df4.drop(columns=df4.columns[0])
df4 = df4.assign(label=df4['label'].replace({1: 0, 0: 1}))

In [51]:
# Preview the first rows of the Fake.csv frame (title, text, label)
print(df1.head())

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text  label  
0  Donald Trump just couldn t wish all Americans ...      1  
1  House Intelligence Committee Chairman Devin Nu...      1  
2  On Friday, it was revealed that former Milwauk...      1  
3  On Christmas day, Donald Trump announced that ...      1  
4  Pope Francis used his annual Christmas Day mes...      1  


In [52]:
# Preview the first rows of the True.csv frame (title, text, label)
print(df2.head())

                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text  label  
0  WASHINGTON (Reuters) - The head of a conservat...      0  
1  WASHINGTON (Reuters) - Transgender people will...      0  
2  WASHINGTON (Reuters) - The special counsel inv...      0  
3  WASHINGTON (Reuters) - Trump campaign adviser ...      0  
4  SEATTLE/WASHINGTON (Reuters) - President Donal...      0  


In [53]:
# Preview the first rows of the news.csv frame after its labels were mapped to 1/0
print(df3.head())

                                               title  \
0                       You Can Smell Hillary’s Fear   
1  Watch The Exact Moment Paul Ryan Committed Pol...   
2        Kerry to go to Paris in gesture of sympathy   
3  Bernie supporters on Twitter erupt in anger ag...   
4   The Battle of New York: Why This Primary Matters   

                                                text  label  
0  Daniel Greenfield, a Shillman Journalism Fello...      1  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...      1  
2  U.S. Secretary of State John F. Kerry said Mon...      0  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...      1  
4  It's primary day in New York and front-runners...      0  


In [54]:
# Preview the first rows of the WELFake frame after its 0/1 labels were swapped
print(df4.head())

                                               title  \
0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1                                                NaN   
2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3  Bobby Jindal, raised Hindu, uses story of Chri...   
4  SATAN 2: Russia unvelis an image of its terrif...   

                                                text  label  
0  No comment is expected from Barack Obama Membe...      0  
1     Did they post their votes for Hillary already?      0  
2   Now, most of the demonstrators gathered last ...      0  
3  A dozen politically active pastors came here f...      1  
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      0  


In [59]:
# Stack the four frames into one; ignore_index=True renumbers the rows 0..n-1
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

# 3. Data Processing

In [60]:
# Remove rows with empty values: any row with a missing value in any column
# is dropped. The row count is printed before and after.
print(len(df))
df = df.dropna(axis=0, how='any')
print(len(df))

123367
122770


In [63]:
# Remove duplicate data: rows are compared on 'title' only, and the first row
# seen for each distinct title is kept. The row count is printed before and after.
print(len(df))
df = df.drop_duplicates(subset='title', keep='first')
print(len(df))

122770
62308


In [64]:
# Shuffle (randomize) the rows
# frac=1 samples every row once; the fixed random_state makes the order reproducible.
# The original index labels are kept (the index is not reset).
df = df.sample(frac=1, random_state=42) 

In [65]:
#Combine Title and Text
# Join with a single space: plain concatenation would fuse the last word of the
# title with the first word of the body into one token.
df['final'] = df['title'] + ' ' + df['text']
df.head()

Unnamed: 0,title,text,label,final
30368,Boeing says current Air Force One contract wor...,"SIMI VALLEY, Calif. (Reuters) - Boeing Co on T...",0,Boeing says current Air Force One contract wor...
8926,Watch The NRA’s Wayne LaPierre Support Then O...,It s no secret that NRA executive vice preside...,1,Watch The NRA’s Wayne LaPierre Support Then O...
26110,Democrats in U.S. Congress urge review of Amaz...,WASHINGTON (Reuters) - A group of Democrats in...,0,Democrats in U.S. Congress urge review of Amaz...
12185,"BLACK LAWYER, Editor Of Legal Website Writes: ...","Elie Mystal, an editor and contributor to the ...",1,"BLACK LAWYER, Editor Of Legal Website Writes: ..."
29629,Trump order paves way for agencies to weaken h...,WASHINGTON (Reuters) - President Donald Trump ...,0,Trump order paves way for agencies to weaken h...


In [75]:
# Clean the combined 'final' column (title + body) in three passes:
# digit removal, stopword/punctuation filtering with lowercasing, and stemming.

#Remove numbers
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave every digit in place. The raw string
# avoids the invalid "\d" escape in a normal string literal.
df['final'] = df['final'].str.replace(r'\d+', '', regex=True)

# Load English stopwords
stopwords_set = set(stopwords.words('english'))

# Lowercase every whitespace-separated token and drop stopwords and tokens that
# are a piece of string.punctuation (e.g. a lone "-"). Punctuation attached to a
# word is kept. The input is 'final' (not 'text') so the title and the digit
# removal above are not thrown away.
df['final'] = df['final'].apply(
    lambda x: ' '.join(
        word.lower()
        for word in x.split()
        if word.lower() not in stopwords_set and word not in string.punctuation
    )
)

#Reduce words to root form
stemmer = PorterStemmer()
df['final'] = df['final'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

In [2]:
#Save DataFrame
# Persists the processed frame to the working directory so later cells can
# reload it without redoing the preprocessing. Any cell that reloads the frame
# must use this same file name. Requires `df` from the cells above to exist in
# the current kernel.
df.to_pickle('Dataframe_1.pkl')

NameError: name 'df' is not defined

In [3]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# To load the DataFrame back
# The file name must match the one written by the "Save DataFrame" cell
# ('Dataframe_1.pkl'); reading a different name breaks a fresh Restart & Run All.
# Unpickling can execute arbitrary code, so only load files this notebook produced.
data = pd.read_pickle('Dataframe_1.pkl')
# Data preprocessing
# NOTE(review): the model is fed the raw 'title' column; the cleaned 'final'
# column built earlier is not used here -- confirm this is intended.
X = data['title'].values
y = data['label'].values

# Tokenization
# NOTE(review): the tokenizer is fitted on all rows before the train/test split,
# so the vocabulary also covers words that only occur in the test rows.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
vocab_size = len(tokenizer.word_index) + 1  # +1: Keras word indices start at 1, 0 is the padding id

X = tokenizer.texts_to_sequences(X)

# Padding
max_length = 50  # Adjust as needed
X_padded = pad_sequences(X, maxlen=max_length, padding='post')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Build the LSTM model
embedding_dim = 50  # Adjust as needed
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(100))  # Adjust LSTM units as needed
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit -> probability of label 1

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

# Train the model
# NOTE(review): the test split is also passed as validation_data; use a separate
# validation split if epochs or hyperparameters are tuned from these curves.
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')

# y_pred holds the sigmoid outputs; y_pred_binary is the same array thresholded at 0.5
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions
print(y_pred)
print(y_pred_binary)




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 50)            1649200   
                                                                 
 lstm (LSTM)                 (None, 100)               60400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 1709701 (6.52 MB)
Trainable params: 1709701 (6.52 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5



KeyboardInterrupt: 

In [None]:
# Classification metrics computed from the thresholded predictions
# (y_pred_binary) against the test labels, each printed to four decimals.

# Accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_binary)
print(f'Accuracy: {accuracy:.4f}')

# Precision
precision = precision_score(y_true=y_test, y_pred=y_pred_binary)
print(f'Precision: {precision:.4f}')

# Recall
recall = recall_score(y_true=y_test, y_pred=y_pred_binary)
print(f'Recall: {recall:.4f}')

# F1-score
f1 = f1_score(y_true=y_test, y_pred=y_pred_binary)
print(f'F1-Score: {f1:.4f}')

In [None]:
# AUC-ROC
# Scored on the raw predicted probabilities (y_pred), not the thresholded labels.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred)
print(f'AUC-ROC: {roc_auc:.4f}')

# Confusion Matrix
# Built from the thresholded predictions; rows are true labels, columns are predictions.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_binary)
print(f'Confusion Matrix:\n{conf_matrix}')