In [1]:
import json
import os
import pickle
import statistics

import h5py
import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from newsapi import NewsApiClient
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the dataset; later cells read its "headlines" and "outcome" columns.
df = pd.read_csv("csv_file/data.csv")

In [None]:
# One cleaned string per headline: split on whitespace, keep only the purely
# alphanumeric tokens, and join the survivors back with single spaces.
count_words = [
    ' '.join(token for token in headline.split() if token.isalnum())
    for headline in df["headlines"]
]

# Token-list view of the same data. A headline with no surviving tokens
# yields [""] here, because "".split(" ") == [""].
count_words_alnum = [cleaned.split(" ") for cleaned in count_words]

In [None]:
def flatten_extend(matrix):
    """Flatten one level of nesting.

    Args:
        matrix: Iterable whose items are themselves iterable
            (e.g. a list of token lists).

    Returns:
        A new list containing every element of every row, in order.
    """
    return [element for row in matrix for element in row]

# Concatenate the per-headline token lists into one flat list of tokens.
flatten_list = flatten_extend(count_words_alnum)

In [None]:
# Distinct tokens across all cleaned headlines (case-sensitive; no lowercasing is applied above).
flatten_list_unique = set(flatten_list)

In [None]:
# Number of distinct tokens found in the alphanumeric-filtered headlines.
print(len(flatten_list_unique))

In [None]:
def _clean_headline(headline):
    """Return ``headline`` reduced to its alphanumeric tokens.

    The text is split on single spaces. A token is kept only if it is purely
    alphanumeric or is exactly "COVID-19"; the latter is then rewritten to
    "COVID19" so it survives as a single alphanumeric token.

    Args:
        headline: Raw headline string.

    Returns:
        The kept tokens joined by single spaces.
    """
    kept = (
        token for token in headline.split(" ")
        if token.isalnum() or token == "COVID-19"
    )
    return ' '.join(kept).replace("COVID-19", "COVID19")


# Clean the column in a single pass. The previous version called
# df.replace() once per row over the whole frame, which is quadratic in the
# number of rows and could also rewrite identical strings in other columns.
df = df.assign(headlines=df["headlines"].map(_clean_headline))


In [None]:
# Spot-check the first 15 headlines. Slicing (rather than indexing positions
# 0..14 one by one) also works when the frame has fewer than 15 rows.
for headline in df["headlines"].iloc[:15]:
    print(headline)

In [None]:
# count_words holds one cleaned string per headline, so this counts the
# number of distinct cleaned headlines (not distinct words).
list_set = set(count_words)
unique_list = (list(list_set))
print(len(unique_list))

In [None]:
# Model inputs: the headline text and the "outcome" label column of df.
texts = df["headlines"]
labels = df["outcome"]

# Tokenization / padding settings shared by the tokenizer, pad_sequences and the Embedding layer.
max_words = 12137  # Vocabulary cap: passed as Tokenizer(num_words=...) and as the Embedding input size
maxlen = 75  # Fixed sequence length used by pad_sequences and the Embedding layer

In [None]:
# Fetch the NLTK stop-word corpus. The call's return value is not a word
# list, so it is deliberately not bound to `stopwords` (doing so shadowed the
# corpus reader imported from nltk.corpus in the imports cell).
nltk.download('stopwords')

In [None]:
# English stop words from the NLTK corpus. Note: this rebinds `stopwords`
# (imported from nltk.corpus in the imports cell) to the resulting word collection.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
# Display the stop-word collection for inspection.
stopwords

In [None]:
# Build the vocabulary from the headline texts; num_words is set to the max_words cap.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

In [None]:
# Word -> integer index mapping learned by fit_on_texts.
word_index = tokenizer.word_index

In [None]:
# Preview the first 20 entries instead of echoing the entire vocabulary
# mapping into the notebook output.
dict(list(word_index.items())[:20])

In [None]:
# Vocabulary without stop words. The stop-word list is bound to `stopwords`;
# the previously referenced `stop_words` is not defined in any visible cell
# and raised NameError. A set makes each membership test O(1).
stop_word_set = set(stopwords)
filtered_word_index = {word: index for word, index in word_index.items() if word.lower() not in stop_word_set}
    

In [None]:
# Vocabulary size after removing stop words.
print(len(filtered_word_index))

In [None]:
# Swap in the stop-word-free vocabulary. Remaining words keep their original indices.
# NOTE(review): presumably texts_to_sequences then skips stop words; other
# tokenizer tables (e.g. index_word) are not updated here — confirm this is intended.
tokenizer.word_index = filtered_word_index

In [None]:
# Encode every headline as a list of integer word indices using the tokenizer's current vocabulary.
sequences = tokenizer.texts_to_sequences(texts)

In [None]:
# Length (number of word indices) of each encoded headline.
sequence_lengths = list(map(len, sequences))

In [None]:
# Average encoded-headline length; useful for judging whether maxlen is a sensible padding width.
print(statistics.mean(sequence_lengths))

In [None]:

# Bring every encoded headline to the fixed length maxlen expected by the model.
X = pad_sequences(sequences, maxlen=maxlen)
# Labels as a NumPy array, in the same row order as X.
y = np.array(labels)

In [None]:
# Build, train and evaluate the LSTM headline classifier.

# Seed NumPy, TensorFlow and the train/test split so that a
# Restart & Run All reproduces comparable numbers.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

embedding_dim = 50  # Dimensionality of the embedding space
hidden_units = 50  # Number of LSTM units

# Embedding -> LSTM -> single sigmoid unit (binary classification).
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(LSTM(hidden_units))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Hold out 10% of the rows for the final test evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=SEED)

# Train the model; a further 10% of the training rows is used for validation.
epochs = 5
batch_size = 32

model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1)

# Evaluate the model on the held-out test rows
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Inspect the padded training matrix.
X_train

In [None]:
# Score the held-out rows and threshold the sigmoid outputs at 0.5
# to obtain hard 0/1 predictions.
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)

# Confusion matrix of actual vs. predicted labels
cm = confusion_matrix(y_test, y_pred_binary)

# Heatmap of the confusion matrix.
# NOTE(review): tick labels assume class 0 is 'Fake' and class 1 is 'Real' —
# confirm against the encoding of the "outcome" column.
class_names = ['Fake', 'Real']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# Per-class precision / recall / F1
report = classification_report(y_test, y_pred_binary)
print("Classification Report:\n", report)

In [None]:
# Read the News API key from the environment instead of committing it to the
# notebook. Set NEWSAPI_KEY before starting the kernel; a missing variable
# fails fast with KeyError. The key that used to be hardcoded here should be
# treated as leaked and revoked.
newsapi = NewsApiClient(api_key=os.environ["NEWSAPI_KEY"])

# /v2/top-headlines
covid_headlines = newsapi.get_top_headlines(q='covid',
                                            language='en',
                                            country='us')

# /v2/everything
covid_articles = newsapi.get_everything(q='covid',
                                        language='en',
                                        sort_by='relevancy',
                                        page=1)


# Collect the article titles returned by the /v2/everything query.
title = []
for article in covid_articles['articles']:
    headline = article['title']
    if not headline:
        # Guard against a missing/empty title, which would otherwise crash
        # on .split() in the cleaning step below.
        continue
    print(headline)
    title.append(headline)

# Apply the same token filter used on the training headlines (keep purely
# alphanumeric tokens, map "COVID-19" to "COVID19"), then lowercase.
clean_title = []
for headline in title:
    kept = (
        token for token in headline.split(" ")
        if token.isalnum() or token == "COVID-19"
    )
    clean_title.append(' '.join(kept).replace("COVID-19", "COVID19").lower())




In [None]:
# Inspect the cleaned, lowercased article titles.
clean_title

In [None]:
# News API setup: the key comes from the NEWSAPI_KEY environment variable so
# that no credential is stored in the notebook (a missing variable raises KeyError).
newsapi = NewsApiClient(api_key=os.environ["NEWSAPI_KEY"])

# Query the News API
# NOTE(review): this repeats the /v2/everything request made for
# covid_articles; news_data is not used by any visible cell — confirm it is needed.
news_data = newsapi.get_everything(q='covid',
                                        language='en',
                                        sort_by='relevancy',
                                        page=1)


In [None]:
def preprocess_text(text):
    """Encode one text with the notebook's tokenizer and pad it for the model.

    Relies on the notebook-level ``tokenizer`` and ``maxlen``. The padded
    array is printed as well as returned.

    Args:
        text: A single headline string.

    Returns:
        The result of ``pad_sequences`` for the one-element batch ``[text]``.
    """
    encoded = tokenizer.texts_to_sequences([text])
    # maxlen must match the length used during training
    padded_sequence = pad_sequences(encoded, maxlen=maxlen)
    print(padded_sequence)
    return padded_sequence

true_labels = []
predicted_labels = []
# Process and predict for each cleaned news title.
# The loop variable is named `headline` so the `title` list built in the
# fetching cell is no longer overwritten with a single string.
for headline in clean_title:
    # NOTE(review): "contains 'covid'" is used as the reference label here.
    # That is a keyword check, not a fake/real ground truth — confirm that
    # this is what the confusion matrix below is meant to measure.
    mentions_covid = "covid" in headline.lower()
    true_labels.append(1 if mentions_covid else 0)
    if mentions_covid:
        print(headline)

    # Preprocess the text
    processed_title = preprocess_text(headline)

    # Make predictions. predict() returns an array for the one-row batch;
    # pull out the single score as a plain float instead of relying on the
    # truth value of a 2-D array in the comparisons below.
    prediction = float(np.ravel(model.predict(processed_title))[0])
    predicted_labels.append(0 if prediction < 0.5 else 1)

    # Display results
    print(f"Title: {headline}")
    print(f"Prediction: {'Fake' if prediction < 0.5 else 'Real'}")
    print("=" * 50)



In [None]:
# Compare the keyword-derived reference labels with the model's predictions
# for the fetched titles.
conf_matrix = confusion_matrix(true_labels, predicted_labels)

# Confusion matrix, heading and values on separate lines
print("Confusion Matrix:", conf_matrix, sep="\n")

# Per-class precision / recall / F1, heading first
print("Classification Report:", classification_report(true_labels, predicted_labels), sep="\n")