# Final Group Assignment
- Classifying news articles as fake or real, using the "Fake and Real News" dataset from Kaggle.

https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset

In [35]:
#imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
from gensim.models import Word2Vec

In [20]:
# Read the fake-news articles and tag every row with class 0 (fake).
df_fake = pd.read_csv('Fake.csv').assign(label=0)
df_fake.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [21]:
# Read the real-news articles and tag every row with class 1 (real).
df_true = pd.read_csv('True.csv').assign(label=1)
df_true.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [22]:
# Merge both classes into one frame and shuffle the rows.
# The shuffle is seeded: the train/test splits further down use a fixed
# random_state, but that only selects the same *positions*; without a seeded
# shuffle the rows at those positions (and every reported score) would change
# on each run.
df_news = (
    pd.concat([df_fake, df_true])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# A short preview is enough to confirm that the two classes are interleaved.
df_news.head(10)

Unnamed: 0,title,text,subject,date,label
0,Liberals Push to Paint White House BROWN Becau...,Mark Dice rolls out another gem that displays ...,politics,"Aug 10, 2015",0
1,"‘Empire’ Takes On Racist Police Violence, Gun...",The season premiere of the hit television dram...,News,"September 21, 2016",0
2,LIBERAL BIGOT Destroyed by Legendary Democrat ...,https://www.youtube.com/watch?v=P-TBfkqk7gU,left-news,"Mar 16, 2017",0
3,IMF's Lagarde tells Ukraine president to speed...,KIEV (Reuters) - The head of the International...,worldnews,"December 8, 2017",1
4,Former Trump campaign staffer files discrimina...,NEW YORK (Reuters) - A 26-year-old former camp...,politicsNews,"February 1, 2016",1
5,Canada deported hundreds to war-torn countries...,TORONTO (Reuters) - Canada has deported hundre...,worldnews,"September 10, 2017",1
6,Reflections on a World Gone Mad and Pushing Ba...,Andre Vltchek Anti DiplomaticoThe following i...,Middle-east,"October 14, 2017",0
7,"U.N. decries Israel's killing of Gaza amputee,...",GENEVA (Reuters) - A senior U.N. official said...,worldnews,"December 19, 2017",1
8,WATCH HILARIOUSLY AWKWARD Moment Between Rick ...,Franken asked Perry if he enjoyed a meeting th...,politics,"Jan 20, 2017",0
9,Mark Zuckerberg Masterfully Takes Donald Trum...,If there s anyone right now worthy of complete...,News,"April 12, 2016",0


In [23]:
# Drop the metadata columns that are not used as model input.
# errors='ignore' keeps this cell re-runnable: df_news is reassigned here, so
# a second execution would otherwise raise KeyError because the columns are
# already gone.
df_news = df_news.drop(columns=['subject', 'date'], errors='ignore')

In [24]:
# Preview the first ten rows after dropping the unused columns.
df_news.iloc[:10]

Unnamed: 0,title,text,label
0,Liberals Push to Paint White House BROWN Becau...,Mark Dice rolls out another gem that displays ...,0
1,"‘Empire’ Takes On Racist Police Violence, Gun...",The season premiere of the hit television dram...,0
2,LIBERAL BIGOT Destroyed by Legendary Democrat ...,https://www.youtube.com/watch?v=P-TBfkqk7gU,0
3,IMF's Lagarde tells Ukraine president to speed...,KIEV (Reuters) - The head of the International...,1
4,Former Trump campaign staffer files discrimina...,NEW YORK (Reuters) - A 26-year-old former camp...,1
5,Canada deported hundreds to war-torn countries...,TORONTO (Reuters) - Canada has deported hundre...,1
6,Reflections on a World Gone Mad and Pushing Ba...,Andre Vltchek Anti DiplomaticoThe following i...,0
7,"U.N. decries Israel's killing of Gaza amputee,...",GENEVA (Reuters) - A senior U.N. official said...,1
8,WATCH HILARIOUSLY AWKWARD Moment Between Rick ...,Franken asked Perry if he enjoyed a meeting th...,0
9,Mark Zuckerberg Masterfully Takes Donald Trum...,If there s anyone right now worthy of complete...,0


# I. Splitting the data and vectorizing using TF-IDF (Term Frequency - Inverse Document Frequency)
- Additionally, we test the TF-IDF features with a Multinomial Naive Bayes classifier.

## a) Splitting the data, while including the title in X

In [25]:
# Target: the 0/1 label column. Features: headline and body joined into a
# single string per article, separated by one space.
y = df_news['label']
X = df_news['title'].add(' ').add(df_news['text'])

# First cut: 80% for training, 20% held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Second cut: halve the held-out part into validation and test (10% each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [27]:
# TF-IDF with English stop words removed; max_df=0.6 discards terms that occur
# in more than 60% of the training documents.
tfidf_vectorizer = TfidfVectorizer(max_df=0.6, stop_words='english')

# The vocabulary and IDF weights are learned from the training split only.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Validation and test data are projected with the already-fitted vectorizer.
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(split) for split in (X_val, X_test)
)

## b) Using grid search with 5-fold cross-validation to find the best parameters and estimator for Multinomial Naive Bayes

In [None]:
# Candidate values for alpha, MultinomialNB's additive smoothing parameter.
param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]}
nb_classifier = MultinomialNB()

# 5-fold cross-validated search over alpha, scored by accuracy.
grid_search = GridSearchCV(
    estimator=nb_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_tfidf, y_train)

best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

# The winning estimator is then scored on the held-out validation and test data.
best_classifier = grid_search.best_estimator_

# Validation split
y_val_pred = best_classifier.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.2f}')

# Test split
y_test_pred = best_classifier.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.2f}')

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_test_pred))

### Ultimately, it seems like MNB yielded impressively good results

## c) We do the same using a CNN to investigate whether it will output better results than the MNB Classifier

## d) Using the same vectorizer, we try an RNN to see whether it yields higher overall accuracy

# II. Using word embeddings for text representation (Word2Vec)

In [47]:
# Rebuild features and labels so this section does not depend on the splits
# created in section I (this section uses an 80/20 train/test split only).
X = df_news['title'] + ' ' + df_news['text']
y = df_news['label']

# Encode labels as integers; the fitted encoder is reused when evaluating.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text. Only the (max_words - 1) most frequent training words are
# kept when texts are converted to integer sequences.
max_words = 10000  # Adjust based on the size of your vocabulary
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad/truncate every sequence to a fixed length for the network input.
max_sequence_length = 1000  # Adjust based on your dataset
X_train_padded = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_sequence_length)

# One constant drives both the Word2Vec vector size and the Embedding layer,
# so the two cannot drift apart.
embedding_dim = 100

# Train Word2Vec on exactly the tokens the Keras Tokenizer produced.
# Splitting the raw text on whitespace (the previous approach) keeps case and
# attached punctuation, whereas the Tokenizer lower-cases and strips
# punctuation by default, so many Tokenizer words had no matching Word2Vec
# entry and their embedding rows silently stayed all-zero.
tokenized_sentences = [
    [tokenizer.index_word[idx] for idx in seq] for seq in X_train_seq
]
word2vec_model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=embedding_dim,
    window=5,
    min_count=1,
    workers=4,
)

# Embedding matrix: row i holds the Word2Vec vector of the word with Tokenizer
# index i. Row 0 (the padding index) and any word missing from the Word2Vec
# vocabulary stay zero.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_words and word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

# Simple feed-forward network on top of the frozen, pre-trained embeddings.
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_sequence_length, weights=[embedding_matrix], trainable=False))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit for the binary label

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; validation_split=0.2 holds out part of the training data
# for per-epoch validation.
model.fit(X_train_padded, y_train, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f05baf2dd90>

In [48]:
# Evaluate the model on the test set.
# model.predict returns one sigmoid output per sample; flatten to 1-D.
y_pred = model.predict(X_test_padded).flatten()

# Threshold at 0.5 to get encoded class ids. This previously referenced an
# undefined name (`y_pred_normalized`), which raises NameError on a fresh kernel.
y_pred_classes = (y_pred > 0.5).astype(int)

# Convert predictions back to original labels (for display)
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)
print(y_pred_labels)
print(y_test)

# y_test holds encoded labels, so the metrics compare it with the encoded
# predictions rather than with the inverse-transformed ones.
accuracy = accuracy_score(y_test, y_pred_classes)
print(f'Test Accuracy: {accuracy:.2f}')

# Print classification report for the test set
print(classification_report(y_test, y_pred_classes))

[1 1 1 ... 0 1 1]
[1 1 1 ... 0 1 1]
Test Accuracy: 0.96
              precision    recall  f1-score   support

           0       0.99      0.94      0.97      4702
           1       0.94      0.99      0.96      4278

    accuracy                           0.96      8980
   macro avg       0.96      0.97      0.96      8980
weighted avg       0.97      0.96      0.96      8980



  y = column_or_1d(y, warn=True)
