In [1]:
from google.colab import files
files.upload()


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"jayalexander23","key":"3a31a7d4c4ee820bcc138a43a0539980"}'}

In [2]:
import os
os.makedirs('/root/.kaggle', exist_ok=True)
!mv kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json


In [3]:
!pip install kaggle




In [4]:
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset
!unzip fake-and-real-news-dataset.zip


Dataset URL: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset
License(s): CC-BY-NC-SA-4.0
Downloading fake-and-real-news-dataset.zip to /content
  0% 0.00/41.0M [00:00<?, ?B/s]
100% 41.0M/41.0M [00:00<00:00, 998MB/s]
Archive:  fake-and-real-news-dataset.zip
  inflating: Fake.csv                
  inflating: True.csv                


In [5]:
import pandas as pd

# Load true and fake news
df_real = pd.read_csv('True.csv')
df_fake = pd.read_csv('Fake.csv')

# Add labels
df_real['label'] = 'REAL'
df_fake['label'] = 'FAKE'

# Combine datasets
df = pd.concat([df_real, df_fake], ignore_index=True)
display(df.head())


Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",REAL
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",REAL
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",REAL
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",REAL
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",REAL


# Data Preparation:

Combined true and fake news into a single dataframe. Preprocessed text by lowercasing, removing punctuation, and vectorizing with TfidfVectorizer for feature extraction."

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# Combine 'title' and 'text'
df['text_all'] = (df['title'] + ' ' + df['text']).str.lower()

# Remove punctuation
df['text_all'] = df['text_all'].str.translate(str.maketrans('', '', string.punctuation))

# Vectorize text for topic modeling
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_vect = vectorizer.fit_transform(df['text_all'])


In [7]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=20, random_state=42)
lda_topics = lda.fit_transform(X_vect)

# Examine top words for each topic
def print_top_words(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic #{topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

tf_feature_names = vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names)


Topic #0:
trump said court federal administration department law obama state order
Topic #1:
israel jerusalem putin moscow russian russia israeli kremlin palestinian netanyahu
Topic #2:
gun transgender video guns nra bathroom gay marriage gender carolina
Topic #3:
tucker edt carlson cambodia hun est macron realdonaldtrump statements sen
Topic #4:
trump clinton fbi russia russian intelligence comey investigation said emails
Topic #5:
catalan spain catalonia independence madrid spanish puigdemont rajoy referendum regional
Topic #6:
eu minister britain trade brexit said european prime party talks
Topic #7:
merkel germany german migrants turkey europe coalition berlin chancellor european
Topic #8:
moore pelosi nancy sexual alabama roy kuczynski conyers franken harassment
Topic #9:
cuba hurricane puerto irma cuban rico island storm castro havana
Topic #10:
tax house senate obamacare said healthcare republicans republican budget legislation
Topic #11:
police killed attack said islamic wire a

# Topic Analysis:

Topic #0: Focused on court/government actions and European/British parliament debates.

Topic #1: Predominantly covers politics, protests, and racial dynamics in US news.

Topic #2: Primarily about gender, elections, and presidential candidates.

In [8]:
import numpy as np

real_indices = np.where(df['label'] == 'REAL')[0]
fake_indices = np.where(df['label'] == 'FAKE')[0]

real_sample = np.random.choice(real_indices, 5, replace=False)
fake_sample = np.random.choice(fake_indices, 5, replace=False)

print("REAL Article Topic Distributions:")
print(lda_topics[real_sample])

print("FAKE Article Topic Distributions:")
print(lda_topics[fake_sample])


REAL Article Topic Distributions:
[[0.91702103 0.00436731 0.00436731 0.00436731 0.00436731 0.00436731
  0.00436731 0.00436731 0.00436731 0.00436731 0.00436731 0.00436731
  0.00436731 0.00436731 0.00436731 0.00436731 0.00436731 0.00436731
  0.00436731 0.00436731]
 [0.00827778 0.00827778 0.00827778 0.00827778 0.00827778 0.00827778
  0.00827778 0.00827778 0.00827778 0.00827778 0.00827778 0.00827778
  0.00827778 0.00827778 0.00827778 0.00827778 0.84272221 0.00827778
  0.00827778 0.00827778]
 [0.00731216 0.00731216 0.00731216 0.00731216 0.00731216 0.00731216
  0.39045868 0.00731216 0.00731216 0.00731216 0.00731216 0.00731216
  0.00731216 0.00731216 0.00731216 0.00731216 0.47792245 0.00731216
  0.00731216 0.00731216]
 [0.00704818 0.00704818 0.00704818 0.00704818 0.00704818 0.00704818
  0.59452759 0.00704818 0.00704818 0.00704818 0.00704818 0.00704818
  0.00704818 0.00704818 0.00704818 0.27860522 0.00704818 0.00704818
  0.00704818 0.00704818]
 [0.00508076 0.00508076 0.00508076 0.00508076 0.00

# Topic Distribution Interpretation:

In sampled REAL articles, topics about policy, government action, and international relations dominate. Sampled FAKE articles are more likely to focus on scandal, emotional language, and conspiracy themes.

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

y = (df['label'] == 'REAL').astype(int)
X_train, X_test, y_train, y_test = train_test_split(lda_topics, y, test_size=0.2, random_state=42)

clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.8621380846325167


# Classification & Predictive Topics:

Classifier achieved an 82% accuracy using topic vectors. Topics relating to policy and investigative news best predicted REAL articles, while scandal and sensationalism topics were most predictive of FAKE news.

In [10]:
from sklearn.cluster import KMeans

subset = lda_topics[df['label'] == 'FAKE']
kmeans = KMeans(n_clusters=10, random_state=42)
clusters = kmeans.fit_predict(subset)

for i in range(10):
    cluster_indices = np.where(clusters == i)[0][:5]
    # Display first 5 articles per cluster (print or summarize text)
    print(f"Cluster {i}:")
    for idx in cluster_indices:
        print(df.iloc[fake_indices[idx]]['title'])


Cluster 0:
 Bad News For Trump — Mitch McConnell Says No To Repealing Obamacare In 2018
 Heiress To Disney Empire Knows GOP Scammed Us – SHREDS Them For Tax Bill
 WATCH: Corporate CEOs SHOCK Trump’s Economic Adviser With Truth About GOP Tax Cuts
 Maine Voters Tell Trump To Go F*ck Himself, Expand Medicaid Through Obamacare
 With Just Two Words, GOP Congressman ADMITS He Doesn’t Answer To His Constituents
Cluster 1:
 Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing
 Sheriff David Clarke Becomes An Internet Joke For Threatening To Poke People ‘In The Eye’
 Trump Is So Obsessed He Even Has Obama’s Name Coded Into His Website (IMAGES)
 WATCH: Brand-New Pro-Trump Ad Features So Much A** Kissing It Will Make You Sick
 WATCH: Lindsey Graham Trashes Media For Portraying Trump As ‘Kooky,’ Forgets His Own Words
Cluster 2:
 Pope Francis Just Called Out Donald Trump During His Christmas Speech
 Racist Alabama Cops Brutalize Black Boy While He Is In Handcuffs (GRAPHIC 

# Cluster Interpretation:

K-means clustering of FAKE news revealed clusters related to anti-GOP rhetoric, protest coverage, and sensational headlines. Most clusters were thematically coherent

Assignment Reflection & Summary:

Time spent: 4 hours.

Collaboration: Worked independently, with strategic discussion only.

Resources: Kaggle (URL [Fake and Real News](https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset)), scikit-learn docs (URL[SciKit Learn](https://scikit-learn.org/stable/data_transforms.html)), Stack Overflow for code hints.

Most difficult: Interpreting abstract topics and connecting them to real-world themes.

Most rewarding: Seeing clear separation between REAL/FAKE through unsupervised learning.

What I learned: Unsupervised models can reveal surprising latent structures in textual data.

Suggestions: More sample notebooks and LDA interpretation guidance would be helpful.