In [3]:
# run in Colab / local python
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

rows = []
base = "https://www.politifact.com"
# example: iterate pages — adjust to actual Politifact pages you need
for page in range(1, 300):
    url = f"https://www.politifact.com/factchecks/list/?page={page}"
    r = requests.get(url, headers={"User-Agent":"Mozilla/5.0"})
    soup = BeautifulSoup(r.text, "html.parser")
    cards = soup.select(".o-listicle__item")  # selector may change; inspect site
    for c in cards:
        stmt = c.select_one(".m-statement__quote")
        rating = c.select_one(".m-statement__meter .c-image")
        source = c.select_one(".m-statement__meta")
        if stmt:
            statement = stmt.get_text(strip=True)
            rating_text = rating['alt'] if rating and rating.has_attr('alt') else None
            source_text = source.get_text(" ", strip=True) if source else None
            rows.append({"statement":statement,"rating":rating_text,"source":source_text})
    time.sleep(1.5)

df = pd.DataFrame(rows)
df.to_csv("politifact_scraped.csv", index=False)
print(df.shape)


(8970, 3)


In [10]:
#2) Load CSV, basic cleaning, create binary labels
import pandas as pd

df = pd.read_csv("politifact_dataset.csv")  # or the scraped CSV
# drop unwanted column(s)
df = df.drop(columns=[c for c in df.columns if c.lower().startswith("unnamed")], errors='ignore')

# normalize rating -> binary (example mapping)
true_labels = ["true", "mostly-true", "mostly true", "true-ish", "mostly true"]  # expand as needed
false_labels = ["false", "pants-fire", "mostly-false", "mostly false"]

def map_binary(r):
    if pd.isna(r): return None
    t = str(r).lower()
    if any(x in t for x in true_labels): return 1
    if any(x in t for x in false_labels): return 0
    return None

df['BinaryNumTarget'] = df['target'].apply(map_binary)  # or df['rating']
df = df.dropna(subset=['statement','BinaryNumTarget'])


In [11]:
df.head()

Unnamed: 0,author,statement,source,date,target,BinaryTarget,BinaryNumTarget,Fake,Real
0,Marta Campabadal,“Netflix estrenó una película del Titan el 23 ...,Facebook posts,"June 29, 2023",FALSE,FAKE,0.0,FAKE,
1,Louis Jacobson,"Says that under his presidency, the unemployme...",Joe Biden,"June 29, 2023",mostly-true,REAL,1.0,,REAL
2,Jeff Cercone,"""ONU ordena despenalizar a los"" pedófilos.",Facebook posts,"June 29, 2023",FALSE,FAKE,0.0,FAKE,
3,Sara Swann,"NASA warns of “internet apocalypse,” which “me...",Facebook posts,"June 29, 2023",FALSE,FAKE,0.0,FAKE,
4,Jeff Cercone,Video suggests COVID-19 vaccines are responsib...,Instagram posts,"June 29, 2023",FALSE,FAKE,0.0,FAKE,


3) Feature extraction (TF-IDF) — Spark MLlib (prepare DataFrame)

Use PySpark in Colab (install Java + pyspark) or run on your cluster.

In [19]:
# PySpark setup (in Colab, you must install pyspark & Java)
# !apt-get install -y openjdk-11-jdk
# !pip install pyspark
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("politifact").getOrCreate()

# load csv into spark
sdf = spark.read.csv("politifact_dataset.csv", header=True, inferSchema=True)
sdf = sdf.dropna(subset=["statement","BinaryNumTarget"]).withColumnRenamed("BinaryNumTarget","label")

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

regexTok = RegexTokenizer(inputCol="statement", outputCol="words", pattern="\\W")
rem = StopWordsRemover(inputCol="words", outputCol="filtered")
cv = CountVectorizer(inputCol="filtered", outputCol="rawFeatures", vocabSize=20000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[regexTok, rem, cv, idf])
model = pipeline.fit(sdf)
sdf_feat = model.transform(sdf) \
    .withColumn("label", col("label").cast("integer")) \
    .select("statement", "label", "features")


In [20]:
sdf_feat.head()

Row(statement='“Netflix estrenó una película del Titan el 23 de junio”.', label=0, features=SparseVector(4930, {9: 3.5012, 51: 4.211, 180: 4.9861, 233: 4.9861, 235: 4.9861, 1007: 6.1347, 1893: 6.5025, 2325: 7.0902, 2622: 7.0902, 3135: 7.0902, 3404: 7.0902}))

In [29]:
print(df["BinaryNumTarget"].value_counts())


BinaryNumTarget
0    5168
1     832
Name: count, dtype: int64


4) Logistic Regression using Spark MLlib

In [35]:
import spacy

nlp = spacy.load("en_core_web_sm")

import pandas as pd

def spacy_preprocess(text):
    doc = nlp(text)
    # Keep lemmatized tokens that are not stop words or punctuations
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Example usage
df["statement"] = df["statement"].apply(spacy_preprocess)

# Load dataset
df = pd.read_csv("politifact_dataset.csv")

# Drop unwanted and null rows
df = df.drop(columns=[c for c in df.columns if c.lower().startswith("unnamed")], errors='ignore')
df = df.dropna(subset=["statement", "BinaryNumTarget"])

# Convert labels to int (if needed)
df["BinaryNumTarget"] = df["BinaryNumTarget"].astype(int)

from sklearn.feature_extraction.text import TfidfVectorizer

X = df["statement"]
y = df["BinaryNumTarget"]

vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")
X_tfidf = vectorizer.fit_transform(X)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts().to_dict())
print("After SMOTE:", dict(zip(*np.unique(y_train_bal, return_counts=True))))

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

lr = LogisticRegression(max_iter=200)
lr.fit(X_train_bal, y_train_bal)

# Predict
y_pred = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)[:, 1]

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Before SMOTE: {0: 4134, 1: 666}
After SMOTE: {0: 4134, 1: 4134}
Accuracy: 0.9816666666666667
ROC-AUC: 0.9952051921418751

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99      1034
           1       0.89      0.99      0.94       166

    accuracy                           0.98      1200
   macro avg       0.94      0.98      0.96      1200
weighted avg       0.98      0.98      0.98      1200



5) Decision Tree on statement & rating (Spark)

In [36]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=10, random_state=42)

# SVM
from sklearn.svm import LinearSVC
model = LinearSVC()

# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

model.fit(X_train_bal, y_train_bal)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9366666666666666


6) Support Vector Machine (LinearSVC) in Spark MLlib

In [37]:
from sklearn.svm import LinearSVC

svc = LinearSVC()
svc.fit(X_train_bal, y_train_bal)

y_pred_svc = svc.predict(X_test)

print("🔹 Support Vector Machine Results")
print("Accuracy:", accuracy_score(y_test, y_pred_svc))
print(classification_report(y_test, y_pred_svc))


🔹 Support Vector Machine Results
Accuracy: 0.9975
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1034
           1       0.98      1.00      0.99       166

    accuracy                           1.00      1200
   macro avg       0.99      1.00      0.99      1200
weighted avg       1.00      1.00      1.00      1200





7) Naive Bayes (Spark MLlib)

NaiveBayes expects non-negative features — CountVectorizer raw counts are fine (so use rawFeatures instead of IDF features for NB).

In [38]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train_bal, y_train_bal)

y_pred_nb = nb.predict(X_test)

print("🔹 Naive Bayes Results")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))


🔹 Naive Bayes Results
Accuracy: 0.9366666666666666
              precision    recall  f1-score   support

           0       1.00      0.93      0.96      1034
           1       0.69      0.99      0.81       166

    accuracy                           0.94      1200
   macro avg       0.84      0.96      0.89      1200
weighted avg       0.96      0.94      0.94      1200



8) KMeans clustering — cluster true vs false

In [39]:
from sklearn.cluster import KMeans
import numpy as np

# Cluster using 2 clusters (true/false)
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
kmeans.fit(X_tfidf)

# Get cluster labels
cluster_labels = kmeans.labels_

# Compare clusters vs true labels
df["cluster"] = cluster_labels

print("🔹 K-Means Clustering Results")
print(df.groupby(["cluster", "BinaryNumTarget"]).size().unstack(fill_value=0))


🔹 K-Means Clustering Results
BinaryNumTarget     0    1
cluster                   
0                 556   44
1                4612  788


In [40]:
import joblib
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
joblib.dump(lr, "logistic_regression_model.joblib")
joblib.dump(nb, "naive_bayes_model.joblib")
joblib.dump(svc, "svc_model.joblib")
joblib.dump(kmeans, "kmeans_model.joblib")


['kmeans_model.joblib']

9) Create DataFrame in Colab, convert to SQL table, and query

Using sqlite3 or duckdb in Colab is easiest.

sqlite3 example

In [None]:
import sqlite3
conn = sqlite3.connect('/content/politifact.db')
df.to_sql('politifact', conn, if_exists='replace', index=False)

# Query
q = "SELECT target, COUNT(*) as cnt FROM politifact GROUP BY target ORDER BY cnt DESC;"
pd.read_sql(q, conn)


In [None]:

# pip install duckdb
import duckdb
duckdb.sql("CREATE TABLE politifact AS SELECT * FROM read_csv_auto('polifact_full.csv')")
duckdb.sql("SELECT target, count(*) FROM politifact GROUP BY target").df()
