In [21]:
from pathlib import Path
import os

# Orientation cell: show where the kernel is running and what surrounds it.
cwd = Path.cwd()
print("CWD =", cwd)

# 1) List what is inside the current folder
print("\n--- Contents of CWD ---")
for entry in sorted(cwd.iterdir()):
    print(entry.name)

# 2) If we are inside notebooks/, also list the parent (project root)
parent = cwd.parent
print("\nPARENT =", parent)

print("\n--- Contents of PARENT ---")
for entry in sorted(parent.iterdir()):
    print(entry.name)

# 3) Locate "data/fakeddit/images" by walking upwards from the CWD.
#    `cwd.parents` runs all the way to the filesystem root, so the root
#    itself is checked as well.
found = None
for base in (cwd, *cwd.parents):
    candidate = base / "data" / "fakeddit" / "images"
    if candidate.is_dir():  # must be a directory: it is iterated below
        found = candidate
        break

print("\nFOUND images folder =", found)
if found is not None:
    # iterdir() yields entries in arbitrary order; sort so "First 10" is stable.
    items = sorted(found.iterdir())
    print("Total items in images folder:", len(items))
    print("First 10:")
    for item in items[:10]:
        print(" -", item.name)


CWD = c:\Users\guyga\My Drive\MBD_Multimodal_Misinformation\notebooks

--- Contents of CWD ---
MBD_Technical_Group.ipynb

PARENT = c:\Users\guyga\My Drive\MBD_Multimodal_Misinformation

--- Contents of PARENT ---
data
demo
models
notebooks
outputs
README.md
requirements.txt
src

FOUND images folder = c:\Users\guyga\My Drive\MBD_Multimodal_Misinformation\data\fakeddit\images
Total items in images folder: 10025
First 10:
 - 100jh1.jpg
 - 100k2z.jpg
 - 100qm6.jpg
 - 100ur8.jpg
 - 102xfq.jpg
 - 1054hk.jpg
 - 10603q.jpg
 - 1069x4.jpg
 - 1098ia.jpg
 - 10aasu.jpg


In [22]:
from pathlib import Path
import os

# The kernel runs inside notebooks/; the project root is its parent directory.
PROJECT_ROOT = Path.cwd().parent

# Locations of the Fakeddit artifacts under <root>/data/fakeddit.
IMAGES_DIR = PROJECT_ROOT.joinpath("data", "fakeddit", "images")
MANIFEST_PATH = PROJECT_ROOT.joinpath("data", "fakeddit", "downloaded_image_ids.csv")
BASE_TSV = PROJECT_ROOT.joinpath("data", "fakeddit", "multimodal_train.tsv")
PARQUET_PATH = PROJECT_ROOT.joinpath("data", "fakeddit", "df_clip_embeddings.parquet")

# Report what was resolved and which artifacts are present on disk.
print(f"PROJECT_ROOT = {PROJECT_ROOT}")
print(f"IMAGES_DIR = {IMAGES_DIR} exists: {IMAGES_DIR.exists()}")
image_dir_entries = list(IMAGES_DIR.glob("*"))
print(f"Num files in IMAGES_DIR = {len(image_dir_entries)}")
print(f"BASE_TSV exists: {BASE_TSV.exists()}")
print(f"PARQUET exists: {PARQUET_PATH.exists()}")


PROJECT_ROOT = c:\Users\guyga\My Drive\MBD_Multimodal_Misinformation
IMAGES_DIR = c:\Users\guyga\My Drive\MBD_Multimodal_Misinformation\data\fakeddit\images exists: True
Num files in IMAGES_DIR = 10025
BASE_TSV exists: True
PARQUET exists: True


In [23]:
import pandas as pd

# Rebuild the manifest of downloaded images from the files present on disk.
# - only regular files with an image suffix count (a directory named "x.jpg" does not)
# - stems are de-duplicated (e.g. "abc.jpg" and "abc.png" give one id)
# - the result is sorted so the CSV is identical from run to run, because
#   iterdir() yields entries in arbitrary order
image_ids = sorted({
    path.stem
    for path in IMAGES_DIR.iterdir()
    if path.is_file() and path.suffix.lower() in {".jpg", ".jpeg", ".png"}
})

print("Recovered image count:", len(image_ids))

pd.DataFrame({"id": image_ids}).to_csv(MANIFEST_PATH, index=False)
print("✔ Saved manifest at:", MANIFEST_PATH)


Recovered image count: 10025
✔ Saved manifest at: c:\Users\guyga\My Drive\MBD_Multimodal_Misinformation\data\fakeddit\downloaded_image_ids.csv


In [24]:
!pip install clip
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from PIL import Image

import torch
import torch.nn.functional as F
import clip
from pathlib import Path





[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [25]:


# Recursively search the project tree (parent of notebooks/) for the training TSV.
PROJECT_ROOT = Path.cwd().parent
matches = [*PROJECT_ROOT.rglob("multimodal_train.tsv")]

print(f"Found: {len(matches)}")
for tsv_path in matches:
    print(tsv_path)


Found: 1
c:\Users\guyga\My Drive\MBD_Multimodal_Misinformation\data\fakeddit\multimodal_train.tsv


In [26]:


# Use the first TSV returned by the search; fail with an actionable message
# instead of a bare IndexError when the search came back empty.
if not matches:
    raise FileNotFoundError(
        "multimodal_train.tsv was not found under the project root; "
        "place the Fakeddit TSV in data/fakeddit/ and re-run the search cell."
    )

BASE_TSV = matches[0]
print("Using BASE_TSV:", BASE_TSV)

# The file is tab-separated.
df_raw = pd.read_csv(BASE_TSV, sep="\t")
print("df_raw shape:", df_raw.shape)


Using BASE_TSV: c:\Users\guyga\My Drive\MBD_Multimodal_Misinformation\data\fakeddit\multimodal_train.tsv
df_raw shape: (564000, 16)


In [27]:

# The kernel runs in .../notebooks, so the project root is one level up.
PROJECT_ROOT = Path.cwd().parent

# Local (non-Colab) location of the multimodal-only training TSV.
BASE_TSV = PROJECT_ROOT.joinpath("data", "fakeddit", "multimodal_train.tsv")

print(f"Using BASE_TSV: {BASE_TSV}")
df_raw = pd.read_csv(BASE_TSV, sep="\t")
print(f"df_raw shape (FULL TSV): {df_raw.shape}")

# Retain only rows that carry both a title and an image URL.
df_raw = (
    df_raw
    .dropna(subset=["clean_title", "image_url"])
    .reset_index(drop=True)
)

print(f"df_raw shape after filtering text+image: {df_raw.shape}")

# Draw the 15k working subset with a fixed seed so it is reproducible.
sampled_rows = df_raw.sample(n=15_000, random_state=42)
df_15k = sampled_rows.reset_index(drop=True)
print(f"df_15k shape (WORKING SET): {df_15k.shape}")


Using BASE_TSV: c:\Users\guyga\My Drive\MBD_Multimodal_Misinformation\data\fakeddit\multimodal_train.tsv
df_raw shape (FULL TSV): (564000, 16)
df_raw shape after filtering text+image: (562466, 16)
df_15k shape (WORKING SET): (15000, 16)


In [28]:

# The kernel runs inside notebooks/, so the project root is one level up.
PROJECT_ROOT = Path.cwd().parent

# Every Fakeddit artifact used below sits under <root>/data/fakeddit.
BASE_DIR = PROJECT_ROOT.joinpath("data", "fakeddit")

IMAGES_DIR = BASE_DIR.joinpath("images")
MANIFEST_PATH = BASE_DIR.joinpath("downloaded_image_ids.csv")
TRAIN_TSV = BASE_DIR.joinpath("multimodal_train.tsv")

# Confirm the expected inputs are present before continuing.
print(f"PROJECT_ROOT: {PROJECT_ROOT}")
print(f"Images dir exists: {IMAGES_DIR.exists()}")
print(f"Manifest exists: {MANIFEST_PATH.exists()}")
print(f"Train TSV exists: {TRAIN_TSV.exists()}")


PROJECT_ROOT: c:\Users\guyga\My Drive\MBD_Multimodal_Misinformation
Images dir exists: True
Manifest exists: True
Train TSV exists: True


In [29]:
# Restrict the 15k working set to rows whose id appears in the saved manifest.
manifest = pd.read_csv(MANIFEST_PATH)
downloaded_ids = set(manifest["id"])

has_downloaded_image = df_15k["id"].isin(downloaded_ids)
df_ready = df_15k.loc[has_downloaded_image].reset_index(drop=True)

print(f"df_ready shape: {df_ready.shape}")


df_ready shape: (10025, 16)


In [30]:
# Summarize df_ready: per-column dtype and non-null count.
df_ready.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10025 entries, 0 to 10024
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   author                9505 non-null   object 
 1   clean_title           10025 non-null  object 
 2   created_utc           10025 non-null  float64
 3   domain                9959 non-null   object 
 4   hasImage              10025 non-null  bool   
 5   id                    10025 non-null  object 
 6   image_url             10025 non-null  object 
 7   linked_submission_id  66 non-null     object 
 8   num_comments          9959 non-null   float64
 9   score                 10025 non-null  int64  
 10  subreddit             10025 non-null  object 
 11  title                 10025 non-null  object 
 12  upvote_ratio          9959 non-null   float64
 13  2_way_label           10025 non-null  int64  
 14  3_way_label           10025 non-null  int64  
 15  6_way_label        

# Data Preprocessing

In [31]:

import re
# df_ready (~10,025 rows) must already exist from the earlier cells.
print(f"Input shape: {df_ready.shape}")

# --- 1. Keep only the columns needed downstream; expose the 2-way target as "label" ---
needed_columns = ["id", "clean_title", "2_way_label"]
df = (
    df_ready
    .loc[:, needed_columns]
    .rename(columns={"2_way_label": "label"})
)

# --- 2. Basic text cleaning (CLIP-safe) ---
def clean_text(text):
    """Normalize a title string: lowercase, drop URLs and punctuation, tidy whitespace.

    Args:
        text: Raw title string.

    Returns:
        The lowercased text with ``http...`` tokens and every character that is
        neither a word character nor whitespace removed, runs of whitespace
        collapsed to a single space, and no leading or trailing whitespace.
    """
    text = text.lower()
    text = re.sub(r"http\S+", "", text)        # remove URLs
    text = re.sub(r"[^\w\s]", "", text)        # remove punctuation
    # Tidy whitespace last: deleting URLs or punctuation can leave doubled or
    # dangling spaces, so normalizing before those steps would not hold.
    return re.sub(r"\s+", " ", text).strip()

df["text"] = df["clean_title"].map(clean_text)

# --- 3. Discard texts shorter than 5 characters (this includes empty ones) ---
df["text_len"] = df["text"].map(len)

is_long_enough = df["text_len"] >= 5
df = df.loc[is_long_enough].reset_index(drop=True)

# --- 4. Sanity checks: none of the key columns may contain nulls ---
for required_col in ("id", "text", "label"):
    assert df[required_col].notna().all()

print(f"Final CLIP-ready shape: {df.shape}")
print(df.head())
df

Input shape: (10025, 16)
Final CLIP-ready shape: (9979, 5)
       id                                  clean_title  label  \
0  2cdqb2   a terrifying and chocolatey cookie monster      0   
1  axhmr4  the synagogue cared for by muslims bbc reel      1   
2  b25ywe                     the flight of this plane      1   
3  2umqj8         mr butterface and his little brother      0   
4  1kog6d         this pylon looks like a sleeping owl      0   

                                          text  text_len  
0   a terrifying and chocolatey cookie monster        42  
1  the synagogue cared for by muslims bbc reel        43  
2                     the flight of this plane        24  
3         mr butterface and his little brother        36  
4         this pylon looks like a sleeping owl        36  


Unnamed: 0,id,clean_title,label,text,text_len
0,2cdqb2,a terrifying and chocolatey cookie monster,0,a terrifying and chocolatey cookie monster,42
1,axhmr4,the synagogue cared for by muslims bbc reel,1,the synagogue cared for by muslims bbc reel,43
2,b25ywe,the flight of this plane,1,the flight of this plane,24
3,2umqj8,mr butterface and his little brother,0,mr butterface and his little brother,36
4,1kog6d,this pylon looks like a sleeping owl,0,this pylon looks like a sleeping owl,36
...,...,...,...,...,...
9974,7h0wdm,girl licking a big chocolate popsicle,0,girl licking a big chocolate popsicle,37
9975,djfs8z,the pattern the grass made,1,the pattern the grass made,26
9976,5l3r3b,burnt up penis experiencing necrose nsfw,0,burnt up penis experiencing necrose nsfw,40
9977,ba7jmf,this supermarket built into the old city theat...,1,this supermarket built into the old city theat...,72


In [32]:


# The kernel runs inside notebooks/; resolve the project root one level up.
PROJECT_ROOT = Path.cwd().parent

# Embeddings file stored next to the rest of the Fakeddit data.
PARQUET_PATH = PROJECT_ROOT.joinpath("data", "fakeddit", "df_clip_embeddings.parquet")

print(f"Using PARQUET_PATH: {PARQUET_PATH}")
print(f"Exists: {PARQUET_PATH.exists()}")

df_emb = pd.read_parquet(PARQUET_PATH)

print(f"Loaded embeddings shape: {df_emb.shape}")


Using PARQUET_PATH: c:\Users\guyga\My Drive\MBD_Multimodal_Misinformation\data\fakeddit\df_clip_embeddings.parquet
Exists: True
Loaded embeddings shape: (9979, 4)


# Build features

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

# Stack the per-row embedding vectors into 2-D arrays (one row per sample).
img_embs = np.vstack(df_emb["image_emb"].values)
txt_embs = np.vstack(df_emb["text_emb"].values)

# 1) Cosine similarity between each image and its own text, computed for all
#    rows in a single vectorized pass rather than one library call per row.
pair_norms = np.linalg.norm(img_embs, axis=1) * np.linalg.norm(txt_embs, axis=1)
# A zero vector has no direction; dividing by 1 instead yields similarity 0
# and avoids a division-by-zero NaN.
pair_norms[pair_norms == 0] = 1.0
cos_sim = (np.sum(img_embs * txt_embs, axis=1) / pair_norms).reshape(-1, 1)

# 2) Element-wise absolute difference
abs_diff = np.abs(img_embs - txt_embs)

# 3) Concatenation of both embeddings
concat = np.hstack([img_embs, txt_embs])

# Final feature matrix: [cosine | abs diff | image emb | text emb]
X = np.hstack([cos_sim, abs_diff, concat])
y = df_emb["label"].values

print("Feature matrix shape:", X.shape)


Feature matrix shape: (9979, 1537)


# Train / test split

In [34]:
from sklearn.model_selection import train_test_split

# Stratified 75/25 split with a fixed seed; stratifying on y keeps the class
# proportions the same in both parts.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"Train: {X_train.shape} Test: {X_test.shape}")


Train: (7484, 1537) Test: (2495, 1537)


# Model Training

In [35]:
from sklearn.linear_model import LogisticRegression

# Hyperparameters gathered in one place; class_weight="balanced" reweights
# classes inversely to their frequency.
logreg_params = dict(max_iter=2000, n_jobs=-1, class_weight="balanced")
clf = LogisticRegression(**logreg_params)

# fit() returns the estimator itself, so the fitted model is the cell's output.
clf.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,2000


# Model Evaluation

In [36]:
from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    confusion_matrix
)

# Score the held-out split. Column index 1 of predict_proba corresponds to
# clf.classes_[1], which is what roc_auc_score needs as the positive-class score.
test_probabilities = clf.predict_proba(X_test)
y_probs = test_probabilities[:, 1]
y_pred = clf.predict(X_test)

auc = roc_auc_score(y_test, y_probs)
print("ROC-AUC:", auc)
print("\nClassification report:\n")
report = classification_report(y_test, y_pred)
print(report)


ROC-AUC: 0.9237417820564916

Classification report:

              precision    recall  f1-score   support

           0       0.82      0.84      0.83      1104
           1       0.87      0.85      0.86      1391

    accuracy                           0.85      2495
   macro avg       0.84      0.85      0.85      2495
weighted avg       0.85      0.85      0.85      2495



# Saving the trained classifier

In [37]:
from pathlib import Path
import joblib

# The kernel runs inside notebooks/, so the project root is one level up.
PROJECT_ROOT = Path.cwd().parent

# Make sure <root>/demo exists; the classifier is stored there.
DEMO_DIR = PROJECT_ROOT.joinpath("demo")
DEMO_DIR.mkdir(parents=True, exist_ok=True)

MODEL_PATH = DEMO_DIR.joinpath("model.pkl")

# Serialize the fitted classifier to disk.
joblib.dump(clf, MODEL_PATH)

print(f"✔ Model saved at: {MODEL_PATH}")


✔ Model saved at: c:\Users\guyga\My Drive\MBD_Multimodal_Misinformation\demo\model.pkl


In [38]:
from pathlib import Path

# Print an indented tree of everything below the notebook's working directory.
root = Path(".").resolve()
print(f"Root: {root}\n")

for entry in sorted(root.rglob("*")):
    # Entries directly under root have one relative part, hence no indent.
    nesting = len(entry.relative_to(root).parts) - 1
    kind_tag = "[D]" if entry.is_dir() else "[F]"
    print("    " * nesting + f"{kind_tag} {entry.name}")


Root: C:\Users\guyga\My Drive\MBD_Multimodal_Misinformation\notebooks

[F] MBD_Technical_Group.ipynb


# Streamlit Deployment