In [1]:
import pandas as pd
import numpy as np
import re
import random

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
gen_path = "/content/drive/MyDrive/Grad Project/final_dataset(csv).csv"

news_path = "/content/drive/MyDrive/Grad Project/True.csv"

df_news = pd.read_csv(news_path, encoding="utf-8")
df_news.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
df_news=df_news.drop(['subject','date','title'],axis=1)
df_news.head()

Unnamed: 0,text
0,WASHINGTON (Reuters) - The head of a conservat...
1,WASHINGTON (Reuters) - Transgender people will...
2,WASHINGTON (Reuters) - The special counsel inv...
3,WASHINGTON (Reuters) - Trump campaign adviser ...
4,SEATTLE/WASHINGTON (Reuters) - President Donal...


# Cleaning

In [6]:
# ---------- Basic text cleanup ----------
def clean_text(t):
    t = str(t)
    t = t.encode("utf-8", "ignore").decode("utf-8", "ignore")
    t = re.sub(r"Â", "", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

df_news["text"] = df_news["text"].apply(clean_text)

# ---------- Claim cleaning ----------
def clean_claim_text(text):
    text = re.sub(r"\b(Reuters|Associated Press|AP|BBC|CNN|Bloomberg|AFP)\b", "", text, flags=re.I)
    text = re.sub(r"^[A-Z\s/]+?\(\)\s*-\s*", "", text, flags=re.I)
    text = re.sub(r"^.*?-\s*", "", text, flags=re.I)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# ---------- Supported claim cleaning ----------
def clean_supported_text(text):
    text = re.sub(r"^[A-Z\s/]+?\(\)\s*-\s*", "", text, flags=re.I)
    text = re.sub(r"\s+", " ", text).strip()
    return text


# Applying

In [7]:
# ---------- Split, clean, and label ----------
df_news = df_news.sample(frac=1, random_state=42).reset_index(drop=True)
split_index = len(df_news) // 2

df_claims = df_news.iloc[:split_index].copy()
df_supported = df_news.iloc[split_index:].copy()

df_claims["text"] = df_claims["text"].apply(clean_claim_text)
df_supported["text"] = df_supported["text"].apply(clean_supported_text)

df_claims["label"] = "claim"
df_supported["label"] = "Supported claim"

df_final = pd.concat([df_claims, df_supported], ignore_index=True)
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)


In [8]:
df_final.head()

Unnamed: 0,text,label
0,TBILISI (Reuters) - A former Islamic State fig...,Supported claim
1,U.S. Senate Democratic Leader Chuck Schumer sa...,claim
2,"home to sites holy to the Muslim, Jewish and C...",claim
3,If the United States leaves the Iran nuclear d...,claim
4,(Reuters) - U.S. President Donald Trump will t...,Supported claim


# Fixes

In [9]:
# ---------- Rename columns ----------
df_final = df_final.rename(columns={
    "text": "video_transcription_text",
    "label": "claim_status"
})

# ---------- TikTok-style phrasing ----------
casual_refs = [
    "According to {}",
    "As reported by {}",
    "{} said that",
    "{} mentioned that",
    "{} stated that",
    "Based on a post from {}",
    "A recent update from {} claims that",
    "{} published an article saying",
    "A report from {} revealed that"
]

def make_source_casual(text):
    match = re.search(r"\b(Reuters|BBC|CNN|Bloomberg|AFP|Associated Press|AP)\b", text, flags=re.I)
    if match:
        source = match.group(0)
        phrase = random.choice(casual_refs).format(source)
        text = re.sub(r"\(?\b" + source + r"\b\)?\s*-\s*", "", text, flags=re.I)
        text = phrase + " " + text.strip()
    return text.strip()

In [10]:
df_final.loc[df_final["claim_status"] == "Supported claim", "video_transcription_text"] = (
    df_final.loc[df_final["claim_status"] == "Supported claim", "video_transcription_text"]
    .apply(make_source_casual)
)

In [11]:
df_final

Unnamed: 0,video_transcription_text,claim_status
0,As reported by Reuters TBILISI A former Islami...,Supported claim
1,U.S. Senate Democratic Leader Chuck Schumer sa...,claim
2,"home to sites holy to the Muslim, Jewish and C...",claim
3,If the United States leaves the Iran nuclear d...,claim
4,A report from Reuters revealed that U.S. Presi...,Supported claim
...,...,...
21412,Reuters stated that LONDON Britain is preparin...,Supported claim
21413,Reuters stated that WASHINGTON U.S. President-...,Supported claim
21414,Special Counsel Robert Mueller has issued gran...,claim
21415,"majority countries and a ban on refugees, but ...",claim


# Merging

In [12]:
df_news=df_final
df_main=pd.read_csv(gen_path,encoding="latin-1")
df_main=df_main.drop(['video_id'],axis=1)
df_main.head()

Unnamed: 0,claim_status,video_transcription_text
0,claim,someone shared with me that drone deliveries a...
1,claim,someone shared with me that avocados never rip...
2,claim,someone shared with me that people are born wi...
3,claim,someone shared with me that sneezing while tra...
4,claim,someone shared with me that people donât sne...


In [13]:
#  column order
df_main = df_main[["claim_status", "video_transcription_text"]]
df_news = df_news[["claim_status", "video_transcription_text"]]

# ------------------ MERGE + SHUFFLE ------------------
df_merged = pd.concat([df_main, df_news], ignore_index=True)
df_merged = df_merged.sample(frac=1, random_state=42).reset_index(drop=True)



output_path = "/content/drive/MyDrive/Grad Project/data_merged.csv"
df_merged.to_csv(output_path,index=False, encoding="utf-8")


In [14]:
df_merged

Unnamed: 0,claim_status,video_transcription_text
0,opinion,my colleagues are willing to say that one-thir...
1,Supported claim,A report from Reuters revealed that WASHINGTON...
2,Supported claim,Reuters said that WASHINGTON U.S. House of Rep...
3,Supported claim,US Census Bureau confirmed that found that soc...
4,claim,which abuts the Sahara to the north and has be...
...,...,...
47271,opinion,my sentiment is that a blue whale's heartbeat ...
47272,claim,The United States informed Germany shortly bef...
47273,Supported claim,Reuters stated that British police said they h...
47274,claim,The shocking reality is the netherlands is hom...


# tts

In [15]:
import pandas as pd
import re, random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# ---------------- CONFIG ----------------
path = "/content/drive/MyDrive/Grad Project/data_merged.csv"
output_path = "/content/drive/MyDrive/Grad Project/final_dataset_balanced.csv"
NUM_CLUSTERS = 10        # detect ~10 topic clusters
TARGET_SAMPLES = 2500    # desired samples per topic


In [22]:

df = pd.read_csv(path, encoding="utf-8")

# Rename columns if needed
if "video_transcription_text" not in df.columns:
    df.rename(columns={"text": "video_transcription_text", "label": "claim_status"}, inplace=True)

# --- Fix NaNs and blanks ---
df["video_transcription_text"] = df["video_transcription_text"].astype(str)
df["video_transcription_text"] = df["video_transcription_text"].fillna("")
df = df[df["video_transcription_text"].str.strip() != ""].reset_index(drop=True)

#TF-IDF TOPIC DETECTION
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X = vectorizer.fit_transform(df["video_transcription_text"])

kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=42, n_init=10)
df["topic_cluster"] = kmeans.fit_predict(X)

# Display top words for each cluster
print("\n Top keywords per detected topic:")
terms = vectorizer.get_feature_names_out()
for i in range(NUM_CLUSTERS):
    top_terms = [terms[idx] for idx in kmeans.cluster_centers_[i].argsort()[-10:][::-1]]
    print(f"Topic {i+1}: {', '.join(top_terms)}")




 Top keywords per detected topic:
Topic 1: sounds, real, dolphins, mammals, think, highkey, honest, imo, correct, fake
Topic 2: friends, willing, view, say, world, earth, convinced, understanding, impression, understand
Topic 3: trump, said, republican, house, president, clinton, white, senate, campaign, donald
Topic 4: family, willing, wager, bet, view, world, colleagues, understands, say, impression
Topic 5: said, government, state, reuters, president, people, year, party, minister, united
Topic 6: colleagues, earth, world, opinion, believe, view, moon, feel, don, humans
Topic 7: reported, confirmed, stated, highlighted, global, university, research, emphasized, study, accelerating
Topic 8: korea, north, korean, nuclear, said, china, south, missile, trump, pyongyang
Topic 9: discovered, read, claim, colleague, friend, claiming, mentioning, news, tv, media
Topic 10: learned, colleague, claim, media, website, discussion, news, board, internet, social


In [21]:
print(df.shape)

(47276, 4)


In [26]:
# ---------------- SYNTHETIC DATA GENERATION ----------------

topic_templates = {
    "health": [
        ("claim", "drinking lemon water helps you lose fat fast"),
        ("Supported claim", "According to the World Health Organization (WHO), staying hydrated supports normal metabolism"),
        ("Supported claim", "A Harvard Health report confirmed that consistent sleep improves immune strength"),
        ("opinion", "I personally think lemon water is just refreshing, not magic 😅"),
        ("claim", "skipping breakfast boosts focus")
    ],
    "beauty": [
        ("claim", "Everyone’s talking about this new skincare trend with ice cubes"),
        ("Supported claim", "Dermatologists from the American Academy of Dermatology (AAD) stated that cold therapy helps reduce puffiness"),
        ("Supported claim", "A 2024 Vogue Beauty article confirmed sunscreen remains the best anti-aging product"),
        ("opinion", "I tried the frozen spoon trick — it actually works! "),
        ("claim", " rice water makes hair grow faster")
    ],
    "fitness": [
        ("claim", "10k steps a day can replace gym workouts"),
        ("Supported claim", "The Mayo Clinic confirmed that walking daily lowers cardiovascular risks"),
        ("Supported claim", "The American Heart Association reported that morning cardio can boost endurance"),
        ("opinion", "Walking’s the easiest workout ever — just vibes and steps "),
        ("claim", " fasting workouts burn more fat")
    ],
    "sports": [
        ("claim", "Fans said Messi’s free kick last night broke a record"),
        ("Supported claim", "ESPN confirmed Lionel Messi became the top international goal scorer in 2024"),
        ("Supported claim", "BBC Sport reported that the Paris 2024 Olympics added breakdancing as an official sport"),
        ("opinion", "That goal was unreal!  Still can’t believe it"),
        ("claim", "female athletes are breaking more world records this year")
    ],
    "tech": [
        ("claim", "ChatGPT will replace Google completely"),
        ("Supported claim", "Stanford University researchers confirmed AI tools outperform traditional search engines in reasoning tasks"),
        ("Supported claim", "Reuters reported that OpenAI’s GPT-5 aims to enhance multimodal capabilities by 2025"),
        ("opinion", "AI is powerful but still can’t replace creativity"),
        ("claim", " smartphones are always listening")
    ],
    "finance": [
        ("claim", "I heard you can make $500 a day flipping phones online"),
        ("Supported claim", "A 2024 Forbes article confirmed that reselling refurbished devices can yield high profits"),
        ("Supported claim", "The Wall Street Journal reported that compound interest drives long-term wealth growth"),
        ("opinion", "Flipping phones sounds stressful but tempting "),
        ("claim", "investing ten dollars a week can make you rich one day")
    ],
    "entertainment": [
        ("claim", " Taylor Swift hinted at a new album again"),
        ("Supported claim", "Billboard confirmed that her label recently registered two unreleased song titles"),
        ("Supported claim", "Variety reported that Marvel reshot a movie ending after fan backlash"),
        ("opinion", "She’s definitely teasing us again "),
        ("claim", " the Oscars results were leaked early")
    ],
    "environment": [
        ("claim", "climate change is accelerating faster than expected"),
        ("Supported claim", "NASA confirmed that 2024 was among the hottest years ever recorded"),
        ("Supported claim", "According to National Geographic, ocean levels are rising faster than predicted"),
        ("opinion", "Honestly, I think everyone can still make a difference "),
        ("claim", "recycling doesn’t actually help much")
    ]
}

topic_names = list(topic_templates.keys())
df["topic_label"] = df["topic_cluster"].apply(lambda x: random.choice(topic_names))

# ---------------- BALANCE BY TOPIC ----------------
topic_counts = df["topic_label"].value_counts().to_dict()
new_rows = []

for topic, count in topic_counts.items():
    if count < TARGET_SAMPLES:
        needed = TARGET_SAMPLES - count
        samples = random.choices(topic_templates[topic], k=needed)
        for label, text in samples:
            new_rows.append({
                "claim_status": label,
                "video_transcription_text": text,
                "topic_label": topic,
                "topic_cluster": -1
            })

df_generated = pd.DataFrame(new_rows)
df_balanced = pd.concat([df, df_generated], ignore_index=True)
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)


Balanced dataset created!
topic_label
environment      6027
tech             5975
sports           5924
beauty           5907
fitness          5894
entertainment    5875
health           5841
finance          5833
Name: count, dtype: int64

Saved as /content/drive/MyDrive/Grad Project/final_dataset_balanced.csv


In [27]:
df_balanced.head()

Unnamed: 0,claim_status,video_transcription_text,topic_cluster,topic_label
0,opinion,my colleagues' thinking is that yellowstone na...,5,finance
1,opinion,my family thinks that sneezing while traveling...,3,tech
2,Supported claim,Research from Al Jazeera highlighted that repo...,6,beauty
3,claim,"whose district spans 800 miles (1,290 km) of t...",2,sports
4,claim,"midnight counterattack on Twitter, Trump said ...",2,beauty


In [28]:
print(df_balanced.shape)

(47276, 4)
