In [None]:
import random
import pandas as pd

# Supported Claim

In [None]:
sources = [
    # Health & Science
    "World Health Organization (WHO)",
    "Centers for Disease Control and Prevention (CDC)",
    "National Institutes of Health (NIH)",
    "European Medicines Agency (EMA)",
    "Food and Drug Administration (FDA)",
    "Mayo Clinic",
    "Cleveland Clinic",
    "The Lancet",
    "Nature",
    "Science",
    "BMJ (British Medical Journal)",

    # Global & Political
    "United Nations (UN)",
    "UNICEF",
    "UNESCO",
    "World Bank",
    "International Monetary Fund (IMF)",
    "European Union (EU)",
    "African Union (AU)",
    "OECD",
    "World Economic Forum (WEF)",

    # Statistics & Data
    "Pew Research Center",
    "Gallup",
    "Statista",
    "US Census Bureau",
    "UK Office for National Statistics (ONS)",
    "Jordan Department of Statistics (DOS)",

    # News & Media
    "Reuters",
    "Associated Press (AP)",
    "BBC News",
    "The Guardian",
    "Al Jazeera",
    "The New York Times",
    "The Washington Post",

    # Education & Research
    "Harvard University",
    "MIT",
    "Stanford University",
    "Oxford University",
    "Cambridge University",
    "Yale University",
    "Columbia University",
    "University of Tokyo",
    "ETH Zurich",
    "Max Planck Institute"
]

# ---------- FACT SNIPPETS ----------
facts = [
    "reported that global obesity rates have tripled since 1975",
    "confirmed that climate change is accelerating faster than expected",
    "found that regular exercise reduces the risk of heart disease by 30%",
    "stated that nearly 700 million people live in extreme poverty",
    "reported that renewable energy investments reached record highs in 2023",
    "highlighted that over 90% of the world’s population breathes polluted air",
    "found that students who sleep at least 8 hours score higher on exams",
    "confirmed that vaccines have prevented millions of deaths worldwide",
    "reported that global unemployment decreased slightly last year",
    "stated that mental health disorders affect 1 in 4 people globally",
    "found that social media use among teens exceeds 3 hours per day",
    "reported that sea levels are rising at an unprecedented rate",
    "confirmed that artificial intelligence adoption is growing rapidly",
    "stated that education access has improved in developing countries",
    "found that diabetes cases have increased dramatically in the last decade",
    "reported that global literacy rates continue to rise",
    "confirmed that biodiversity loss is accelerating worldwide",
    "stated that inflation reached its highest levels in decades",
    "reported that women’s participation in the workforce is steadily increasing",
    "highlighted that renewable energy now makes up 30% of global electricity"
]


In [None]:
templates = [
    "According to {source}, {fact}.",
    "{source} reported that {fact}.",
    "A study by {source} found that {fact}.",
    "{source} confirmed that {fact}.",
    "{source} stated that {fact}.",
    "Research from {source} highlighted that {fact}.",
    "In its latest report, {source} revealed that {fact}.",
    "{source} emphasized that {fact}."
]


In [None]:
# ---------- DATA GENERATION ----------
records = []
num_records = 7000

for _ in range(num_records):
    source = random.choice(sources)
    fact = random.choice(facts)
    template = random.choice(templates)
    text = template.format(source=source, fact=fact)
    records.append({"transcription_text": text, "label": "supported claim"})

In [None]:
df = pd.DataFrame(records)
df.head()

Unnamed: 0,transcription_text,label
0,"In its latest report, MIT revealed that report...",supported claim
1,According to National Institutes of Health (NI...,supported claim
2,UNICEF emphasized that stated that inflation r...,supported claim
3,Reuters reported that reported that sea levels...,supported claim
4,UK Office for National Statistics (ONS) report...,supported claim


In [None]:

num_unique_rows = df.drop_duplicates().shape[0]
print(f"Number of unique rows: {num_unique_rows}")

Number of unique rows: 4397


In [None]:
df=df.drop_duplicates()

In [None]:
df.to_csv("supported_claims_dataset.csv", index=False, encoding="utf-8")

print("Dataset generated with", num_records, "records")
print(df.sample(10))

Dataset generated with 7000 records
                                     transcription_text            label
645   Research from The New York Times highlighted t...  supported claim
2500  UNICEF stated that stated that education acces...  supported claim
3822  Research from US Census Bureau highlighted tha...  supported claim
796   Cambridge University confirmed that stated tha...  supported claim
59    International Monetary Fund (IMF) reported tha...  supported claim
2914  United Nations (UN) stated that reported that ...  supported claim
3575  A study by Yale University found that found th...  supported claim
5393  In its latest report, Cleveland Clinic reveale...  supported claim
680   Al Jazeera confirmed that stated that mental h...  supported claim
4969  A study by BMJ (British Medical Journal) found...  supported claim


# Claim

In [None]:
import itertools
import random

## Formal

In [None]:
templates = [
    "People say that {fact}.",
    "It’s claimed that {fact}.",
    "Many believe that {fact}.",
    "There are reports that {fact}.",
    "It’s often said that {fact}.",
    "Sources online suggest that {fact}.",
    "Did you know that {fact}?",
    "Guess what, {fact}.",
    "It might surprise you, but {fact}.",
    "Everyone talks about how {fact}.",
    "Funny enough, {fact}.",
    "I just found out that {fact}."
]

facts = [
    # Science
    "bananas are technically berries",
    "sharks existed before trees",
    "the heart of a blue whale is the size of a car",
    "Venus is hotter than Mercury even though it is farther from the sun",
    "Mount Everest grows about 4 millimeters every year",
    "lightning is five times hotter than the surface of the sun",
    "penguins can drink seawater",
    "sloths can hold their breath longer than dolphins",
    "octopuses have three hearts",
    "a day on Venus is longer than a year on Venus",

    # Health
    "drinking too much water can be dangerous",
    "sugar does not actually cause hyperactivity in children",
    "coffee can help improve short-term memory",
    "laughing can improve blood circulation",
    "walking 30 minutes a day reduces risk of heart disease",

    # History
    "Napoleon was taller than the average Frenchman of his time",
    "the Great Wall of China is not visible from space with the naked eye",
    "Cleopatra lived closer in time to the moon landing than to the pyramids",
    "the library of Alexandria was not destroyed in one fire but many over centuries",
    "the Olympic Games originated in ancient Greece",

    # Everyday trivia
    "goldfish do not have a memory of just three seconds",
    "carrots were originally purple",
    "ketchup was once sold as medicine",
    "the first email was sent in 1971",
    "the hashtag symbol is technically called an octothorpe",
    "cats have fewer toes on their back paws than on their front paws",
    "the Eiffel Tower grows taller in summer due to heat expansion"
]



In [None]:
combinations = [t.format(fact=f) for t, f in itertools.product(templates, facts)]
random.shuffle(combinations)


In [None]:
df = pd.DataFrame({
    "claim_status": "claim",
    "video_transcription_text": combinations
})

In [None]:
num_unique_rows = df.drop_duplicates().shape[0]
print(f"Number of unique rows: {num_unique_rows}")

Number of unique rows: 324


In [None]:
df.to_csv("unique_claims_dataset.csv", index=False, encoding="utf-8")

print("Dataset generated with", df.shape[0], "unique rows")
print(df.sample(5))

Dataset generated with 324 unique rows
    claim_status                           video_transcription_text
122        claim  Sources online suggest that walking 30 minutes...
301        claim  I just found out that carrots were originally ...
68         claim  Sources online suggest that lightning is five ...
14         claim  It might surprise you, but Napoleon was taller...
134        claim  It’s claimed that the first email was sent in ...


## Informal/Tiktok style

In [None]:
import re

In [None]:
templates = [
    # Conversational
    "Did you know {fact}?",
    "No one talks about how {fact}.",
    "Here’s the thing: {fact}.",
    "Most people don’t realize {fact}.",
    "This is crazy, but {fact}.",
    "Everyone needs to know that {fact}.",
    "It blew my mind when I learned {fact}.",
    "Nobody tells you that {fact}.",
    "What if I told you {fact}?",
    "You probably don’t know this, but {fact}.",

    # Social Media
    "I saw online that {fact}.",
    "Twitter is going wild because {fact}.",
    "Everyone on TikTok says {fact}.",
    "It’s trending because {fact}.",
    "Reddit threads claim {fact}.",

    # Shock
    "You won’t believe this: {fact}.",
    "This fact changes everything: {fact}.",
    "It sounds fake, but {fact}.",
    "This blew my mind: {fact}.",
    "Nobody believes me when I say {fact}.",
    "This is 100% true: {fact}.",
    "It might shock you, but {fact}.",
    "I couldn’t believe it when I found out {fact}.",

    # Word-of-Mouth
    "A friend told me that {fact}.",
    "Someone once said {fact}.",
    "People keep saying that {fact}.",
    "I heard from someone that {fact}.",
    "Everyone is talking about how {fact}.",
    "They say {fact}.",
    "The story goes that {fact}.",

    # Authority Illusion
    "Experts say that {fact}.",
    "Doctors online revealed that {fact}.",
    "Scientists discovered {fact}.",
    "Studies show that {fact}.",
    "Historians agree that {fact}.",
    "According to research, {fact}.",
    "Data proves {fact}.",
    "The truth is {fact}.",

    # Clickbait
    "The secret is that {fact}.",
    "What nobody tells you is {fact}.",
    "Hidden truth: {fact}.",
    "They don’t want you to know that {fact}.",
    "Here’s what schools never teach you: {fact}.",
    "This changes history: {fact}.",
    "The truth they’re hiding is {fact}.",
    "If you know this, your life changes: {fact}.",
    "The shocking reality is {fact}."
]

In [None]:
def rewrite_claim(text):
    # Remove variations of "claim"
    clean_text = re.sub(r"\bclaim(?:ed|ing|s)?\b", "", text, flags=re.IGNORECASE).strip()

    # If 'that' exists, keep the part after 'that'
    if "that" in clean_text:
        fact = clean_text.split("that", 1)[1].strip()
    else:
        fact = clean_text

    # Ensure it's not empty
    if not fact:
        fact = clean_text

    # Wrap in TikTok-style template
    template = random.choice(templates)
    return template.format(fact=fact)

In [None]:
df = pd.read_excel("/content/to_befixed.xlsx")

# --- Apply rewrite only on claim rows ---
df.loc[df["claim_status"] == "claim", "video_transcription_text"] = (
    df.loc[df["claim_status"] == "claim", "video_transcription_text"].apply(rewrite_claim)
)


In [None]:
df.to_excel("cleaned_tiktok_claims.xlsx", index=False)   # Excel

print("Saved as cleaned_tiktok_claims.xlsx and cleaned_tiktok_claims.csv")

Saved as cleaned_tiktok_claims.xlsx and cleaned_tiktok_claims.csv


# Opinion

In [None]:
opinion_templates = [
    "I don’t really believe that {fact}.",
    "Personally, I think {fact} is exaggerated.",
    "In my opinion, {fact} isn’t true at all.",
    "Honestly, I feel like {fact} makes sense.",
    "For me, {fact} seems a little fake.",
    "I just can’t accept that {fact}.",
    "To be honest, I think {fact} could be real.",
    "I feel like {fact} is probably not accurate.",
    "IMO, {fact} sounds right to me.",
    "I don’t buy into the idea that {fact}.",
    "Lowkey, I think {fact} is fake.",
    "Highkey, {fact} sounds real to me.",
    "Not gonna lie, I doubt {fact}.",
    "Tbh, I kinda believe {fact}.",
    "For real, I don’t think {fact} is correct.",
    "Honestly, {fact} just feels made up.",
    "No cap, {fact} could actually be true.",
    "I swear, {fact} doesn’t add up.",
    "Personally, {fact} doesn’t sound right.",
    "IMO, people are overreacting about {fact}."
]

facts = [
    "goldfish only have a three-second memory",
    "coffee stunts your growth",
    "the moon landing was staged",
    "dolphins are mammals",
    "the Great Wall of China is visible from space",
    "sugar causes hyperactivity in children",
    "cracking your knuckles causes arthritis",
    "Napoleon was extremely short",
    "humans only use 10% of their brains",
    "the pyramids were built by aliens",
    "bats are completely blind",
    "the earth is flat",
    "vaccines cause autism",
    "the Eiffel Tower grows taller in summer",
    "lightning never strikes the same place twice",
    "you can catch a cold from being in the rain",
    "violin bows are made from horsehair",
    "the moon is moving away from the earth",
    "apple seeds contain cyanide",
    "strawberries are the only fruit with seeds on the outside"
]

In [None]:
def generate_opinion(fact):
    template = random.choice(opinion_templates)
    return template.format(fact=fact)

In [None]:
num_records = 3000
records = []

for _ in range(num_records):
    fact = random.choice(facts)
    transcript = generate_opinion(fact)
    records.append({"claim_status": "opinion", "video_transcription_text": transcript})

In [None]:
print(df.duplicated().sum())
df=df.drop_duplicates()

565


In [None]:
df = pd.DataFrame(records)
df.to_excel("opinion_dataset.xlsx", index=False)
df.to_csv("opinion_dataset.csv", index=False)

print("Opinion dataset generated with", num_records, "rows")
print(df.sample(10))

Opinion dataset generated with 3000 rows
     claim_status                           video_transcription_text
2139      opinion  I don’t really believe that apple seeds contai...
2702      opinion  I swear, violin bows are made from horsehair d...
328       opinion  I just can’t accept that the pyramids were bui...
1468      opinion  Not gonna lie, I doubt strawberries are the on...
2012      opinion  I don’t buy into the idea that Napoleon was ex...
271       opinion  For me, the Great Wall of China is visible fro...
1470      opinion  I don’t really believe that humans only use 10...
749       opinion  Honestly, goldfish only have a three-second me...
2465      opinion  Lowkey, I think violin bows are made from hors...
1255      opinion  Tbh, I kinda believe the moon landing was staged.
