In [1]:
# 📚 Imports
import pandas as pd
import re
from datetime import datetime

# 📥 Load raw scraped data
RAW_EVENTS_CSV = "step1_raw_devpost_events.csv"
df = pd.read_csv(RAW_EVENTS_CSV)
# NOTE(review): the preview shows an "Unnamed: 0" column — presumably an index
# written out by the previous step; confirm whether it is needed downstream.
df.head()


Unnamed: 0,event_title,platform,event_type,event_date,location,speakers_orgs,topic_tags,url,audience_segment
0,LaunchHacks IV,Devpost,Hackathon,"Jun 28 - Jul 01, 2025",Online,LaunchHacks,"Beginner Friendly, Education, Open Ended",https://launchhacks-iv.devpost.com/?ref_featur...,
1,United Hacks V5,Devpost,Hackathon,"Jul 11 - 13, 2025",Online,Hack United,"Beginner Friendly, Open Ended, Social Good",https://unitedhacksv5.devpost.com/?ref_feature...,
2,NextStep Hacks 2025,Devpost,Hackathon,"Jul 11 - 14, 2025",Online,HackAlphaX,"Beginner Friendly, Social Good, Machine Learni...",https://nextstep2025.devpost.com/?ref_feature=...,
3,Proof of Concept,Devpost,Hackathon,"Jun 26 - Aug 21, 2025",Online,XION,"Beginner Friendly, Machine Learning/AI, Mobile",https://proofofconcept.devpost.com/?ref_featur...,
4,HackVortex Codestorm 5,Devpost,Hackathon,"Aug 08 - 17, 2025",Online,HackVortex,"Beginner Friendly, Health, Machine Learning/AI",https://hackvortex-codestorm-5.devpost.com/?re...,


In [2]:
# 📅 Extract the start date from a raw date range, e.g. "Jun 28 - Jul 01, 2025"
def parse_start_date(date_range):
    """Return the start date of a raw date range as a ``DD-MM-YYYY`` string.

    The first ``Mon D`` pair (three-letter month followed by a one- or
    two-digit day) and the first four-digit year found in ``date_range`` are
    combined, e.g. ``"Jun 28 - Jul 01, 2025"`` -> ``"28-06-2025"``.

    Args:
        date_range: Raw date text. Non-string values (``None``, the float NaN
            pandas produces for empty cells, ...) are treated as missing.

    Returns:
        The start date formatted as ``DD-MM-YYYY``, or ``""`` when the value
        is missing or cannot be parsed. A value that is already a valid
        ``DD-MM-YYYY`` date is returned unchanged, so running the function a
        second time over parsed data does not blank it out.
    """
    # re.search raises TypeError on floats/None, so reject them up front.
    if not isinstance(date_range, str):
        return ""

    text = date_range.strip()
    if re.fullmatch(r"\d{2}-\d{2}-\d{4}", text):
        # Already in the output format: only validate it below.
        candidate, fmt = text, "%d-%m-%Y"
    else:
        match = re.search(r"([A-Za-z]{3}) (\d{1,2})", text)
        year_match = re.search(r"\d{4}", text)
        if not (match and year_match):
            return ""
        candidate = f"{match.group(1)} {match.group(2)} {year_match.group(0)}"
        fmt = "%b %d %Y"

    try:
        return datetime.strptime(candidate, fmt).strftime("%d-%m-%Y")
    except ValueError:
        # Unknown month abbreviation or impossible day (e.g. "Feb 30").
        return ""

# Replace every raw date range with its parsed start date
df["event_date"] = df["event_date"].map(parse_start_date)


In [3]:
# Normalize event location to Online / Remote / Hybrid / Unknown / title-cased place name
def normalize_location(loc):
    """Map a raw location value onto a normalized label.

    Args:
        loc: Raw location value. Anything that is not missing is converted
            with ``str()`` before matching.

    Returns:
        ``"Online"``, ``"Remote"`` or ``"Hybrid"`` when the corresponding
        keyword occurs in the text (case-insensitive, checked in that order),
        ``"Unknown"`` for missing, empty or whitespace-only values, and the
        title-cased text otherwise (e.g. ``"bangalore"`` -> ``"Bangalore"``).
    """
    # Empty CSV cells arrive as NaN/None; str() would turn them into "nan" and
    # the fallback branch would report the bogus place name "Nan".
    if pd.api.types.is_scalar(loc) and pd.isna(loc):
        return "Unknown"

    text = str(loc).strip().lower()
    if not text:
        return "Unknown"

    # Order matters: the first keyword found wins.
    keyword_labels = (
        ("online", "Online"),
        ("remote", "Remote"),
        ("hybrid", "Hybrid"),
    )
    for keyword, label in keyword_labels:
        if keyword in text:
            return label

    return text.title()  # e.g., "Bangalore"

# Collapse every raw location value into its normalized label
df["location"] = df["location"].map(normalize_location)


In [4]:
# Fill missing topic tags with empty string
df["topic_tags"] = df["topic_tags"].fillna("")

# Strip whitespace from all text fields.
# Text columns are detected with pandas' dtype helpers instead of comparing
# against "object": when pandas stores text in a dedicated string dtype
# (the default from pandas 3.0, opt-in before), `dtype == "object"` is False
# and the columns would silently be left unstripped.
text_columns = [
    col
    for col in df.columns
    if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col])
]
for col in text_columns:
    df[col] = df[col].str.strip()


In [5]:
# Persist the cleaned events without writing the index, then preview them
CLEANED_EVENTS_CSV = "step2_cleaned_devpost_events.csv"
df.to_csv(CLEANED_EVENTS_CSV, index=False)
df.head()


Unnamed: 0,event_title,platform,event_type,event_date,location,speakers_orgs,topic_tags,url,audience_segment
0,LaunchHacks IV,Devpost,Hackathon,28-06-2025,Online,LaunchHacks,"Beginner Friendly, Education, Open Ended",https://launchhacks-iv.devpost.com/?ref_featur...,
1,United Hacks V5,Devpost,Hackathon,11-07-2025,Online,Hack United,"Beginner Friendly, Open Ended, Social Good",https://unitedhacksv5.devpost.com/?ref_feature...,
2,NextStep Hacks 2025,Devpost,Hackathon,11-07-2025,Online,HackAlphaX,"Beginner Friendly, Social Good, Machine Learni...",https://nextstep2025.devpost.com/?ref_feature=...,
3,Proof of Concept,Devpost,Hackathon,26-06-2025,Online,XION,"Beginner Friendly, Machine Learning/AI, Mobile",https://proofofconcept.devpost.com/?ref_featur...,
4,HackVortex Codestorm 5,Devpost,Hackathon,08-08-2025,Online,HackVortex,"Beginner Friendly, Health, Machine Learning/AI",https://hackvortex-codestorm-5.devpost.com/?re...,
