In [1]:
import pandas as pd
from pathlib import Path


PROJECT_ROOT = Path.cwd().parent
DATA_DIR     = PROJECT_ROOT / "data"
OUTPUT_DIR   = PROJECT_ROOT / "preprocessed_data"


OUTPUT_DIR.mkdir(exist_ok=True)

FILE_PATHS = {
    "ks_projects_2016_12": DATA_DIR / "ks-projects-201612.csv",
    "ks_projects_2018_01": DATA_DIR / "ks-projects-201801.csv",
    "live_campaigns":      DATA_DIR / "live.csv",
    "most_backed":         DATA_DIR / "most_backed.csv",
}

# Quick existence check
for name, path in FILE_PATHS.items():
    status = "FOUND" if path.exists() else "MISSING"
    print(f"{name:<20} → {status}")


ks_projects_2016_12  → FOUND
ks_projects_2018_01  → FOUND
live_campaigns       → FOUND
most_backed          → FOUND


In [2]:
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    # 1) normalize column names
    df.columns = (
        df.columns
          .str.strip()
          .str.lower()
          .str.replace(" ", "_", regex=False)
    )
    # 2) drop exact duplicates on id
    if "id" in df.columns:
        df = df.drop_duplicates(subset="id")
    # 3) parse timestamps & compute duration
    if {"deadline", "launched_at"}.issubset(df.columns):
        df["deadline"]    = pd.to_datetime(df["deadline"],    unit="s", errors="coerce")
        df["launched_at"] = pd.to_datetime(df["launched_at"], unit="s", errors="coerce")
        df["campaign_duration_days"] = (df["deadline"] - df["launched_at"]).dt.days
    # 4) convert key financial columns to numeric
    for col in ["goal", "pledged", "usd_goal_real", "usd_pledged_real"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
    # 5) drop columns with >90% missing
    threshold = int(len(df) * 0.10)
    df = df.dropna(axis=1, thresh=threshold)
    # 6) impute remaining missings
    for col in df.columns:
        if df[col].dtype.kind in "biufc":  # numeric
            df[col].fillna(df[col].median(), inplace=True)
        else:                             # categorical/text
            mode = df[col].mode()
            df[col].fillna(mode.iloc[0] if not mode.empty else "", inplace=True)
    return df


In [3]:
for name, path in FILE_PATHS.items():
    print(f"Processing {name}…")
    if not path.exists():
        print(f"  ERROR: file not found at {path}")
        continue

    raw = pd.read_csv(path, encoding="latin1", low_memory=False)
    print(f"  raw shape:   {raw.shape}")

    clean = preprocess_df(raw)
    print(f"  clean shape: {clean.shape}")
    display(clean.head())

    out_path = OUTPUT_DIR / f"{name}_clean.csv"
    clean.to_csv(out_path, index=False)
    print(f"  saved to    {out_path}\n")


Processing ks_projects_2016_12…
  raw shape:   (323750, 17)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode.iloc[0] if not mode.empty else "", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

  clean shape: (323750, 13)


Unnamed: 0,id,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0
1,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220
2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 04:24:11,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1
3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 01:00:00,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01 13:38:27,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375


  saved to    C:\Users\joyaw\OneDrive\Desktop\CrowdFunding-Fraud-Detection\preprocessed_data\ks_projects_2016_12_clean.csv

Processing ks_projects_2018_01…
  raw shape:   (378661, 15)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode.iloc[0] if not mode.empty else "", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

  clean shape: (378661, 15)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode.iloc[0] if not mode.empty else "", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

Unnamed: 0,id,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


  saved to    C:\Users\joyaw\OneDrive\Desktop\CrowdFunding-Fraud-Detection\preprocessed_data\ks_projects_2018_01_clean.csv

Processing live_campaigns…
  raw shape:   (4000, 13)
  clean shape: (4000, 13)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

Unnamed: 0,unnamed:_0,amt.pledged,blurb,by,country,currency,end.time,location,percentage.funded,state,title,type,url
0,0,15823.0,"\r\n'Catalysts, Explorers & Secret Keepers: Wo...",Museum of Science Fiction,US,usd,2016-11-01T23:59:00-04:00,"Washington, DC",186,DC,"Catalysts, Explorers & Secret Keepers: Women o...",Town,/projects/1608905146/catalysts-explorers-and-s...
1,1,6859.0,\r\nA unique handmade picture book for kids & ...,"Tyrone Wells & Broken Eagle, LLC",US,usd,2016-11-25T01:13:33-05:00,"Portland, OR",8,OR,The Whatamagump (a hand-crafted story picture ...,Town,/projects/thewhatamagump/the-whatamagump-a-han...
2,2,17906.0,\r\nA horror comedy about a repairman who was ...,Tessa Stone,US,usd,2016-11-23T23:00:00-05:00,"Los Angeles, CA",102,CA,Not Drunk Enough Volume 1!,Town,/projects/1890925998/not-drunk-enough-volume-1...
3,3,67081.0,\r\nThe Johnny Wander autobio omnibus you've a...,Johnny Wander,US,usd,2016-11-01T23:50:00-04:00,"Brooklyn, NY",191,NY,Our Cats Are More Famous Than Us: A Johnny Wan...,County,/projects/746734715/our-cats-are-more-famous-t...
4,4,32772.0,\r\nThe vision for this project is the establi...,Beau's All Natural Brewing Company,RW,cad,2016-11-18T23:05:48-05:00,"Kigali, Rwanda",34,Kigali Province,The Rwanda Craft Brewery Project,Town,/projects/beaus/the-rwanda-craft-brewery-proje...


  saved to    C:\Users\joyaw\OneDrive\Desktop\CrowdFunding-Fraud-Detection\preprocessed_data\live_campaigns_clean.csv

Processing most_backed…
  raw shape:   (4000, 13)
  clean shape: (4000, 13)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

Unnamed: 0,unnamed:_0,amt.pledged,blurb,by,category,currency,goal,location,num.backers,num.backers.tier,pledge.tier,title,url
0,0,8782571.0,\r\nThis is a card game for people who are int...,Elan Lee,Tabletop Games,usd,10000.0,"Los Angeles, CA",219382,"[15505, 202934, 200, 5]","[20.0, 35.0, 100.0, 500.0]",Exploding Kittens,/projects/elanlee/exploding-kittens
1,1,6465690.0,"\r\nAn unusually addicting, high-quality desk ...",Matthew and Mark McLachlan,Product Design,usd,15000.0,"Denver, CO",154926,"[788, 250, 43073, 21796, 41727, 21627, 12215, ...","[1.0, 14.0, 19.0, 19.0, 35.0, 35.0, 79.0, 79.0...",Fidget Cube: A Vinyl Desk Toy,/projects/antsylabs/fidget-cube-a-vinyl-desk-toy
2,2,5408916.0,\r\nBring Reading Rainbowâs library of inter...,LeVar Burton & Reading Rainbow,Web,usd,1000000.0,"Los Angeles, CA",105857,"[19639, 14343, 9136, 2259, 5666, 24512, 4957, ...","[5.0, 10.0, 25.0, 30.0, 35.0, 50.0, 75.0, 100....","Bring Reading Rainbow Back for Every Child, Ev...",/projects/readingrainbow/bring-reading-rainbow...
3,3,5702153.0,\r\nUPDATED: This is it. We're making a Veroni...,Rob Thomas,Narrative Film,usd,2000000.0,"San Diego, CA",91585,"[5938, 8423, 11509, 22997, 23227, 1865, 7260, ...","[1.0, 10.0, 25.0, 35.0, 50.0, 75.0, 100.0, 150...",The Veronica Mars Movie Project,/projects/559914737/the-veronica-mars-movie-pr...
4,4,3336371.0,"\r\nAn adventure game from Tim Schafer, Double...",Double Fine and 2 Player Productions,Video Games,usd,400000.0,"San Francisco, CA",87142,"[47946, 24636, 1090, 11530, 900, 148, 100, 10, 4]","[15.0, 30.0, 60.0, 100.0, 250.0, 500.0, 1000.0...",Double Fine Adventure,/projects/doublefine/double-fine-adventure


  saved to    C:\Users\joyaw\OneDrive\Desktop\CrowdFunding-Fraud-Detection\preprocessed_data\most_backed_clean.csv

