In [3]:
import os
import time
import warnings
import numpy as np
import pandas as pd
from datetime import datetime


def cleanData(data, projectIDs):
    """Filter one raw Kickstarter export and derive the modelling columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw export containing at least the columns 'backers_count',
        'category', 'country', 'deadline', 'fx_rate', 'goal', 'launched_at',
        'pledged', 'profile', 'state' and 'urls'.  'launched_at' and
        'deadline' are treated as Unix timestamps in seconds.
    projectIDs : iterable
        Project IDs to exclude (passed to ``Index.isin``), e.g. the index of
        the rows collected so far.

    Returns
    -------
    pandas.DataFrame or None
        Frame indexed by 'projectID' with the columns 'success', 'Lamount',
        'Lgoal', 'backers_count', 'duration', the fifteen 'category_*'
        indicators, 'month', 'year', 'slug' and 'launched_at'.  ``None`` is
        returned when no row survives the filters.

    Notes
    -----
    The date window bounds and the 'month'/'year' columns are computed with
    ``time.mktime`` / ``datetime.fromtimestamp`` and therefore depend on the
    local timezone of the machine running the code.
    """
    # Misc
    # Explicit copy: the column assignments below then operate on an
    # independent frame and do not raise SettingWithCopyWarning.
    data = data[['backers_count', 'category', 'country', 'deadline', 'fx_rate', 'goal', 'launched_at', 'pledged', 'profile', 'state', 'urls']].copy()
    data['midway_date'] = (data['launched_at'] + data['deadline']) / 2
    data['projectID'] = data['profile'].apply(getProjectID)
    data = data.set_index('projectID')

    # Filtering
    window_start = time.mktime(datetime(2019, 6, 1, 0, 0, 0).timetuple())
    window_end = time.mktime(datetime(2022, 5, 31, 23, 59, 59).timetuple())
    keep = (
        (data['country'] == "US")
        & (data['pledged'] > 0)
        & data['state'].isin(['successful', 'failed'])
        & (data['midway_date'] >= window_start)
        & (data['midway_date'] <= window_end)
        & ~data.index.isin(projectIDs)
    )
    # drop_duplicates compares column values only (the index is ignored) and
    # returns a new frame, so later assignments are safe.
    data = data[keep].drop_duplicates()
    if data.empty:
        return None

    # Modifying existing columns
    data['category'] = data['category'].apply(getCategoryName)
    data['Lgoal'] = np.log10(data['goal'] * data['fx_rate'])
    data['Lamount'] = np.log10(data['pledged'] * data['fx_rate'])
    # Only 'successful'/'failed' remain after filtering, so map() covers every
    # value; it avoids the deprecated implicit downcasting of replace().
    data['state'] = data['state'].map({'successful': 1, 'failed': 0})
    data['slug'] = data['urls'].apply(getSlug)

    # Creating new columns
    # Output column name -> category label as returned by getCategoryName.
    category_columns = {
        'category_art': "art",
        'category_comics': "comics",
        'category_crafts': "crafts",
        'category_dance': "dance",
        'category_design': "design",
        'category_fashion': "fashion",
        'category_film&video': "film & video",
        'category_food': "food",
        'category_games': "games",
        'category_journalism': "journalism",
        'category_music': "music",
        'category_photography': "photography",
        'category_publishing': "publishing",
        'category_technology': "technology",
        'category_theater': "theater",
    }
    for column, label in category_columns.items():
        # int64 is spelled out so the dtype does not vary with the platform's
        # default integer width.
        data[column] = (data['category'] == label).astype('int64')
    data['duration'] = (data['deadline'] - data['launched_at']) / 3600 / 24
    # Element-wise map over the single column needed instead of a row-wise
    # DataFrame.apply; the resulting values are the same.
    data['month'] = data['midway_date'].map(lambda stamp: datetime.fromtimestamp(stamp).month)
    data['year'] = data['midway_date'].map(lambda stamp: datetime.fromtimestamp(stamp).year)

    # Misc
    data = data.rename(columns={"state": "success"})
    data = data[['success', 'Lamount', 'Lgoal', 'backers_count', 'duration', 'category_art', 'category_comics', 'category_crafts', 'category_dance', 'category_design', 'category_fashion', 'category_film&video', 'category_food', 'category_games', 'category_journalism', 'category_music', 'category_photography', 'category_publishing', 'category_technology', 'category_theater', 'month', 'year', 'slug', 'launched_at']]

    return data


def getCategoryName(entry):
    """Extract the top-level category label from a raw 'category' string.

    The third comma-separated field of ``entry`` is taken, its value (the text
    between the first and second ':') is cut at the first '/', and all double
    quotes are removed.  Presumably that field is the category slug such as
    ``"slug":"art/painting"`` -- verify against the data export.

    Raises IndexError when ``entry`` has fewer than three comma-separated
    fields or the third field contains no ':'.
    """
    third_field = entry.split(",")[2]
    field_value = third_field.split(":")[1]
    # Keep only what precedes the first '/', i.e. the parent part.
    parent_part = field_value.partition("/")[0]
    return parent_part.replace("\"", "")


def getProjectID(entry):
    """Return the integer stored in the second comma-separated field of ``entry``.

    The field is split on ':' and the part after the first colon is converted
    with ``int``.  Raises IndexError when the field or the colon is missing
    and ValueError when the value is not an integer literal.
    """
    second_field = entry.split(",")[1]
    return int(second_field.split(":")[1])


def getSlug(entry):
    """Extract the two-segment project slug from a raw 'urls' string.

    Only the text before the first ',' is considered.  It is split on '/',
    segments 4 and 5 are re-joined with '/', and anything from the first '?'
    onwards is discarded.
    """
    first_field = entry.partition(",")[0]
    path_parts = first_field.split("/")
    # For a URL like https://host/projects/<a>/<b>, indices 4 and 5 are <a>, <b>.
    joined = "/".join(path_parts[4:6])
    return joined.partition("?")[0]

# Driver: clean every file in the data directory, keep the first occurrence of
# each project, then draw 300 rows per (year, month) and write them to disk.
warnings.filterwarnings("ignore")
dataDir = "Kickstarter Data"
# os.listdir returns entries in arbitrary order; sorting makes it deterministic
# which file's copy of a repeated project is kept.
files = sorted(os.listdir(dataDir))
allData = pd.DataFrame()
for file in files:
    # os.path.join instead of a hard-coded "\\" so the path also resolves on
    # non-Windows systems.
    cleaned = cleanData(pd.read_csv(os.path.join(dataDir, file)), allData.index)
    # cleanData returns None when no row of the file survives its filters.
    if cleaned is not None:
        allData = pd.concat([allData, cleaned])
# NOTE: sample(n=300) raises ValueError if any (year, month) group has fewer
# than 300 rows.
allData = allData.groupby(['year', 'month']).sample(n=300)
allData.to_csv('input.csv')
allData

Unnamed: 0_level_0,success,Lamount,Lgoal,backers_count,duration,category_art,category_comics,category_crafts,category_dance,category_design,...,category_journalism,category_music,category_photography,category_publishing,category_technology,category_theater,month,year,slug,launched_at
projectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3733950,0,0.477121,4.000000,3,30.000000,0,0,0,0,0,...,0,0,0,0,0,0,6,2019,beatenpathco/clothing-for-beach-bums-mountain-...,1558057933
3735666,1,3.233250,2.698970,38,30.000000,0,0,0,0,0,...,0,0,0,1,0,0,6,2019,pjp/blazing-a-trail-the-story-of-minna-anthony...,1558321599
3731059,1,4.063784,3.477121,180,30.661609,0,0,0,0,0,...,0,0,0,1,0,0,6,2019,cosplaydeviants/cosplay-deviants-2020-wall-cal...,1558094777
3743927,0,1.505150,3.000000,4,7.000000,0,0,0,0,0,...,0,0,0,0,0,0,6,2019,arch-angel/broke-t-shirts,1559575620
3753934,1,2.775974,2.698970,19,3.000000,0,0,0,1,0,...,0,0,0,0,0,0,6,2019,5x5campaign/5x5,1561744849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4349010,1,3.903253,3.124830,131,30.000000,0,0,0,0,0,...,0,0,0,0,0,0,5,2022,barinstacoffee/barinsta-coffee-freeze-dried-pr...,1650371481
4390665,0,3.207096,4.176091,11,30.000000,0,0,0,0,0,...,0,0,0,0,0,0,5,2022,cocoaandbutter/cocoa-and-butter-palo-alto,1650344109
4392008,0,3.961279,4.342423,32,30.000000,0,0,0,0,0,...,0,0,0,0,0,0,5,2022,72395701/macbook-pro-docking-station-by-landin...,1651597201
4370213,1,4.002166,4.000000,81,25.000000,0,0,0,0,0,...,0,0,0,0,0,0,5,2022,thomaspk/the-last-railroad-town-an-apca-short-...,1651874406
