In [19]:
import os
import time
import warnings
import numpy as np
import pandas as pd
from datetime import datetime


def cleanData(data, projectIDs):
    """Filter one raw Kickstarter export and derive the modelling columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw export. Must contain the columns listed in ``rawColumns`` below.
        The caller's frame is not modified.
    projectIDs : set or list-like
        Project IDs that were already collected; rows whose project ID is in
        this collection are dropped.

    Returns
    -------
    pandas.DataFrame or None
        Frame indexed by ``projectID`` with the columns ``success``,
        ``Lamount``, ``Lgoal``, ``backers_count``, ``duration``, one 0/1
        indicator column per category, ``month``, ``year``, ``slug`` and
        ``launched_at``. ``None`` when no row survives the filters.

    Notes
    -----
    ``time.mktime`` and ``datetime.fromtimestamp`` both work in the machine's
    local timezone, so the date window and the ``month``/``year`` columns
    depend on the timezone this runs in.
    """
    rawColumns = ['backers_count', 'category', 'country', 'deadline', 'fx_rate', 'goal', 'launched_at', 'pledged', 'profile', 'state', 'urls']
    # 0/1 indicator column name -> category value produced by getCategoryName.
    categoryColumns = {
        'category_art': "art",
        'category_comics': "comics",
        'category_crafts': "crafts",
        'category_dance': "dance",
        'category_design': "design",
        'category_fashion': "fashion",
        'category_film&video': "film & video",
        'category_food': "food",
        'category_games': "games",
        'category_journalism': "journalism",
        'category_music': "music",
        'category_photography': "photography",
        'category_publishing': "publishing",
        'category_technology': "technology",
        'category_theater': "theater",
    }

    # Misc
    # Explicit copy: the column subset is otherwise a slice of the caller's
    # frame, and assigning new columns to it is a chained assignment.
    data = data[rawColumns].copy()
    data['midway_date'] = (data['launched_at'] + data['deadline']) / 2
    data['projectID'] = data['profile'].apply(getProjectID)
    data = data.set_index('projectID')

    # Filtering
    windowStart = time.mktime(datetime(2019, 6, 1, 0, 0, 0).timetuple())
    windowEnd = time.mktime(datetime(2022, 5, 31, 23, 59, 59).timetuple())
    keep = (
        (data['country'] == "US")
        & (data['pledged'] > 0)
        & data['state'].isin(['successful', 'failed'])
        & (data['midway_date'] >= windowStart)
        & (data['midway_date'] <= windowEnd)
        & ~data.index.isin(projectIDs)
    )
    # drop_duplicates compares column values only (the index is ignored).
    data = data[keep].drop_duplicates()
    if data.empty:
        return None

    # Modifying existing columns
    data['category'] = data['category'].apply(getCategoryName)
    data['Lgoal'] = np.log10(data['goal'] * data['fx_rate'])
    data['Lamount'] = np.log10(data['pledged'] * data['fx_rate'])
    # Only 'successful'/'failed' rows remain, so the mapping is total.
    data['success'] = data['state'].map({'successful': 1, 'failed': 0})
    data['slug'] = data['urls'].apply(getSlug)

    # Creating new columns
    for columnName, categoryName in categoryColumns.items():
        data[columnName] = (data['category'] == categoryName).astype('int64')
    data['duration'] = (data['deadline'] - data['launched_at']) / 3600 / 24
    # Convert each midway timestamp once and read both calendar fields from it.
    midwayDates = [datetime.fromtimestamp(stamp) for stamp in data['midway_date']]
    data['month'] = [moment.month for moment in midwayDates]
    data['year'] = [moment.year for moment in midwayDates]

    # Misc
    data = data[['success', 'Lamount', 'Lgoal', 'backers_count', 'duration', *categoryColumns, 'month', 'year', 'slug', 'launched_at']]

    return data


def getCategoryName(entry):
    """Return the top-level category name found in a raw ``category`` string.

    Takes the third comma-separated field of ``entry``, keeps what follows
    its first colon, cuts that at the first ``/`` and removes every double
    quote. Raises ``IndexError`` when the entry has fewer than three
    comma-separated fields or the third field has no colon.
    """
    thirdField = entry.split(",")[2]
    fieldValue = thirdField.split(":")[1]
    # Anything after the first slash is discarded; only the leading part is kept.
    topLevel = fieldValue.partition("/")[0]
    return topLevel.replace("\"", "")


def getProjectID(entry):
    """Return the integer project ID stored in a raw ``profile`` string.

    Reads the second comma-separated field of ``entry`` and converts the text
    after its first colon with ``int``. Raises ``IndexError`` when that field
    or the colon is missing and ``ValueError`` when the value is not an
    integer literal.
    """
    fields = entry.split(",")
    keyAndValue = fields[1].split(":")
    return int(keyAndValue[1])


def getSlug(entry):
    """Return the ``creator/project`` slug from a raw ``urls`` string.

    Looks only at the text before the first comma, joins its 5th and 6th
    ``/``-separated segments and drops everything from the first ``?`` on.
    Presumably the entry is the JSON text of the ``urls`` column and starts
    with the project URL (``https://host/projects/<creator>/<project>``);
    verify against the data.

    When the URL carries no query string, the closing JSON quote (and any
    closing brace) would otherwise stay attached to the slug, so trailing
    ``"`` and ``}`` characters are stripped.

    Returns an empty string when the entry has fewer than five segments.
    """
    firstField = entry.split(",")[0]
    slugParts = firstField.split("/")[4:6]
    slug = '/'.join(slugParts).split("?")[0]
    # Remove JSON punctuation left over when there was no "?" to cut at.
    return slug.rstrip('"}')

# Driver: clean every file in the data directory, keep each project once,
# and write the combined table to input.csv.
# All warnings are silenced for the rest of the session.
warnings.filterwarnings("ignore")

dataDir = "Kickstarter Data"
# os.listdir returns entries in arbitrary order; sorting makes the choice of
# which copy of a repeated project is kept reproducible between runs.
files = sorted(os.listdir(dataDir))

cleanedFrames = []
seenProjectIDs = set()
for file in files:
    # os.path.join instead of a hard-coded backslash so this also runs off Windows.
    cleaned = cleanData(pd.read_csv(os.path.join(dataDir, file)), seenProjectIDs)
    if cleaned is None:  # cleanData returns None when no row survives
        continue
    cleanedFrames.append(cleaned)
    seenProjectIDs.update(cleaned.index)

# Concatenate once at the end rather than re-copying the growing frame per file.
allData = pd.concat(cleanedFrames) if cleanedFrames else pd.DataFrame()
allData.to_csv('input.csv')
allData

Unnamed: 0_level_0,success,Lamount,Lgoal,backers_count,duration,category_art,category_comics,category_crafts,category_dance,category_design,...,category_journalism,category_music,category_photography,category_publishing,category_technology,category_theater,month,year,slug,launched_at
projectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3734507,1,4.177345,4.176091,111,1.024942,0,0,0,1,0,...,0,0,0,0,0,0,6,2019,feath3r/the-mo-money-mo-problems-24hour-tele-g...,1559913845
3742749,0,2.315970,3.544068,7,12.180475,0,0,0,0,0,...,0,0,0,1,0,0,6,2019,itsagoodlifebabe/its-a-good-life-babe-podcast-...,1559263147
3737923,1,3.700877,3.602060,58,14.000000,0,0,0,1,0,...,0,0,0,0,0,0,6,2019,pilotplatform/pilot,1559145618
3723676,0,1.491362,3.778151,3,4.000000,1,0,0,0,0,...,0,0,0,0,0,0,6,2019,hellyes/alex-stop-my-metoo-stirring-the-hellye...,1559422814
3741568,1,3.207096,3.154728,70,12.552477,0,0,0,0,0,...,0,0,0,1,0,0,6,2019,1860098210/another-unconventional-literary-gem...,1559133866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4388088,1,3.742332,3.698970,93,30.000000,0,0,0,0,0,...,0,0,0,0,0,0,5,2022,ksparrock/correctable-error,1652285748
4385947,0,1.477121,3.477121,2,60.000000,0,0,0,0,0,...,0,0,0,0,0,0,5,2022,caranddrone/car-and-drone-fountain-hills-az-lo...,1649702841
4225380,1,4.603985,4.477121,148,30.000000,0,0,0,0,0,...,0,0,0,0,0,0,5,2022,foilfilm/foil,1652709377
4400476,1,2.803245,2.698970,15,35.916748,0,0,0,0,0,...,0,0,0,0,0,0,5,2022,takethiscomedy/take-this,1652209373
