In [7]:
import os
import time
import pandas as pd
from datetime import datetime
import gender_guesser.detector as gg


def cleanData(data, scrapeDate):
    # Remove ongoing projects (at the time of scraping)
    scrapeDateUNIX = time.mktime(datetime(scrapeDate[0], scrapeDate[1], scrapeDate[2], 12, 0, 0).timetuple())  # conversion to UNIX timestamp
    data = data[(data['deadline'] < scrapeDateUNIX)]

    # Modifying columns
    data['goal'] *= data['fx_rate']
    data['raised'] = data['pledged'] * data['fx_rate']
    data['country'] = data['country'].apply(lambda entry: getCountry(entry))

    # Creating new columns
    data['projectID'] = data['profile'].apply(lambda entry: getProjectID(entry))
    data['successful'] = (data['raised'] >= data['goal'])
    data['disaster'] = (data['raised'] == 0)
    data['duration'] = ((data['deadline'] - data['launched_at']) / 3600 / 24)
    data['creator_gender'] = data[['country', 'creator']].apply(lambda entry: getGender(entry), axis=1)
    data['U.S.'] = (data['country'] == "usa")
    data['url'] = data['urls'].apply(lambda entry: getUrl(entry))

    # Remove entries with an 'unknown' or 'andy' gender
    data = data[(data['creator_gender'].isin(['male', 'female', 'mostly_male', 'mostly_female']))]

    # Modifying columns, again
    data = data.set_index('projectID')
    data['successful'] = data['successful'].replace({True: 1, False: 0})
    data['disaster'] = data['disaster'].replace({True: 1, False: 0})
    data['creator_gender'] = data['creator_gender'].replace({'male': 1, 'female': 0, 'mostly_male': 1, 'mostly_female': 0})
    data['U.S.'] = data['U.S.'].replace({True: 1, False: 0})

    # Misc
    data = data.rename(columns={"backers_count": "no_backers"})
    data = data[['successful', 'disaster', 'goal', 'raised', 'no_backers', 'duration', 'creator_gender', 'U.S.', 'url']]
    return data

def getCountry(countryCode):
    if countryCode in countries.keys():
        return countries.get(countryCode)
    else:
        return "other_countries"

def getProjectID(entry):
    projectID = entry.split(",")[1]
    projectID = int(projectID.split(":")[1])
    return projectID

def getGender(entry):
    fullName = entry['creator'].split(",")[1]
    fullName = fullName.split(":")[1]
    fullName = fullName.replace("\"", "")
    firstName = fullName.split(" ")[0]
    gender = genderDetector.get_gender(firstName, entry['country'])
    return gender

def getUrl(entry):
    url = entry.split(",")[0]
    url = url.split("\"")[5]
    url = url.split("?")[0]
    return url

def getDateFromFileName(fileName):
    split = fileName.split('-')
    del split[2]
    return '-'.join(split)

scrapeDates = [
    [2019, 6, 13], [2019, 7, 18], [2019, 8, 15], [2019, 9, 12], [2019, 10, 17], [2019, 11, 14], [2019, 12, 12],
    [2020, 1, 16], [2020, 2, 13], [2020, 3, 12], [2020, 4, 16], [2020, 5, 14], [2020, 6, 18], [2020, 7, 16], [2020, 8, 13], [2020, 9, 17], [2020, 10, 15], [2020, 11, 12], [2020, 12, 17],
    [2021, 1, 14], [2021, 2, 11], [2021, 3, 18], [2021, 4, 15], [2021, 5, 17], [2021, 6, 17], [2021, 7, 15], [2021, 8, 12], [2021, 9, 16], [2021, 10, 15], [2021, 11, 19], [2021, 12, 14],
    [2022, 1, 20], [2022, 2, 10], [2022, 3, 24], [2022, 4, 21], [2022, 5, 19], [2022, 6, 9], [2022, 7, 14], [2022, 8, 11]
]
countries = {
    "GB": "great_britain", "IE": "ireland", "US": "usa", "IT": "italy", "MT": "malta", "PT": "portugal", "ES": "spain", "FR": "france",
    "BE": "belgium", "LU": "luxembourg", "NL": "the_netherlands", "DE": "germany", "AT": "austria", "CH": "swiss", "IS": "iceland",
    "DK": "denmark", "NO": "norway", "SE": "sweden", "FI": "finland", "EE": "estonia", "LV": "latvia", "LT": "lithuania", "PL": "poland",
    "CZ": "czech_republic", "SK": "slovakia", "HU": "hungary", "RO": "romania", "BG": "bulgaria", "BA": "bosniaand", "HR": "croatia",
    "XK": "kosovo", "MK": "macedonia", "ME": "montenegro", "RS": "serbia", "SI": "slovenia", "AL": "albania", "GR": "greece", "RU": "russia",
    "BY": "belarus", "MD": "moldova", "UA": "ukraine", "AM": "armenia", "AZ": "azerbaijan", "GE": "georgia", "KZ": "the_stans", "KG": "the_stans",
    "TJ": "the_stans", "TM": "the_stans", "UZ": "the_stans", "TR": "turkey", "SA": "arabia", "IL": "israel", "CN": "china", "IN": "india",
    "JP": "japan", "KR": "korea", "VN": "vietnam"
}
files = os.listdir("Kickstarter Data")
genderDetector = gg.Detector()

allData = cleanData(pd.read_csv("Kickstarter Data\\" + files[0]), scrapeDates[0])

previousDate = getDateFromFileName(files.pop(0))
scrapeDate = scrapeDates.pop(0)
for file in files:
    date = getDateFromFileName(file)
    if date != previousDate:
        previousDate = date
        scrapeDate = scrapeDates.pop(0)

    allData = pd.concat([allData, cleanData(pd.read_csv("Kickstarter Data\\" + file), scrapeDate)])

allData

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['goal'] *= data['fx_rate']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['raised'] = data['pledged'] * data['fx_rate']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['country'] = data['country'].apply(lambda entry: getCountry(entry))
A value is trying to be set on a copy of a slice 

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [34]:
import pandas as pd


def cleanData(data):
    # Select relevant columns
    data = data[['backers_count', 'category', 'country', 'creator', 'deadline', 'fx_rate', 'goal', 'launched_at', 'photo', 'pledged', 'profile', 'state', 'urls']]

    # Filtering
    data = data[data['country'] == "US"]
    data = data[data['state'].isin(['successful', 'failed'])]

    # Modifying existing columns
    data['category'] = data['category'].apply(lambda entry: getCategoryName(entry))
    data['creator'] = data['creator'].apply(lambda entry: getCreatorProfileURL(entry))
    data['goal'] *= data['fx_rate']
    data['pledged'] *= data['fx_rate']
    data['profile'] = data['profile'].apply(lambda entry: getProjectID(entry))
    data['state'] = data['state'].replace({'successful': 1, 'failed': 0})
    data['urls'] = data['urls'].apply(lambda entry: getProjectURL(entry))

    # Creating new columns
    data['disaster'] = (data['pledged'] == 0)
    data['disaster'] = data['disaster'].replace({True: 1, False: 0})
    data['duration'] = (data['deadline'] - data['launched_at']) / 3600 / 24
    data['margin'] = (data['pledged'] - data['goal'])

    # Misc
    data = data.rename(columns={"creator": "creator_profile_url", "pledged": "raised", "profile": "projectID", "state": "success", "urls": "project_url"})
    data = data.set_index('projectID')
    data = data[['backers_count', 'category', 'creator_profile_url', 'disaster', 'duration', 'goal', 'margin', 'project_url', 'raised', 'success']]

    return data

def getCreatorProfileURL(entry):
    tmp = entry.split(",")
    tmp = tmp[len(tmp) - 2]
    tmp = tmp.split("{")[2]
    creatorProfileURL = tmp.split("\"")[3]
    return creatorProfileURL

def getCategoryName(entry):
    tmp = entry.split(",")[2]
    tmp = tmp.split(":")[1]
    tmp = tmp.split("/")[0]
    categoryName = tmp.replace("\"", "")
    return categoryName

def getProjectID(entry):
    tmp = entry.split(",")[1]
    projectID = int(tmp.split(":")[1])
    return projectID

def getProjectURL(entry):
    tmp = entry.split(",")[0]
    tmp = tmp.split("\"")[5]
    projectURL = tmp.split("?")[0]
    return projectURL

allData = cleanData(pd.read_csv("Kickstarter Data\\2019-06-01.csv"))
allData

Unnamed: 0_level_0,backers_count,category,creator_profile_url,disaster,duration,goal,margin,project_url,raised,success
projectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2246288,740,art,https://www.kickstarter.com/profile/skullgarden,0,31.003067,3400.0,15159.00,https://www.kickstarter.com/projects/skullgard...,18559.00,1
2944476,3,film & video,https://www.kickstarter.com/profile/2031793373,0,58.959757,4000.0,-3920.00,https://www.kickstarter.com/projects/203179337...,80.00,0
1985482,1752,film & video,https://www.kickstarter.com/profile/madoverlord,0,29.958333,30000.0,105589.94,https://www.kickstarter.com/projects/madoverlo...,135589.94,1
2309240,185,publishing,https://www.kickstarter.com/profile/672576444,0,30.000000,4000.0,652.00,https://www.kickstarter.com/projects/672576444...,4652.00,1
3398789,111,art,https://www.kickstarter.com/profile/pabkins,0,30.003738,4900.0,730.00,https://www.kickstarter.com/projects/pabkins/g...,5630.00,1
...,...,...,...,...,...,...,...,...,...,...
4097,2,art,https://www.kickstarter.com/profile/343865871,0,84.242627,1000.0,-940.00,https://www.kickstarter.com/projects/343865871...,60.00,0
1764554,1,crafts,https://www.kickstarter.com/profile/1689407442,0,30.000000,1000.0,-990.00,https://www.kickstarter.com/projects/168940744...,10.00,0
3465487,263,music,https://www.kickstarter.com/profile/423088081,0,30.000000,17500.0,991.00,https://www.kickstarter.com/projects/423088081...,18491.00,1
2029134,3,food,https://www.kickstarter.com/profile/1888439791,0,27.171227,3300.0,-3230.00,https://www.kickstarter.com/projects/188843979...,70.00,0
