In [3]:
import os
import time
import warnings
import numpy as np
import pandas as pd
from datetime import datetime


def cleanData(data, projectIDs):
    # Misc
    data = data[['backers_count', 'category', 'country', 'deadline', 'fx_rate', 'goal', 'launched_at', 'pledged', 'profile', 'state', 'urls']]
    data['midway_date'] = (data['launched_at'] + data['deadline']) / 2
    data['projectID'] = data['profile'].apply(lambda entry: getProjectID(entry))
    data = data.set_index('projectID')

    # Filtering
    data = data[data['country'] == "US"]
    data = data[data['pledged'] > 0]
    data = data[data['state'].isin(['successful', 'failed'])]
    data = data[data['midway_date'] >= time.mktime(datetime(2019, 6, 1, 0, 0, 0).timetuple())]
    data = data[data['midway_date'] <= time.mktime(datetime(2022, 5, 31, 23, 59, 59).timetuple())]
    data = data[~data.index.isin(projectIDs)]
    data = data.drop_duplicates()
    if len(data) == 0:
        return None

    # Modifying existing columns
    data['category'] = data['category'].apply(lambda entry: getCategoryName(entry))
    data['Lgoal'] = np.log10(data['goal'] * data['fx_rate'])
    data['Lamount'] = np.log10(data['pledged'] * data['fx_rate'])
    data['state'] = data['state'].replace({'successful': 1, 'failed': 0})
    data['slug'] = data['urls'].apply(lambda entry: getSlug(entry))

    # Creating new columns
    data['category_art'] = (data['category'] == "art").replace({True: 1, False: 0})
    data['category_comics'] = (data['category'] == "comics").replace({True: 1, False: 0})
    data['category_crafts'] = (data['category'] == "crafts").replace({True: 1, False: 0})
    data['category_dance'] = (data['category'] == "dance").replace({True: 1, False: 0})
    data['category_design'] = (data['category'] == "design").replace({True: 1, False: 0})
    data['category_fashion'] = (data['category'] == "fashion").replace({True: 1, False: 0})
    data['category_film&video'] = (data['category'] == "film & video").replace({True: 1, False: 0})
    data['category_food'] = (data['category'] == "food").replace({True: 1, False: 0})
    data['category_games'] = (data['category'] == "games").replace({True: 1, False: 0})
    data['category_journalism'] = (data['category'] == "journalism").replace({True: 1, False: 0})
    data['category_music'] = (data['category'] == "music").replace({True: 1, False: 0})
    data['category_photography'] = (data['category'] == "photography").replace({True: 1, False: 0})
    data['category_publishing'] = (data['category'] == "publishing").replace({True: 1, False: 0})
    data['category_technology'] = (data['category'] == "technology").replace({True: 1, False: 0})
    data['category_theater'] = (data['category'] == "theater").replace({True: 1, False: 0})
    data['duration'] = (data['deadline'] - data['launched_at']) / 3600 / 24
    data['month'] = data.apply(lambda entry: datetime.fromtimestamp(entry['midway_date']).month, axis=1)
    data['year'] = data.apply(lambda entry: datetime.fromtimestamp(entry['midway_date']).year, axis=1)

    # Misc
    data = data.rename(columns={"state": "success"})
    data = data[['success', 'Lamount', 'Lgoal', 'backers_count', 'duration', 'category_art', 'category_comics', 'category_crafts', 'category_dance', 'category_design', 'category_fashion', 'category_film&video', 'category_food', 'category_games', 'category_journalism', 'category_music', 'category_photography', 'category_publishing', 'category_technology', 'category_theater', 'month', 'year', 'slug', 'launched_at']]

    return data


def getCategoryName(entry):
    tmp = entry.split(",")[2]
    tmp = tmp.split(":")[1]
    tmp = tmp.split("/")[0]
    categoryName = tmp.replace("\"", "")
    return categoryName


def getProjectID(entry):
    tmp = entry.split(",")[1]
    projectID = int(tmp.split(":")[1])
    return projectID


def getSlug(entry):
    tmp = entry.split(",")[0]
    tmp = tmp.split("/")[4:6]
    tmp = '/'.join(tmp)
    slug = tmp.split("?")[0]
    return slug


warnings.filterwarnings("ignore")
files = os.listdir("Kickstarter Data")
allData = pd.DataFrame()
for file in files:
    allData = pd.concat([allData, cleanData(pd.read_csv("Kickstarter Data\\" + file), allData.index)])
allData = allData.groupby(['year', 'month']).sample(n=300)
allData.to_csv('input.csv')
allData

Unnamed: 0_level_0,success,Lamount,Lgoal,backers_count,duration,category_art,category_comics,category_crafts,category_dance,category_design,...,category_journalism,category_music,category_photography,category_publishing,category_technology,category_theater,month,year,slug,launched_at
projectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3733950,0,0.477121,4.000000,3,30.000000,0,0,0,0,0,...,0,0,0,0,0,0,6,2019,beatenpathco/clothing-for-beach-bums-mountain-...,1558057933
3735666,1,3.233250,2.698970,38,30.000000,0,0,0,0,0,...,0,0,0,1,0,0,6,2019,pjp/blazing-a-trail-the-story-of-minna-anthony...,1558321599
3731059,1,4.063784,3.477121,180,30.661609,0,0,0,0,0,...,0,0,0,1,0,0,6,2019,cosplaydeviants/cosplay-deviants-2020-wall-cal...,1558094777
3743927,0,1.505150,3.000000,4,7.000000,0,0,0,0,0,...,0,0,0,0,0,0,6,2019,arch-angel/broke-t-shirts,1559575620
3753934,1,2.775974,2.698970,19,3.000000,0,0,0,1,0,...,0,0,0,0,0,0,6,2019,5x5campaign/5x5,1561744849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4349010,1,3.903253,3.124830,131,30.000000,0,0,0,0,0,...,0,0,0,0,0,0,5,2022,barinstacoffee/barinsta-coffee-freeze-dried-pr...,1650371481
4390665,0,3.207096,4.176091,11,30.000000,0,0,0,0,0,...,0,0,0,0,0,0,5,2022,cocoaandbutter/cocoa-and-butter-palo-alto,1650344109
4392008,0,3.961279,4.342423,32,30.000000,0,0,0,0,0,...,0,0,0,0,0,0,5,2022,72395701/macbook-pro-docking-station-by-landin...,1651597201
4370213,1,4.002166,4.000000,81,25.000000,0,0,0,0,0,...,0,0,0,0,0,0,5,2022,thomaspk/the-last-railroad-town-an-apca-short-...,1651874406


In [1]:
import nltk
import warnings
import contractions
import pandas as pd
import gender_guesser.detector as gg

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from lingua import LanguageDetectorBuilder


def getWordNetTag(treebankTag):
    if treebankTag.startswith("JJ"):
        return "a"
    elif treebankTag.startswith("RB"):
        return "r"
    elif treebankTag.startswith("VB"):
        return "v"
    else:
        return "n"


lemmatizer = WordNetLemmatizer()
listOfStopwords = stopwords.words('english')
specialCharacters = ["`", "~", "!", "@", "#", "$", "%", "€", "^", "&", "*", "(", ")", "-", "_",
                     "+", "=", "{", "[", "}", "]", ":", ";", "\"", "|", "\\", "<", ",", ">", ".",
                     "?", "/"]
def cleanStory(story):
    cleanedStory = story.replace("You'll need an HTML5 capable browser to see this content.", "")

    # Isolate special characters
    for specialCharacter in specialCharacters:
        cleanedStory = cleanedStory.replace(specialCharacter, " " + specialCharacter + " ")

    # Fix contractions
    cleanedStory = cleanedStory.split()
    tmp_cleanedStory = []
    for word in cleanedStory:
        tmp_cleanedStory += contractions.fix(word).split()
    cleanedStory = tmp_cleanedStory

    # Tag parts of speech
    tags = nltk.pos_tag(cleanedStory)

    # 1) Lemmatize words
    # 2) Lowercase words
    # 3) Remove stop words
    tmp_cleanedStory = []
    for i in range(len(cleanedStory)):
        word = cleanedStory[i]
        tag = getWordNetTag(tags[i][1])
        lemmatizedWord = lemmatizer.lemmatize(word, tag).lower()
        if lemmatizedWord not in listOfStopwords:
            tmp_cleanedStory.append(lemmatizedWord)
    cleanedStory = ' '.join(tmp_cleanedStory)

    # Remove useless characters
    tmp_cleanedStory = ""
    for character in cleanedStory:
        if character.isalpha() or character == " ":
            tmp_cleanedStory += character
    cleanedStory = ' '.join(tmp_cleanedStory.split())

    return cleanedStory


detector = LanguageDetectorBuilder.from_all_languages().build()
def detectLanguage(story):
    language = detector.detect_language_of(story)
    if language is None:
        return "UNKNOWN"
    else:
        return language.name


warnings.filterwarnings("ignore")
projects = pd.read_csv('Scraped Data\\data.csv', encoding='cp1252')
projects = projects.set_index('projectID')
projects = projects[projects['creator_name'] != ""]
projects = projects[projects['story'] != ""]

genderGuesser = gg.Detector()
projects['creator_gender'] = projects['creator_name'].apply(lambda name: genderGuesser.get_gender(name.split()[0]))
projects = projects[projects['creator_gender'].isin(['male', 'female', 'mostly_male', 'mostly_female'])]
projects['creator_gender'] = projects['creator_gender'].replace({'male': 0, 'female': 1, 'mostly_male': 0, 'mostly_female': 1})

projects['story'] = projects['story'].apply(lambda story: cleanStory(story))
projects['words'] = projects['story'].apply(lambda story: len(story.split()))
projects['language'] = projects['story'].apply(lambda story: detectLanguage(story))
projects = projects[projects['language'] == "ENGLISH"]

projects['sustainability'] = projects['story'].apply(lambda story: ('sustainability' in story) or ('sustainable' in story)).replace({True: 1, False: 0})
projects['url'] = projects['slug'].apply(lambda slug: "https://www.kickstarter.com/projects/" + slug)

projects = projects[['success', 'Lamount', 'creator_gender', 'Lgoal', 'words', 'backers_count', 'serial_entrepreneur', 'duration', 'category_art', 'category_comics', 'category_crafts', 'category_dance', 'category_design', 'category_fashion', 'category_film&video', 'category_food', 'category_games', 'category_journalism', 'category_music', 'category_photography', 'category_publishing', 'category_technology', 'category_theater', 'month', 'year', 'media', 'sustainability', 'story', 'url']]

groups = projects.groupby(['year', 'month'])
projects = groups.sample(groups.size().min())

projects.to_csv('finalData.csv')
projects

Unnamed: 0_level_0,success,Lamount,creator_gender,Lgoal,words,backers_count,serial_entrepreneur,duration,category_art,category_comics,...,category_photography,category_publishing,category_technology,category_theater,month,year,media,sustainability,story,url
projectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3731940,0,2.307496,0,3.778151,71,5,0,60.000000,0,0,...,0,0,1,0,6,2019,0,0,app go benefit trainer personnel look get fit ...,https://www.kickstarter.com/projects/fitnesspo...
3451605,1,5.066844,0,4.000000,497,2003,1,14.583148,0,0,...,0,0,0,0,6,2019,1,0,even miss kickstarter still get hand papillon ...,https://www.kickstarter.com/projects/kolossalg...
3722583,0,1.447158,0,3.778151,443,6,0,30.000000,0,1,...,0,0,0,0,6,2019,1,0,journey blak blak blak officially entitle blak...,https://www.kickstarter.com/projects/myheart/b...
3745862,1,3.266232,1,3.176091,474,48,0,39.687488,0,0,...,0,0,0,0,6,2019,1,0,isabel jazz folk musical teach empathy metoo s...,https://www.kickstarter.com/projects/annawestb...
3735376,1,3.000434,1,3.000000,130,17,0,25.989560,0,0,...,0,0,0,1,6,2019,0,0,main stage kids head new york young performer ...,https://www.kickstarter.com/projects/mainstage...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4378823,1,4.797053,0,4.447158,837,232,1,30.000000,0,0,...,0,0,0,0,5,2022,1,0,book maui mauka makai companion book maui coas...,https://www.kickstarter.com/projects/danielsul...
4394019,1,4.044657,0,4.000000,167,80,0,26.377384,0,0,...,0,0,0,0,5,2022,1,0,myrcene ale co dream mike goergen cy higgins m...,https://www.kickstarter.com/projects/myrceneal...
4385597,1,4.402519,0,4.397940,138,73,0,14.147211,0,0,...,0,0,0,0,5,2022,1,0,firefly theatrical need help build theatre spa...,https://www.kickstarter.com/projects/fireflyth...
4367598,1,3.625827,1,3.602060,304,94,0,36.537940,0,0,...,0,0,0,0,5,2022,1,0,big feelings coloring book come idea recovery ...,https://www.kickstarter.com/projects/bigfeelin...


In [8]:
import os
import time
import warnings
import pandas as pd
from datetime import datetime


def getCategory(entry):
    for category in entry.index:
        if entry.get(category) == 1:
            return category.split("_")[1]

    return "unknown"


def cleanData(data, projectIDs):
    data = data[['category', 'country', 'deadline', 'launched_at', 'pledged', 'profile', 'state']]
    data['midway_date'] = (data['launched_at'] + data['deadline']) / 2
    data['projectID'] = data['profile'].apply(lambda entry: getProjectID(entry))
    data = data.set_index('projectID')

    data = data[data['country'] == "US"]
    data = data[data['pledged'] > 0]
    data = data[data['state'].isin(['successful', 'failed'])]
    data = data[data['midway_date'] >= time.mktime(datetime(2019, 6, 1, 0, 0, 0).timetuple())]
    data = data[data['midway_date'] <= time.mktime(datetime(2022, 5, 31, 23, 59, 59).timetuple())]
    data = data[~data.index.isin(projectIDs)]
    data = data.drop_duplicates()
    if len(data) == 0:
        return None

    data['category'] = data['category'].apply(lambda entry: getCategoryName(entry))
    return data[['category']]


def getProjectID(entry):
    tmp = entry.split(",")[1]
    projectID = int(tmp.split(":")[1])
    return projectID


def getCategoryName(entry):
    tmp = entry.split(",")[3]
    tmp = tmp.split(":")[1]
    tmp = tmp.split("/")[0]
    tmp = tmp.replace("\"", "")
    categoryName = ''.join(tmp.split())

    if categoryName[0].isalpha():
        return categoryName
    else:
        return "unknown"


def repairCategory(projectID, category):
    if category == "unknown":
        if projectID in allData.index:
            return allData.loc[projectID]['category']

    return category


categories = ["art", "comics", "crafts", "dance", "design", "fashion", "film&video", "food",
              "games", "journalism", "music", "photography", "publishing", "technology", "theater"]
def categoryVariablesCheck(projectID, dummyVariables, categoryVariable):
    if (sum(dummyVariables) != 1) or (categoryVariable not in categories):
        print("Category error: " + projectID)


warnings.filterwarnings("ignore")
dataset = pd.read_csv('finalData.csv')
dataset = dataset.set_index('projectID')
dataset['category'] = dataset.apply(lambda entry: getCategory(entry[8:23]), axis=1)

files = os.listdir("Kickstarter Data")
allData = pd.DataFrame()
for file in files:
    year = int(file.split("-")[0])
    if year == 2021 or year == 2022:
        allData = pd.concat([allData, cleanData(pd.read_csv("Kickstarter Data\\" + file), allData.index)])

allData = allData[allData['category'] != "unknown"]
dataset['category'] = dataset.apply(lambda entry: repairCategory(entry.name, entry['category']), axis=1)
dataset['category_art'] = (dataset['category'] == "art").replace({True: 1, False: 0})
dataset['category_comics'] = (dataset['category'] == "comics").replace({True: 1, False: 0})
dataset['category_crafts'] = (dataset['category'] == "crafts").replace({True: 1, False: 0})
dataset['category_dance'] = (dataset['category'] == "dance").replace({True: 1, False: 0})
dataset['category_design'] = (dataset['category'] == "design").replace({True: 1, False: 0})
dataset['category_fashion'] = (dataset['category'] == "fashion").replace({True: 1, False: 0})
dataset['category_film&video'] = (dataset['category'] == "film&video").replace({True: 1, False: 0})
dataset['category_food'] = (dataset['category'] == "food").replace({True: 1, False: 0})
dataset['category_games'] = (dataset['category'] == "games").replace({True: 1, False: 0})
dataset['category_journalism'] = (dataset['category'] == "journalism").replace({True: 1, False: 0})
dataset['category_music'] = (dataset['category'] == "music").replace({True: 1, False: 0})
dataset['category_photography'] = (dataset['category'] == "photography").replace({True: 1, False: 0})
dataset['category_publishing'] = (dataset['category'] == "publishing").replace({True: 1, False: 0})
dataset['category_technology'] = (dataset['category'] == "technology").replace({True: 1, False: 0})
dataset['category_theater'] = (dataset['category'] == "theater").replace({True: 1, False: 0})

dataset.apply(lambda entry: categoryVariablesCheck(entry.name, entry[8:23], entry[29]), axis=1)
dataset

Unnamed: 0_level_0,success,Lamount,creator_gender,Lgoal,words,backers_count,serial_entrepreneur,duration,category_art,category_comics,...,category_publishing,category_technology,category_theater,month,year,media,sustainability,story,url,category
projectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3731940,0,2.307496,0,3.778151,71,5,0,60.000000,0,0,...,0,1,0,6,2019,0,0,app go benefit trainer personnel look get fit ...,https://www.kickstarter.com/projects/fitnesspo...,technology
3451605,1,5.066844,0,4.000000,497,2003,1,14.583148,0,0,...,0,0,0,6,2019,1,0,even miss kickstarter still get hand papillon ...,https://www.kickstarter.com/projects/kolossalg...,games
3722583,0,1.447158,0,3.778151,443,6,0,30.000000,0,1,...,0,0,0,6,2019,1,0,journey blak blak blak officially entitle blak...,https://www.kickstarter.com/projects/myheart/b...,comics
3745862,1,3.266232,1,3.176091,474,48,0,39.687488,0,0,...,0,0,0,6,2019,1,0,isabel jazz folk musical teach empathy metoo s...,https://www.kickstarter.com/projects/annawestb...,music
3735376,1,3.000434,1,3.000000,130,17,0,25.989560,0,0,...,0,0,1,6,2019,0,0,main stage kids head new york young performer ...,https://www.kickstarter.com/projects/mainstage...,theater
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4378823,1,4.797053,0,4.447158,837,232,1,30.000000,0,0,...,0,0,0,5,2022,1,0,book maui mauka makai companion book maui coas...,https://www.kickstarter.com/projects/danielsul...,photography
4394019,1,4.044657,0,4.000000,167,80,0,26.377384,0,0,...,0,0,0,5,2022,1,0,myrcene ale co dream mike goergen cy higgins m...,https://www.kickstarter.com/projects/myrceneal...,food
4385597,1,4.402519,0,4.397940,138,73,0,14.147211,0,0,...,0,0,1,5,2022,1,0,firefly theatrical need help build theatre spa...,https://www.kickstarter.com/projects/fireflyth...,theater
4367598,1,3.625827,1,3.602060,304,94,0,36.537940,0,0,...,1,0,0,5,2022,1,0,big feelings coloring book come idea recovery ...,https://www.kickstarter.com/projects/bigfeelin...,publishing
