In [1]:
# Turn on IPython's "greedy" tab completion for this notebook:
# press Tab on a Python object to reveal its attributes and methods.
%config IPCompleter.greedy=True

# Kickstarter Dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

# Only a small sample is explored in this notebook. Asking read_csv for the
# first N_ROWS rows avoids parsing the whole file just to slice most of it
# away afterwards.
N_ROWS = 1000

df = pd.read_csv("./kickstarter-projects/ks-projects-201801.csv", nrows=N_ROWS)

df.head()

## Data Cleaning
    1) I don't care about the project IDs, so I'm going to drop that column.
    2) For simplicity, I'm only going to use the usd_pledged_real and usd_goal_real conversions for every price measurement, so that I don't have to deal with currency issues. (Dropping columns 'usd pledged', 'pledged', 'goal', and 'currency'.)
    3) I need to fix the launched date so that it matches the deadline (remove time)
    
    

In [None]:

# Drop the ID, the "usd pledged"/"pledged"/"goal" amounts and the currency
# code, then give the remaining *_real money columns their short names.
df = df.drop(columns=["ID", "usd pledged", "pledged", "currency", "goal"])
df = df.rename(columns={"usd_pledged_real": "pledged", "usd_goal_real": "goal"})

# Discard incomplete rows first so a missing value can never reach the
# string handling below.
df = df.dropna()

# Keep only the text before the first space in "launched" (removes the time
# so the value matches "deadline"). The .str accessor handles the whole
# column at once instead of calling a Python lambda for every row.
df['launched'] = df['launched'].str.split(' ').str[0]

df.head()

## Diving Into The Data
---
### Our main points of focus:
    1) What is the median timeframe for a kickstarter project?
        > Is there a correlation between success rate and timeframe?
        > Is there a correlation between date launched and timeframe?
     2) Does the goal amount affect a project's success rate?
        > Compare the average goal amounts of projects that failed vs succeeded
     3) What was the most successful general category?

## What is the median timeframe for a kickstarter project?
---
#### Answer: 30 days.

In [None]:
from datetime import date
def date_difference_days(row):
    """Return the number of calendar days from launch to deadline.

    Parameters
    ----------
    row : object with ``launched`` and ``deadline`` attributes
        Each attribute holds a date written as ``YYYY-MM-DD``. Anything
        after the first whitespace (for example a time of day) is ignored,
        so the result is the same whether or not the time has already been
        stripped from the value.

    Returns
    -------
    int
        ``deadline - launched`` in days; negative when the deadline comes
        before the launch.

    Raises
    ------
    ValueError
        If a date component is not an integer or the components do not
        form a valid calendar date.
    IndexError
        If the value is empty or has fewer than three ``-``-separated parts.
    """
    def to_date(value):
        # str() lets timestamp-like objects through; split() with no
        # argument isolates the date part from any trailing time of day.
        date_part = str(value).split()[0]
        parts = [int(piece) for piece in date_part.split('-')]
        return date(parts[0], parts[1], parts[2])

    return (to_date(row.deadline) - to_date(row.launched)).days

# Campaign length in days, computed row by row from launched/deadline.
df['days'] = df.apply(date_difference_days, axis=1)

# Remove outliers: keep rows whose duration lies within three standard
# deviations of the mean duration.
mean_days = df.days.mean()
max_deviation = 3 * df.days.std()
within_range = (df.days - mean_days).abs() <= max_deviation
df = df[within_range]

# Median duration of the remaining projects (displayed as the cell output).
df.days.median()

## Is there a correlation between timeframe and success rate?
---
#### In all likelihood, we can't assume that timeframe is correlated with success rate. But we do see from the graph below that projects that last longer than 60 days rarely receive pledges. Maybe they're too ambitious/over-scoped?

In [None]:
# Scatter of campaign length (days) against the amount pledged.
plot = df.plot(kind="scatter", x="days", y="pledged")

# Is there a correlation between date started/deadline date and success rate?
---

In [None]:
# Helpers for the month columns added below
def split_components(d):
    """Split a '-'-separated string and return its pieces as a list of ints."""
    return [int(piece) for piece in d.split('-')]


def getStartMonth(row):
    """Return the second '-'-separated component of ``row.launched`` as an int."""
    launched_parts = split_components(row.launched)
    return launched_parts[1]


def getEndMonth(row):
    """Return the second '-'-separated component of ``row.deadline`` as an int."""
    deadline_parts = split_components(row.deadline)
    return deadline_parts[1]

# Add one column per helper: the month taken from the launch date and the
# month taken from the deadline, computed row by row.
for column, extract_month in (("start_month", getStartMonth), ("end_month", getEndMonth)):
    df[column] = df.apply(extract_month, axis=1)

In [None]:
# countplot draws one bar per distinct value, so it needs a categorical axis.
# Counting the continuous "pledged" amounts gives a bar for nearly every
# dollar value; instead put the launch month on x and split each bar by
# project outcome, which is what this section is asking about.
# NOTE(review): assumes the outcome column is named "state", as in the Kaggle
# ks-projects-201801.csv export; confirm against df.columns.
sns.countplot(x="start_month", hue="state", data=df)

## What was the most successful category?

In [None]:
# Number of projects per main category, split by project outcome.
# NOTE(review): nothing in this notebook creates a "status" column; the
# outcome column in the Kaggle ks-projects-201801.csv export is "state".
# Confirm against df.columns.
sns.countplot(x="main_category", hue="state", data=df)