![Kickstarter](./images/kickstarter.png)

---
Imports
---

In [None]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')
#pd.options.display.max_columns = None
#pd.options.display.max_rows = None

In [None]:
# Import the .csv files and concat them into one dataframe.
# glob.glob() returns paths in arbitrary, filesystem-dependent order; sorting them
# makes the row order (and any later "keep the last duplicate" step) reproducible.
csv_paths = sorted(glob.glob('data/*.csv'))
if not csv_paths:
    raise FileNotFoundError("No CSV files found matching 'data/*.csv'")
# ignore_index=True builds a fresh 0..n-1 index, replacing the separate reset_index call
original_dataframe = pd.concat(map(pd.read_csv, csv_paths), ignore_index=True)

In [None]:
# Set a working dataframe as an independent copy of the raw import, so that starting
# fresh only requires re-running this cell instead of waiting ~10s for the CSV import.
# A plain assignment would only alias the same object, and in-place edits on df
# would then leak into original_dataframe.
df = original_dataframe.copy()

---
EDA - Part 1
---

In [None]:
# Print a summary of the raw dataframe (columns, non-null counts, dtypes)
df.info()

In [None]:
# Number of missing values per column
df.isnull().sum()

In [None]:
# Number of projects per state. Only a very limited number of projects are suspended
# (these get dropped); canceled projects will be treated as though they failed.
df['state'].value_counts()

In [None]:
# Build a table with one row per project id and the number of rows sharing that id
dups = (
    df.groupby(df.id.tolist())
    .size()
    .reset_index()
    .rename(columns={0: 'count'})
)
# Total rows minus distinct ids = number of surplus (duplicate) entries
dups['count'].sum() - len(dups)

---
Data Cleaning
---

In [None]:
# Columns that are not needed for the further analysis
dropped_features = [
    'blurb', 'currency_symbol', 'backers_count', 'is_backing', 'permissions',
    'is_starred', 'source_url', 'slug', 'name', 'static_usd_rate', 'profile',
    'friends', 'spotlight', 'is_starrable', 'photo', 'pledged', 'usd_type',
    'fx_rate', 'location', 'creator', 'currency_trailing_code', 'current_currency',
    'created_at', 'urls', 'disable_communication', 'usd_pledged',
]
# Remove them column-wise; drop() returns a new dataframe
df = df.drop(columns=dropped_features)

In [None]:
# Keep the projects that are still live aside for later use
array_live = ['live']
live_projects = df.loc[df['state'].isin(array_live)]

# Keep only finished projects for the target variable (any other state is dropped).
# .copy() makes df an independent frame, so the edit below is not applied to a slice.
array_notlive = ['successful', 'failed', 'canceled']
df = df.loc[df['state'].isin(array_notlive)].copy()
# Treat canceled projects as failed. Only 'state' is touched: a whole-frame replace
# would also rewrite any other cell that happens to contain the text 'canceled'.
df['state'] = df['state'].replace('canceled', 'failed')

In [None]:
# Encode the target variable: successful -> 1, failed -> 0.
# Restricted to the 'state' column so that equal strings in other columns are left
# alone; re-running the cell is a no-op because 1/0 are not keys of the mapping.
df['state'] = df['state'].replace({'successful': 1, 'failed': 0})

In [None]:
# Sort dataframe by 'state_changed_at' so that, for each id, the entry that was most
# recently updated comes last. sort_values() returns a new dataframe, so the result
# must be assigned back - otherwise the order (and which duplicate is kept) is unchanged.
df = df.sort_values('state_changed_at')
# Remove duplicates, keeping the last (= most recent) entry per project id
duplicates = df.duplicated(subset='id', keep='last')
# Reset to a clean 0..n-1 index; sorting and filtering leave it shuffled and with gaps
df = df[~duplicates].reset_index(drop=True)

---
Feature Engineering
---

In [None]:
# Extract category names from long string in 'category' column
# Extract the parent category name from the long string in the 'category' column.
# The pattern captures everything after `slug":"` up to the first '/' or closing quote,
# so presumably both "parent" and "parent/child" slugs yield "parent" - TODO confirm
# against the raw data.
# str.extract keeps the dataframe's own index, so every value stays on its row; rows
# without a match become NaN instead of being skipped (which would shift all later values).
df['categories'] = df['category'].str.extract(r'slug":"([^/"]+)', expand=False)
# Delete the original cluttered category column
df = df.drop(columns='category')

In [None]:
# Add a '<name>_new' datetime column for each timestamp column (values given in seconds)
for timestamp_column in ('launched_at', 'deadline', 'state_changed_at'):
    df[timestamp_column + '_new'] = pd.to_datetime(df[timestamp_column], unit='s')

In [None]:
# Create new feature 'duration': 'over' when deadline - launched_at exceeds
# 2592000 seconds (30 days), otherwise 'under'
df = df.assign(
    duration=(df['deadline'] - df['launched_at']).map(
        lambda seconds: 'over' if seconds > 2592000 else 'under'
    )
)

In [None]:
# Add new column 'time': whole days between the launch and the last state change
launch_to_end = df['state_changed_at_new'] - df['launched_at_new']
df['time'] = launch_to_end.dt.days

In [None]:
# Change dates to weekend(1) or weekday(0)
def change_time(dataframe, column_list):
    """Replace timestamp columns with a weekend(1) / weekday(0) flag, in place.

    Args:
        dataframe (pd dataframe): Dataframe whose columns are overwritten
        column_list (list): Names of the columns holding timestamps in seconds

    Returns:
        Dataframe object: The same dataframe, with each listed column set to
        1 for Saturday/Sunday and 0 otherwise
    """
    for column in column_list:
        # Series.dt.weekday counts Monday=0 ... Sunday=6, so the weekend is 5 and 6
        weekday = pd.to_datetime(dataframe[column], unit='s').dt.weekday
        dataframe[column] = (weekday >= 5).astype(int)
    return dataframe

In [None]:
# Timestamp columns that get converted by change_time
times_lst = ['launched_at', 'deadline', 'state_changed_at']
# change_time overwrites these columns on df and returns the dataframe
change_time(df, times_lst)

In [None]:
# Print the dataframe summary again after the feature engineering steps
df.info()

---
EDA - Part 2
---

##### Plots

In [None]:
# Columns to plot, split into categorical and numerical features.
# 'usd_pledged' is not listed: it is removed together with the other dropped
# features during data cleaning, so it no longer exists in df.
categorical = ['country','currency', 'staff_pick', 'categories','duration']
numerical = ['goal', 'converted_pledged_amount']

In [None]:
def bar_plot(df, column):
    """Draw one frequency bar chart per categorical column.

    Args:
        df (pd dataframe): Dataframe
        column (object): list of names of columns which should be plotted
    """
    for feature in column:
        # Occurrences of every distinct value in this column
        counts = df[feature].value_counts()
        labels = counts.index

        plt.figure(figsize=(12, 3))
        plt.bar(labels, counts, color='#87c442', edgecolor='black')
        # One tick per distinct value, labels rotated so they stay readable
        plt.xticks(labels, labels.values, rotation=90)
        plt.ylabel("Frequency")
        plt.title(feature.capitalize())
        plt.show()

##### Categorical Data

In [None]:
# One bar chart per column listed in `categorical`
bar_plot(df, categorical)

##### Numerical Data

In [None]:
# Numerical columns that are plotted as histograms
numerical = ['goal', 'converted_pledged_amount']

In [None]:
# Remove outliers: keep only the rows whose goal lies within 3 standard deviations of
# the mean. The boolean mask selects rows, so the result has to replace df itself -
# assigning the filtered dataframe to the single column df['goal'] filters nothing.
df = df[np.abs(df.goal - df.goal.mean()) <= (3 * df.goal.std())]

In [None]:
# Histogram of 'goal', limited to values between 0 and 200000
plt.figure(figsize = (12,3))
plt.hist(df['goal'], bins = None, facecolor = '#87c442', edgecolor = 'black', range = [0.0,200000])

In [None]:
# Histogram of 'converted_pledged_amount', limited to values between 0 and 200000
plt.figure(figsize = (12,3))
plt.hist(df['converted_pledged_amount'], bins = None, facecolor = '#87c442', edgecolor = 'black', range = [0.0,200000]);

##### Misc

In [None]:
# Drop id (not needed anymore); errors='ignore' keeps the cell re-runnable
df = df.drop(columns='id', errors='ignore')
# 'disable_communication' and 'usd_pledged' need no handling here: both are already
# part of dropped_features in the data cleaning step.

# Make sure the target is numeric (successful -> 1, failed -> 0). Restricted to the
# 'state' column so no other column is rewritten; a no-op if it is already encoded.
df['state'] = df['state'].replace({'successful': 1, 'failed': 0})

#### One-hot encoding

In [None]:
# One-hot encode the categorical features and swap them in for the original columns.
# NOTE(review): get_dummies on a dataframe encodes object/category columns by default;
# if 'staff_pick' is boolean it is presumably passed through unchanged - confirm.
one_hot_featurelist = ['country', 'currency', 'staff_pick', 'categories', 'duration']
one_hot = pd.get_dummies(df[one_hot_featurelist])
# Remove the raw columns, then attach the encoded ones (joined on the index)
df = df.drop(columns=one_hot_featurelist).join(one_hot)

#### Scalerize

In [None]:
def scale_columns(df, column):
    """Function that scales the data with a min_max scaler

    Each column is fitted and transformed on its own, and the dataframe that was
    passed in is modified in place.

    Args:
        df (dataframe): Dataframe
        column (object): Name or list of names including the columns which should be normalized

    Returns:
        Dataframe object: Returns the dataframe including the normalized columns
    """
    # A single name is wrapped in a list; iterating a bare string would otherwise
    # walk over its characters and look up one-letter column names.
    columns = [column] if isinstance(column, str) else list(column)

    scaler = MinMaxScaler()
    for name in columns:
        # Re-fitting per column gives every column its own min/max
        scaler.fit(df[[name]])
        # transform returns an (n, 1) array; flatten it to a plain column
        df[name] = scaler.transform(df[[name]]).ravel()

    return df

In [None]:
# Min-max normalize the numerical features.
# 'usd_pledged' is not scaled: it is removed with the other dropped features during
# data cleaning, so selecting it here would raise a KeyError.
nummerical = ['goal', 'converted_pledged_amount']
df = scale_columns(df, nummerical)