## Kickstarter research 
### Processing data from [Funding Successful Projects on Kickstarter](https://www.kaggle.com/dilipajm/kickstarter-project-funding-prediction/data)

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as stats 
import seaborn as snsa
import os
import re

# For Text processing
from wordcloud import WordCloud, STOPWORDS  
import nltk 
# Lemmatizer reduces words into their root form: wolves -> wolf, jumping -> jump, etc. 
from nltk.stem import WordNetLemmatizer as WNL    
from nltk.corpus import wordnet 
from nltk import pos_tag
from nltk.corpus import stopwords

In [2]:
# Show the value of every top-level expression in a cell, not only the last one,
# so cells below can list several bare expressions (e.g. .head(), .columns, .shape).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Feature Comparison: Train vs. Test
Train adds two responses: <font color=pink> Backer Count </font> and <font color=pink> Success/Fail </font> <br>
   Notably missing was the <font color=green> amount pledged. </font> 

In [9]:
# Load the training set, then preview it: first rows, column names, and (rows, columns).
# Each bare expression is kept as its own statement so that each one is displayed.
data = pd.read_csv('ks_train.csv')
data.head()
data.columns
data.shape

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at,backers_count,final_status
0,kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them...,20.0,drawing-for-dollars,False,US,USD,1241333999,1241334017,1240600507,1240602723,3,1
1,kkst1474482071,Sponsor Dereck Blackburn (Lostwars) Artist in ...,"I, Dereck Blackburn will be taking upon an inc...",300.0,sponsor-dereck-blackburn-lostwars-artist-in-re...,False,US,USD,1242429000,1242432018,1240960224,1240975592,2,0
2,kkst183622197,Mr. Squiggles,So I saw darkpony's successfully funded drawin...,30.0,mr-squiggles,False,US,USD,1243027560,1243027818,1242163613,1242164398,0,0
3,kkst597742710,Help me write my second novel.,Do your part to help out starving artists and ...,500.0,help-me-write-my-second-novel,False,US,USD,1243555740,1243556121,1240963795,1240966730,18,1
4,kkst1913131122,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, current...",2000.0,support-casting-my-sculpture-in-bronze,False,US,USD,1243769880,1243770317,1241177914,1241180541,1,0


Index(['project_id', 'name', 'desc', 'goal', 'keywords',
       'disable_communication', 'country', 'currency', 'deadline',
       'state_changed_at', 'created_at', 'launched_at', 'backers_count',
       'final_status'],
      dtype='object')

(108129, 14)

In [4]:
# Load the test set and list its columns for comparison with the training set.
data_test = pd.read_csv('ks_test.csv')
data_test.columns

Index(['project_id', 'name', 'desc', 'goal', 'keywords',
       'disable_communication', 'country', 'currency', 'deadline',
       'state_changed_at', 'created_at', 'launched_at'],
      dtype='object')

## Formatting Dates 
Dates are stored as Unix timestamps (seconds since the epoch). Convert them with ```pd.to_datetime```, passing `origin='unix'` and `unit='s'`.

In [5]:
# Convert the training set's Unix-epoch (seconds) timestamp columns to datetimes.
train_date_cols = ['deadline', 'state_changed_at', 'created_at', 'launched_at']
for date_col in train_date_cols:
    data[date_col] = pd.to_datetime(data[date_col], origin='unix', unit='s')
# Check the resulting dtypes and preview the converted values.
data[train_date_cols].dtypes
data[train_date_cols].head()

deadline            datetime64[ns]
state_changed_at    datetime64[ns]
created_at          datetime64[ns]
launched_at         datetime64[ns]
dtype: object

Unnamed: 0,deadline,state_changed_at,created_at,launched_at
0,2009-05-03 06:59:59,2009-05-03 07:00:17,2009-04-24 19:15:07,2009-04-24 19:52:03
1,2009-05-15 23:10:00,2009-05-16 00:00:18,2009-04-28 23:10:24,2009-04-29 03:26:32
2,2009-05-22 21:26:00,2009-05-22 21:30:18,2009-05-12 21:26:53,2009-05-12 21:39:58
3,2009-05-29 00:09:00,2009-05-29 00:15:21,2009-04-29 00:09:55,2009-04-29 00:58:50
4,2009-05-31 11:38:00,2009-05-31 11:45:17,2009-05-01 11:38:34,2009-05-01 12:22:21


In [6]:
# Convert the test set's Unix-epoch (seconds) timestamp columns to datetimes.
test_date_cols = ['deadline', 'state_changed_at', 'created_at', 'launched_at']
for date_col in test_date_cols:
    data_test[date_col] = pd.to_datetime(data_test[date_col], origin='unix', unit='s')
# Check the resulting dtypes and preview the converted values.
data_test[test_date_cols].dtypes
data_test[test_date_cols].head()


deadline            datetime64[ns]
state_changed_at    datetime64[ns]
created_at          datetime64[ns]
launched_at         datetime64[ns]
dtype: object

Unnamed: 0,deadline,state_changed_at,created_at,launched_at
0,2015-12-08 23:59:45,2015-12-08 23:59:45,2015-10-28 03:23:01,2015-10-29 22:59:45
1,2016-01-22 04:07:00,2016-01-22 04:07:00,2015-12-16 20:22:03,2015-12-18 04:07:00
2,2016-01-03 00:25:00,2016-01-03 00:25:00,2015-11-26 23:42:36,2015-11-28 00:55:28
3,2015-10-16 18:51:58,2015-10-16 18:52:10,2015-08-30 20:33:50,2015-09-16 18:51:58
4,2016-05-01 02:14:00,2016-05-01 02:14:04,2016-02-18 03:14:36,2016-03-18 21:01:30


Run summary stats on date fields. <br><br>
*Note:* the <font color=pink>Train</font> set covers 2009-2015, while the <font color=green>Test</font> set covers 2015-2017. <br><br> 
From this point, we will only look at the Train set to connect data to outcomes. <br>
We also notice that the <font color=green>average success rate is 32%</font> for the dataset.

In [94]:
# Summarise launch timestamps for the training set and report the mean of the
# 0/1 final_status column, i.e. the share of projects marked successful.
print("train set")
launch_summary = data['launched_at'].describe()
launch_summary
success_rate = np.mean(data['final_status'])
print("Avg. Success Rate: ", success_rate)

train set


count    1.081290e+05
mean     1.377299e+09
std      4.294421e+07
min      1.240603e+09
25%      1.343917e+09
50%      1.390870e+09
75%      1.412807e+09
max      1.432658e+09
Name: launched_at, dtype: float64

Avg. Success Rate:  0.3196274819891056


Count the number of unique values in each field.

In [90]:
# axis=0 counts distinct values down each column, giving one count per field.
data.nunique(axis=0)

project_id               108129
name                     107838
desc                     107377
goal                       3567
keywords                 108129
disable_communication         2
country                      11
currency                      9
deadline                 100933
state_changed_at         102417
created_at               108062
launched_at              108039
backers_count              2426
final_status                  2
sust_kw                       2
dtype: int64

### Research on Key words 
Data Studies with keyword/text fields: <br> 
1. Funding Successful Projects (320k records) 
2. Kaggle Project Stats (4k study): Atlantic Study <br> 

Title: 3 studies had a title/name field, which can also be searched for the following words: <br> 
eco, sustainability, zero-waste, men, unisex, vegan, plant-based, travel, multi-functional, functional, versatile, modern

In [19]:
# Naive first pass: keep every project whose keyword slug contains any search term as a
# substring. This over-counts, because a term also matches inside longer words
# (e.g. 'men' matches 'women' and 'documentary').
# The terms must be a flat list of strings: '|'.join() raises TypeError on nested lists.
str_list = ['eco', 'sustainability', 'sustainable', 'zero waste', 'zero-waste', 'men', 'unisex',
            'vegan', 'plant-based', 'travel', 'multi-functional', 'functional', 'versatile', 'modern']
# Escape each term so it is matched literally, then OR the terms into a single regex.
str_pattern = '|'.join(re.escape(term) for term in str_list)
# na=False treats a missing keyword as "no match" instead of putting NaN into the mask.
str_records_all = data[data.keywords.str.contains(str_pattern, na=False)]
print("number of records with keywords: ", len(str_records_all))
# Fixed seed so the preview shows the same rows on every run.
str_records_all.sample(n=3, random_state=42)

number of records with keywords:  7733


Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at,backers_count,final_status
12065,kkst1133389511,Recording an EP,Songwriter/Producer looking for help to record...,7000.0,recording-an-ep,False,US,USD,1319857760,1319857760,1315967205,1315969760,0,0
58924,kkst187198617,"EveryWare: Your OS, Software and development t...",EveryWare is an online platform that allows yo...,50000.0,everyware-your-os-software-and-development-too...,False,CA,CAD,1400741899,1400741900,1393525438,1396853899,0,0
92490,kkst140446887,Demo Development,"My first single ALIVE is making the rounds, bu...",1800.0,demo-development,False,CA,CAD,1424469600,1424469616,1415647060,1421508890,1,0


### Text Processing 
Note: a plain `str.contains` is not enough, because it also matches search terms that appear inside longer words. <br> 
 Example: "documentary" and "women" both contain the string "men". 

In [84]:
def _slug_words(text):
    """Split text on hyphens/whitespace into lowercase words, dropping words of 1-2 characters."""
    return [word for word in text.lower().replace("-", " ").split() if len(word) > 2]


def str_presence(input_string, search_string):
    """Return 1 if any search term occurs as whole word(s) in a hyphenated string, else 0.

    Parameters
    ----------
    input_string : str
        Hyphen-separated text such as ``"help-me-write-my-second-novel"``.
        Anything that is not a ``str`` (e.g. NaN or None) yields 0.
    search_string : str or iterable of str
        One term or a collection of terms. A bare string is treated as a single
        term rather than being iterated character by character. A term may span
        several words (``"zero-waste"`` or ``"zero waste"``); it then has to
        appear as consecutive words of ``input_string``.

    Returns
    -------
    int
        1 when at least one term is present, otherwise 0.

    Notes
    -----
    Matching is on whole words, so ``"men"`` does not match ``"women"``.
    Words of one or two characters are ignored on both sides, and comparison
    is case-insensitive.
    """
    if not isinstance(input_string, str):
        return 0
    if isinstance(search_string, str):
        search_string = [search_string]

    # Delimit every word with '-' so a substring test can only hit whole words
    # (or runs of consecutive whole words, for multi-word terms).
    haystack = "-" + "-".join(_slug_words(input_string)) + "-"
    for term in search_string:
        if not isinstance(term, str):
            continue
        term_words = _slug_words(term)
        if term_words and ("-" + "-".join(term_words) + "-") in haystack:
            return 1
    return 0

In [85]:
def str_presence_full(string_column, search_string):
    """Apply ``str_presence`` to every entry of ``string_column``.

    Returns a list holding one ``str_presence`` result per entry, in the
    same order as the entries were iterated.
    """
    return [str_presence(entry, search_string) for entry in string_column]

In [101]:
# Add one 0/1 flag column per theme, computed from the keyword slug.
# Terms are always given as a list, never a bare string: a bare string is itself an
# iterable of single characters, which is not what a list of search terms means.
# NOTE(review): hyphenated/multi-word terms ('zero-waste', 'plant-based', 'ultra-functional')
# only match if str_presence supports phrase matching — verify against its implementation.
flag_terms = {
    'sust_fl': ['sustainable', 'sustainability'],
    'eco_fl': ['eco'],
    'zero_waste_fl': ['zero-waste', 'zero waste'],
    'men_fl': ['men', 'male'],
    'unisex_fl': ['unisex'],
    'vegan_fl': ['vegan', 'plant-based', 'plant based'],
    'travel_fl': ['travel'],
    'functional_fl': ['functional', 'ultra-functional', 'ultra functional'],
    'versatile_fl': ['versatile'],
    'modern_fl': ['modern', 'modernity'],
}
for flag_col, terms in flag_terms.items():
    data[flag_col] = str_presence_full(data.keywords, terms)