## Kickstarter research 
### Processing data from [Funding Successful Projects on Kickstarter](https://www.kaggle.com/dilipajm/kickstarter-project-funding-prediction/data)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as stats 
import seaborn as snsa
import os
import re

# For Text processing
from wordcloud import WordCloud, STOPWORDS  

In [2]:
# Move into the folder that holds ks_train.csv / ks_test.csv so the relative
# read_csv paths used later resolve. The location can be overridden with the
# KICKSTARTER_DATA_DIR environment variable; when it is unset, the original
# author's local path is used, so existing behaviour is unchanged.
DATA_DIR = os.environ.get(
    'KICKSTARTER_DATA_DIR',
    r'C:\Users\Owner\OneDrive\Research\Tech\DS_Projects\Other\Marketing\Kickstarter\Data',
)
os.chdir(DATA_DIR)

In [3]:
# Make every top-level expression in a cell display its value, not only the
# last one. Later cells rely on this to show e.g. both `.dtypes` and `.head()`
# from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Feature Comparison: Train vs. Test
Train adds two response columns that Test lacks: <font color=pink> Backer Count </font> (`backers_count`) and <font color=pink> Success/Fail </font> (`final_status`). <br>
   Notably missing from both sets is the <font color=green> amount pledged. </font> 

In [12]:
# Load the labelled training set from the current working directory, then
# preview the first rows and list the column names.
data = pd.read_csv('ks_train.csv')
data.head()
data.columns

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at,backers_count,final_status
0,kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them...,20.0,drawing-for-dollars,False,US,USD,1241333999,1241334017,1240600507,1240602723,3,1
1,kkst1474482071,Sponsor Dereck Blackburn (Lostwars) Artist in ...,"I, Dereck Blackburn will be taking upon an inc...",300.0,sponsor-dereck-blackburn-lostwars-artist-in-re...,False,US,USD,1242429000,1242432018,1240960224,1240975592,2,0
2,kkst183622197,Mr. Squiggles,So I saw darkpony's successfully funded drawin...,30.0,mr-squiggles,False,US,USD,1243027560,1243027818,1242163613,1242164398,0,0
3,kkst597742710,Help me write my second novel.,Do your part to help out starving artists and ...,500.0,help-me-write-my-second-novel,False,US,USD,1243555740,1243556121,1240963795,1240966730,18,1
4,kkst1913131122,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, current...",2000.0,support-casting-my-sculpture-in-bronze,False,US,USD,1243769880,1243770317,1241177914,1241180541,1,0


Index(['project_id', 'name', 'desc', 'goal', 'keywords',
       'disable_communication', 'country', 'currency', 'deadline',
       'state_changed_at', 'created_at', 'launched_at', 'backers_count',
       'final_status'],
      dtype='object')

In [13]:
# Load the test set from the current working directory and list its columns
# for comparison with the train set.
data_test = pd.read_csv('ks_test.csv')
data_test.columns

Index(['project_id', 'name', 'desc', 'goal', 'keywords',
       'disable_communication', 'country', 'currency', 'deadline',
       'state_changed_at', 'created_at', 'launched_at'],
      dtype='object')

## Formatting Dates 
Dates are stored as Unix timestamps (seconds since 1970-01-01), which can be converted with ```pd.to_datetime``` by setting `unit='s'` (with `origin='unix'`, which is also the default).

In [17]:
# Convert the four Unix-epoch (seconds) timestamp columns of the train set to
# pandas datetimes, one column at a time, then show the resulting dtypes and a
# preview of the converted values.
date_cols = ['deadline', 'state_changed_at', 'created_at', 'launched_at']
for col in date_cols:
    data[col] = pd.to_datetime(data[col], origin='unix', unit='s')
data[date_cols].dtypes
data[date_cols].head()

deadline            datetime64[ns]
state_changed_at    datetime64[ns]
created_at          datetime64[ns]
launched_at         datetime64[ns]
dtype: object

Unnamed: 0,deadline,state_changed_at,created_at,launched_at
0,2009-05-03 06:59:59,2009-05-03 07:00:17,2009-04-24 19:15:07,2009-04-24 19:52:03
1,2009-05-15 23:10:00,2009-05-16 00:00:18,2009-04-28 23:10:24,2009-04-29 03:26:32
2,2009-05-22 21:26:00,2009-05-22 21:30:18,2009-05-12 21:26:53,2009-05-12 21:39:58
3,2009-05-29 00:09:00,2009-05-29 00:15:21,2009-04-29 00:09:55,2009-04-29 00:58:50
4,2009-05-31 11:38:00,2009-05-31 11:45:17,2009-05-01 11:38:34,2009-05-01 12:22:21


In [10]:
# Convert the four Unix-epoch (seconds) timestamp columns of the test set to
# pandas datetimes, one column at a time, then show the resulting dtypes and a
# preview of the converted values.
date_cols = ['deadline', 'state_changed_at', 'created_at', 'launched_at']
for col in date_cols:
    data_test[col] = pd.to_datetime(data_test[col], origin='unix', unit='s')
data_test[date_cols].dtypes
data_test[date_cols].head()


deadline            datetime64[ns]
state_changed_at    datetime64[ns]
created_at          datetime64[ns]
launched_at         datetime64[ns]
dtype: object

Unnamed: 0,deadline,state_changed_at,created_at,launched_at
0,2015-12-08 23:59:45,2015-12-08 23:59:45,2015-10-28 03:23:01,2015-10-29 22:59:45
1,2016-01-22 04:07:00,2016-01-22 04:07:00,2015-12-16 20:22:03,2015-12-18 04:07:00
2,2016-01-03 00:25:00,2016-01-03 00:25:00,2015-11-26 23:42:36,2015-11-28 00:55:28
3,2015-10-16 18:51:58,2015-10-16 18:52:10,2015-08-30 20:33:50,2015-09-16 18:51:58
4,2016-05-01 02:14:00,2016-05-01 02:14:04,2016-02-18 03:14:36,2016-03-18 21:01:30


Run summary stats on date fields. <br><br>
*Note* <font color=pink> Train</font> set covers 2009-2015 while <font color=green> Test </font> covers 2015-2017 <br><br> 
From this point, we will only look at the Train set to connect data to outcomes. 

In [19]:
# Label the output, then summarise the train-set launch timestamps
# (row count, number of distinct values, most frequent value, and the
# earliest/latest launch).
print("train set") 
data.launched_at.describe()

train set


count                  108129
unique                 108039
top       2012-11-08 19:02:18
freq                        2
first     2009-04-24 19:52:03
last      2015-05-26 16:41:13
Name: launched_at, dtype: object

SyntaxError: invalid syntax (<ipython-input-4-fefa71e4e4bd>, line 1)