In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from datetime import date

In [2]:
df = pd.read_csv('ks-projects-201801.csv')

In [3]:
df.shape

(378661, 15)

In [4]:
df.columns

Index(['ID', 'name', 'category', 'main_category', 'currency', 'deadline',
       'goal', 'launched', 'pledged', 'state', 'backers', 'country',
       'usd pledged', 'usd_pledged_real', 'usd_goal_real'],
      dtype='object')

In [5]:
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [6]:
df.drop(['ID', 'name', 'category', 'currency', 'country', 'goal', 'pledged'],
        axis = 1,
        inplace = True)

In [7]:
df.head()

Unnamed: 0,main_category,deadline,launched,state,backers,usd pledged,usd_pledged_real,usd_goal_real
0,Publishing,2015-10-09,2015-08-11 12:12:28,failed,0,0.0,0.0,1533.95
1,Film & Video,2017-11-01,2017-09-02 04:43:57,failed,15,100.0,2421.0,30000.0
2,Film & Video,2013-02-26,2013-01-12 00:20:50,failed,3,220.0,220.0,45000.0
3,Music,2012-04-16,2012-03-17 03:24:11,failed,1,1.0,1.0,5000.0
4,Film & Video,2015-08-29,2015-07-04 08:35:03,canceled,14,1283.0,1283.0,19500.0


In [8]:
# change datatype to datetime 64
df['deadline'] = pd.to_datetime(df.deadline)

# change launched to only date
df['launched'] = pd.to_datetime(df.launched)
df['launched'] = df.launched.dt.date
df['launched'] = pd.to_datetime(df.launched)

In [9]:
# calculate time differece
df['delta'] = (df.deadline - df.launched).dt.days

# drop deadline and launched dates
df.drop(['deadline', 'launched'], axis = 1, inplace = True)

In [10]:
print(df.main_category.unique())
print(df.state.unique())

['Publishing' 'Film & Video' 'Music' 'Food' 'Design' 'Crafts' 'Games'
 'Comics' 'Fashion' 'Theater' 'Art' 'Photography' 'Technology' 'Dance'
 'Journalism']
['failed' 'canceled' 'successful' 'live' 'undefined' 'suspended']


In [11]:
df.groupby('state')['state'].count()

state
canceled       38779
failed        197719
live            2799
successful    133956
suspended       1846
undefined       3562
Name: state, dtype: int64

* 'state' will be the target variable
* 'canceled + failed + suspended' together as 'failure' = 0
* 'sucessful' = 1
* store 'live' and 'undefined' in a separate dataframe and run it with the final model

In [12]:
# store 'live' in live_df and 'undefined' in undef_df
live_df = df.loc[df['state'] == 'live']
undef_df = df.loc[df['state'] == 'undefined']

In [13]:
# create new dataframe without 'live state' and 'undefined state'
new_df = df.drop(live_df.index | undef_df.index)
new_df['state'] = new_df.state.apply(lambda x: 1 if x == 'successful' else 0)

In [14]:
# transform object variables to binary 
new_df = pd.get_dummies(new_df)

# drop null values
new_df = new_df.dropna()

In [15]:
from sklearn.preprocessing import MinMaxScaler
scale = MinMaxScaler()
col_names = new_df.columns   #name of columns
new_df = scale.fit_transform(new_df)
new_df = pd.DataFrame(new_df, columns = col_names)

In [18]:
# take a subsample of original dataset
sample_df = new_df.copy()

X = sample_df['state']
Y = sample_df.loc[:, ~sample_df.columns.isin(['state'])]

In [19]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(sample_df, test_size = 0.2)

#set target variable and training + test set
Y_train, Y_test = train_df['state'], test_df['state']
X_train, X_test = train_df.loc[:, ~train_df.columns.isin(['state'])], test_df.loc[:, ~test_df.columns.isin(['state'])]

In [20]:
from sklearn.neural_network import MLPClassifier

# establish and fit the model, with a double layer, 100 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(100, 2))
mlp.fit(X_train, Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [21]:
# training set
mlp.score(X_train, Y_train)

0.94318533052020481

In [22]:
Y_train.value_counts()/len(Y_train)

0.0    0.639754
1.0    0.360246
Name: state, dtype: float64

In [23]:
# test set
mlp.score(X_test, Y_test)

0.9426989544978096

In [24]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=5)

NameError: name 'X' is not defined

    FIX THIS

## Predice 'Live State'

In [144]:
# transform object variables to binary 
live_df = pd.get_dummies(live_df)

# drop null values
live_df = live_df.dropna()

In [146]:
from sklearn.preprocessing import MinMaxScaler
scale = MinMaxScaler()
col_names = live_df.columns   #name of columns
live_df = scale.fit_transform(live_df)
live_df = pd.DataFrame(live_df, columns = col_names)

In [149]:
# use model to predict 'live state'
Y_live = live_df['state_live']
X_live = live_df.loc[:, ~live_df.columns.isin(['state_live'])]

mlp.score(X_live)

0.97498213009292356

    Can't score the 'live state', can only make predictions
        use predict