In [76]:
import pandas as pd

In [77]:
df = pd.read_csv('data_small.csv')
print(df.shape)

(289551, 17)


In [78]:
df.head()

Unnamed: 0,id,campaignName,description,categories,duration,monetaryGoal,country,city,state,date_created,deadline_date,launched_date,state_changed_at,backers_count,usd_pledged,url,target
0,1123889576,mars-on-earth-an-art-residency,Help a fine art photographer continue her proj...,Space Exploration,27,1000,US,Boston,MA,2015-06-24,2015-10-23,2015-09-26,2015-10-23,53,1884,https://www.kickstarter.com/projects/cassandra...,successful
1,1724173143,vulcan-i-rocket-powered-by-3d-printed-engine,Team of undergraduates racing to be the first ...,Space Exploration,30,15000,US,San Diego,CA,2014-05-06,2015-05-21,2015-04-21,2015-05-21,465,21882,https://www.kickstarter.com/projects/105499101...,successful
2,707260502,starscraper-the-next-generation-of-suborbital-...,What if we built a rocket that is better than ...,Space Exploration,31,10000,US,Boston,MA,2014-11-29,2015-01-09,2014-12-09,2015-01-09,294,17176,https://www.kickstarter.com/projects/burpg/sta...,successful
3,497637964,students-building-a-near-space-balloon-with-li...,A group of high school students are building a...,Space Exploration,30,150,US,Mountain View,CA,2014-11-19,2015-11-26,2015-10-27,2015-11-26,45,970,https://www.kickstarter.com/projects/136362214...,successful
4,1546008758,earth-360,Re-inventing the way we look at our planet by ...,Space Exploration,30,7500,US,Fairfield,CT,2012-04-11,2012-09-21,2012-08-22,2012-09-21,28,7576,https://www.kickstarter.com/projects/211370922...,successful


## Encode Strings

In [84]:
def create_dictionary(data):
    temp_dict = {}
    for key in data.iteritems():
        if key[1] not in temp_dict.keys():
            temp_dict[key[1]] = len(temp_dict)
    return temp_dict

In [85]:
# create dictionaries for string variables
city_dict = create_dictionary(df['city'])
state_dict = create_dictionary(df['state'])
country_dict = create_dictionary(df['country'])
cat_dict = create_dictionary(df['categories'])

# map dictionaries to dataframe
df['categories'] = df['categories'].map(cat_dict)
df['country'] = df['country'].map(country_dict)
df['city'] = df['city'].map(city_dict)
df['state'] = df['state'].map(state_dict)
df['target'] = df['target'].map({'failed':0, 'successful':1})

## Train/Test Split

In [86]:
from sklearn.model_selection import cross_val_score, train_test_split

In [87]:
# pick features
target = 'target'
features = ['categories', 'duration'
            , 'monetaryGoal', 'country']

# train/test split
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=22)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((231640, 4), (57911, 4), (231640,), (57911,))

In [88]:
X_train.head()

Unnamed: 0,categories,duration,monetaryGoal,country
132205,3,20,3000,0
271533,61,30,2500,0
260566,151,32,3500,0
201906,110,30,1000,0
230423,124,30,2000,0


## Model

In [93]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [90]:
model = RandomForestClassifier()
model.fit(X_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [91]:
%%time

y_pred = model.predict(X_test)
len(set(y_pred))
score = accuracy_score(y_pred, y_test)
print(f'model 1 accuracy score {score}')

model 1 accuracy score 0.7258551915870906
CPU times: user 281 ms, sys: 8.42 ms, total: 289 ms
Wall time: 333 ms


In [92]:
import pickle
# pickle.dump(model, open('model_rf_wed.pkl', 'wb'))

In [94]:
model2 = LogisticRegression()
model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [95]:
y_pred = model.predict(X_test)
len(set(y_pred))
score = accuracy_score(y_pred, y_test)
print(f'model 1 accuracy score {score}')

model 1 accuracy score 0.7264422993904439


In [97]:
from xgboost import XGBClassifier

In [100]:
model2 = XGBClassifier()
model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [101]:
y_pred = model.predict(X_test)
len(set(y_pred))
score = accuracy_score(y_pred, y_test)
print(f'model 1 accuracy score {score}')

model 1 accuracy score 0.7274265683548894


## Hyperparameter Tuning

In [None]:
rfc = RandomForestClassifier()
xgb = XGBClassifier()
lr = LogisticRegression()

param_grid = {'n_jobs': [-1]
            , 'n_estimators' :[100,500,1000]
    
}

gs_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
gs_rfc.fit(X_train, y_train)
print("rfc best params:", gs_rfc.best_params_)

In [None]:
gs_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid, cv= 5)
gs_xgb.fit(X_train, y_train)
print("rfc best params:", gs_rfc.best_params_)

## Heroku Testing

In [71]:
import json
import requests

In [72]:
url = 'http://kickstarter-success.herokuapp.com/'


data = {
  "campaignName": 1,
  "categories": 1,
  "description": 1,
  "monetaryGoal": 100000,
  "duration": 30,
  "country": 3
}

data = json.dumps(data)

In [73]:
send_request = requests.post(url, data)
print(send_request)

<Response [200]>


In [74]:
print(send_request.json())

{'custom_stats': {'average_backers': '86.4286', 'average_duration': '31.7520', 'average_over': '106212.4516', 'category_average': '141170.0625', 'category_success': '0.4023', 'raising_more_success': 154574}, 'results': 1}


In [75]:
df.head()

Unnamed: 0,id,campaignName,description,categories,duration,monetaryGoal,country,city,state,date_created,deadline_date,launched_date,state_changed_at,backers_count,usd_pledged,url,target
0,1123889576,mars-on-earth-an-art-residency,Help a fine art photographer continue her proj...,0,27,1000,0,0,0,2015-06-24,2015-10-23,2015-09-26,2015-10-23,53,1884,https://www.kickstarter.com/projects/cassandra...,1
1,1724173143,vulcan-i-rocket-powered-by-3d-printed-engine,Team of undergraduates racing to be the first ...,0,30,15000,0,1,1,2014-05-06,2015-05-21,2015-04-21,2015-05-21,465,21882,https://www.kickstarter.com/projects/105499101...,1
2,707260502,starscraper-the-next-generation-of-suborbital-...,What if we built a rocket that is better than ...,0,31,10000,0,0,0,2014-11-29,2015-01-09,2014-12-09,2015-01-09,294,17176,https://www.kickstarter.com/projects/burpg/sta...,1
3,497637964,students-building-a-near-space-balloon-with-li...,A group of high school students are building a...,0,30,150,0,2,1,2014-11-19,2015-11-26,2015-10-27,2015-11-26,45,970,https://www.kickstarter.com/projects/136362214...,1
4,1546008758,earth-360,Re-inventing the way we look at our planet by ...,0,30,7500,0,3,2,2012-04-11,2012-09-21,2012-08-22,2012-09-21,28,7576,https://www.kickstarter.com/projects/211370922...,1


In [None]:
df_money = 