In [1]:
import re
import joblib
import pandas as pd
import numpy as np
import spacy
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from category_encoders import OrdinalEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier 
from sklearn.model_selection import RandomizedSearchCV

In [2]:
df = pd.read_csv('kickstarter.csv')

In [3]:
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [4]:
print(df.shape)

(378661, 15)


In [5]:
df = df[(df['state'] == 'failed') | (df['state'] == 'successful')].copy()
print(df.shape)

(331675, 15)


In [6]:
print(list(df.main_category.unique()))

['Publishing', 'Film & Video', 'Music', 'Food', 'Crafts', 'Games', 'Design', 'Comics', 'Fashion', 'Theater', 'Art', 'Photography', 'Technology', 'Dance', 'Journalism']


In [7]:
print(list(df.category.unique()))

['Poetry', 'Narrative Film', 'Music', 'Restaurants', 'Food', 'Drinks', 'Nonfiction', 'Indie Rock', 'Crafts', 'Games', 'Tabletop Games', 'Design', 'Comic Books', 'Art Books', 'Fashion', 'Childrenswear', 'Theater', 'Comics', 'DIY', 'Webseries', 'Animation', 'Food Trucks', 'Product Design', 'Public Art', 'Documentary', 'Illustration', 'Photography', 'Pop', 'People', 'Art', 'Family', 'Fiction', 'Film & Video', 'Accessories', 'Rock', 'Hardware', 'Software', 'Weaving', 'Web', 'Jazz', 'Ready-to-wear', 'Festivals', 'Video Games', 'Anthologies', 'Publishing', 'Shorts', 'Gadgets', 'Electronic Music', 'Radio & Podcasts', 'Cookbooks', 'Apparel', 'Metal', 'Comedy', 'Hip-Hop', 'Periodicals', 'Dance', 'Technology', 'Painting', 'World Music', 'Photobooks', 'Drama', 'Architecture', 'Young Adult', 'Latin', 'Mobile Games', 'Flight', 'Fine Art', 'Action', 'Playing Cards', 'Makerspaces', 'Punk', "Children's Books", 'Apps', 'Audio', 'Performance Art', 'Ceramics', 'Vegan', 'Graphic Novels', 'Fabrication Tool

In [8]:
df['launched'] = pd.to_datetime(df['launched'])
df['deadline'] = pd.to_datetime(df['deadline'])

df['duration_days'] = df['deadline'].subtract(df['launched'])
df['duration_days'] = df['duration_days'].astype('timedelta64[D]')

In [9]:
df = df.assign(outcome=(df['state'] == 'successful').astype(int))

In [10]:
df = df.drop("ID", 1)

In [11]:
df = df.drop("deadline", 1)

In [12]:
df = df.drop("launched", 1)

In [13]:
df = df.drop("state", 1)

In [14]:
df = df.drop("usd_goal_real", 1)

In [15]:
df = df.drop("name", 1)

In [16]:
df = df.drop("usd_pledged_real", 1)

In [17]:
df = df.drop("usd pledged", 1)

In [18]:
df = df.drop("pledged", 1)

In [19]:
df = df.drop("backers", 1)

In [20]:
df.head()

Unnamed: 0,category,main_category,currency,goal,country,duration_days,outcome
0,Poetry,Publishing,GBP,1000.0,GB,58.0,0
1,Narrative Film,Film & Video,USD,30000.0,US,59.0,0
2,Narrative Film,Film & Video,USD,45000.0,US,44.0,0
3,Music,Music,USD,5000.0,US,29.0,0
5,Restaurants,Food,USD,50000.0,US,34.0,1


In [21]:
y = df['outcome']

print(y.shape)
y.head(5)

(331675,)


0    0
1    0
2    0
3    0
5    1
Name: outcome, dtype: int32

In [22]:
df = df.drop('outcome', 1)

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(
    df,
    y, 
    test_size = 0.2,
    random_state=69
)

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(265340, 6) (265340,)
(66335, 6) (66335,)


In [25]:
model_xgb = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    XGBClassifier(random_state=42, n_jobs=-1, n_estimators=5, max_depth=100)
)

model_xgb.fit(X_train, Y_train);





In [26]:
print('XGBoost Training Accuracy:', model_xgb.score(X_train, Y_train))
print('XGBoost Validation Accuracy:', model_xgb.score(X_test, Y_test))

XGBoost Training Accuracy: 0.7792718775910153
XGBoost Validation Accuracy: 0.6599231175096103


In [27]:
parms = {
    'xgbclassifier__learning_rate'    : [0.03, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    'xgbclassifier__max_depth'        : range(80, 200, 20),
    'xgbclassifier__min_child_weight' : [8, 9, 10, 11, 12, 13, 14],
    'xgbclassifier__gamma'            : [0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
    'xgbclassifier__colsample_bytree' : [0.01, 0.02, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9],
    'xgbclassifier__n_estimators'     : [3, 10, 30]
    
}

In [28]:
model_xgb2 = RandomizedSearchCV(
    model_xgb,
    param_distributions=parms,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    verbose=1
)

model_xgb2.fit(X_train, Y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  elif pd.api.types.is_categorical(cols):




RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('ordinalencoder',
                                              OrdinalEncoder(cols=['category',
                                                                   'main_category',
                                                                   'currency',
                                                                   'country'],
                                                             mapping=[{'col': 'category',
                                                                       'data_type': dtype('O'),
                                                                       'mapping': Music                1
Film & Video         2
Blues                3
Video Games          4
Spaces               5
                  ... 
Video              156
Pottery            157
Taxidermy          158
Literary Spaces    159
NaN                 -2
Length: 160, dtype: int64},
                                         

In [29]:
model_xgb2.score(X_test, Y_test)
model_xgb2.score(X_train, Y_train)

0.7198273912715761

In [30]:
model_xgb2.best_params_

{'xgbclassifier__n_estimators': 30,
 'xgbclassifier__min_child_weight': 8,
 'xgbclassifier__max_depth': 80,
 'xgbclassifier__learning_rate': 0.1,
 'xgbclassifier__gamma': 0.3,
 'xgbclassifier__colsample_bytree': 0.7}

In [31]:
model_xgb = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    XGBClassifier(random_state=42, n_jobs=-1, n_estimators=10, min_child_weight=10, max_depth=100, learning_rate=0.25, gamma=.2, colsample_bytree=0.7)
)

model_xgb.fit(X_train, Y_train);





In [32]:
print('XGBoost Training Accuracy:', model_xgb.score(X_train, Y_train))
print('XGBoost Validation Accuracy:', model_xgb.score(X_test, Y_test))

XGBoost Training Accuracy: 0.7108502298937213
XGBoost Validation Accuracy: 0.6808019898997513


In [34]:
pickle_out = open("model_xgb2.pkl","wb")
pickle.dump(model_xgb2, pickle_out)
pickle_out.close()