In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
import pickle
# from sklearn.inspection import permutation_importance
# from pdpbox.pdp import pdp_isolate, pdp_plot, pdp_interact, pdp_interact_plot

In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/mkucz95/kickstarter_data/master/ks-projects-201801.csv')

# IA. Exploratory Data Analysis

In [6]:
# report = ProfileReport(df)
# report

In [7]:
df

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.00
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.00
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.00
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378656,999976400,ChknTruk Nationwide Charity Drive 2014 (Canceled),Documentary,Film & Video,USD,2014-10-17,50000.0,2014-09-17 02:35:30,25.0,canceled,1,US,25.0,25.0,50000.00
378657,999977640,The Tribe,Narrative Film,Film & Video,USD,2011-07-19,1500.0,2011-06-22 03:35:14,155.0,failed,5,US,155.0,155.0,1500.00
378658,999986353,Walls of Remedy- New lesbian Romantic Comedy f...,Narrative Film,Film & Video,USD,2010-08-16,15000.0,2010-07-01 19:40:30,20.0,failed,1,US,20.0,20.0,15000.00
378659,999987933,BioDefense Education Kit,Technology,Technology,USD,2016-02-13,15000.0,2016-01-13 18:13:53,200.0,failed,6,US,200.0,200.0,15000.00


# IB. Wrangle Data/Feature Engineering

In [8]:
# drop the ID , name , category(sub_category) , usd_pledged , usd_pledged_real ,usd_goal_real
df = df[['main_category','launched','deadline','currency','goal','pledged','backers','state']]

In [9]:
# turned the deadline and launched columns to datetime objects
df['deadline'] = pd.to_datetime(df['deadline'])
df['launched'] = pd.to_datetime(df['launched'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['deadline'] = pd.to_datetime(df['deadline'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['launched'] = pd.to_datetime(df['launched'])


In [10]:
# subtracted the deadline from the launched to get the campaign days and created a new feature campaign_length
df['campaign_length'] = df['deadline'] - df['launched']
df['campaign_length'] = df['campaign_length'].dt.days

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['campaign_length'] = df['deadline'] - df['launched']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['campaign_length'] = df['campaign_length'].dt.days


In [11]:
# main_category is the only category I will be concerned with
df = df.rename(columns={'main_category': 'category'})

In [12]:
df['percentage_pledged'] = (df['pledged'] / df['goal']) * 100
# divided the pledged and goal and multiplied by 100 
df = df[['category','campaign_length','currency','percentage_pledged','backers','state']]

# drop the live and undefined rows , group the cancelled and suspended states to failed
df.drop(df[(df.state == 'live')].index,inplace=True)
df.drop(df[(df.state == 'undefined')].index,inplace=True)
df.loc[df.state == "canceled", 'state'] = "failed"
df.loc[df.state == "suspended", 'state'] = "failed"

# create dummy variables of currencies 
currencies=pd.get_dummies(df.currency,prefix='currency')
df=pd.concat([df,currencies],axis=1)
df.drop('currency', axis=1, inplace=True)

# create dummy variables of categories
categories=pd.get_dummies(df.category,prefix='category')
df=pd.concat([df,categories],axis=1)
df.drop('category', axis=1, inplace=True)

# create dummy variables of states 
states=pd.get_dummies(df.state,drop_first=True)
df=pd.concat([df,states],axis=1)
df.drop('state',axis=1,inplace=True)

# II. Split Data

In [13]:
target = 'successful'
y = df[target]
X = df.drop(columns=target)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=.6, test_size=.4, random_state=1)

X_test, X_val, y_test, y_val = train_test_split(X_test, y_test,train_size=.5,test_size=.5, random_state=1)

# III. Establish Baseline

In [15]:
y_train.value_counts(normalize=True).max()

0.6408317664965529

In [16]:
df.columns

Index(['campaign_length', 'percentage_pledged', 'backers', 'currency_AUD',
       'currency_CAD', 'currency_CHF', 'currency_DKK', 'currency_EUR',
       'currency_GBP', 'currency_HKD', 'currency_JPY', 'currency_MXN',
       'currency_NOK', 'currency_NZD', 'currency_SEK', 'currency_SGD',
       'currency_USD', 'category_Art', 'category_Comics', 'category_Crafts',
       'category_Dance', 'category_Design', 'category_Fashion',
       'category_Film & Video', 'category_Food', 'category_Games',
       'category_Journalism', 'category_Music', 'category_Photography',
       'category_Publishing', 'category_Technology', 'category_Theater',
       'successful'],
      dtype='object')

In [17]:
df

Unnamed: 0,campaign_length,percentage_pledged,backers,currency_AUD,currency_CAD,currency_CHF,currency_DKK,currency_EUR,currency_GBP,currency_HKD,...,category_Film & Video,category_Food,category_Games,category_Journalism,category_Music,category_Photography,category_Publishing,category_Technology,category_Theater,successful
0,58,0.000000,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,59,8.070000,15,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,44,0.488889,3,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,29,0.020000,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,55,6.579487,14,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378656,29,0.050000,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
378657,26,10.333333,5,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
378658,45,0.133333,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
378659,30,1.333333,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# IV. Build Model

In [18]:
model = RandomForestClassifier()
model.fit(X_train,y_train)

RandomForestClassifier()

In [19]:
# list_currencies = ['AUD','CAD', 'CHF', 'DKK', 'EUR','GBP', 'HKD', 
#                  'JPY', 'MXN','NOK','NZD', 'SEK', 'SGD','USD']
# list_categories = ['Art', 'Comics', 'Crafts','Dance', 'Design','Fashion','Film & Video',
#                    'Food', 'Games','Journalism','Music', 'Photography',
#                    'Publishing', 'Technology','Theater']


# def currency_category_select(user_input,list_input):
#     for x in range(len(list_input)):
#         if list_input[x] == user_input:
#             list_input[x] = 1
#         else:
#             list_input[x] = 0
#     return list_input 


# input_currency=input('Select Currency: ')
# input_categories=input('Select Category: ')
# input_campaign_length=input('Campaign Length: ')
# input_percentage_pledged =input('percentage_pledged as int: ')
# input_backers=input('Quantity of backers: ')

# input_currency=currency_category_select(input_currency,list_currencies)
# input_category=currency_category_select(input_categories,list_categories)

# model_input = np.array([input_campaign_length,input_percentage_pledged,input_backers])
# model_input = np.append(model_input,input_currency)
# model_input = np.append(model_input,input_category).reshape(1, -1)

# if model.predict(model_input)[0] == 1:
#     print('Your kickstarter is a success!')
# else: 
#     print('Your kickstarter failed, try again.')



In [20]:
print('Training Accuracy', model.score(X_train, y_train))
print('Validation Accuracy', model.score(X_val,y_val))
print('Testing Accuracy', model.score(X_test, y_test))

Training Accuracy 0.9999328498522697
Validation Accuracy 0.9973677142089713
Testing Accuracy 0.9971528337362342


# V Pickle Data

In [21]:
with open('model_pickle_3.8','wb') as f:
     pickle.dump(model,f)

In [22]:
# with open('model_pickle','rb') as f:
#     mp=pickle.load(f)

In [23]:
# Some quick and dirty to grab a list of unique currencies and categories from the dataset, format to HTML option blocks

# unique_currencies = df['currency'].unique()
# uniques = df.category.unique()

# for i in uniques:
#     print('<option value=' + '"' + i + '"' + ">"  + i + "</option>")

In [24]:
# list2 = ['Art', 'Comics', 'Crafts','Dance', 'Design','Fashion','Film & Video',
#                    'Food', 'Games','Journalism','Music', 'Photography',
#                    'Publishing', 'Technology','Theater']

In [3]:
import pickle
import pandas as pd

def model(input_data):
    rf = pickle.load(open('model', 'rb'))

    pred = rf.predict(input_data)


    return pred

input_data = pd.DataFrame({'category':['publishing'],'currency':['gbp'],
           'goal':[1000],'launched':['august'],'backers':[200],
           'campaign_length':[58],'name_char_length':[31]})

# name_char_length = len('pizza parlor')
# currency = 'USD'
# category = 'Fashion'
# goal = 1000
# backers = 2000
# campaign_length = 200
# num_backers = 200
# launched = 'august'
model(input_data)


FileNotFoundError: [Errno 2] No such file or directory: 'model'