In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Read in csv dataset
kickstarter_df = pd.read_csv('Kickstarter_original_data.csv')
kickstarter_df.head()

Unnamed: 0,id,name,blurb,goal,pledged,outcomes,country,currency,deadline,launched_at,...,backers_count,spotlight,Category and Subcategory,Percentage Funded,Average Donation,Parent Category,Subcategory,Date Created Conversion,Date Ended Conversion,Years
0,0,GIRLS STATE a new musical comedy TV project,"In this new TV show ""All Politics is Vocal"" as...","$8,500.00","$11,633.00",successful,US,USD,1437620400,1434931811,...,182,True,film & video/television,137,63.92,film & video,television,6/22/2015,7/23/2015,2015
1,1,FannibalFest Fan Convention,A Hannibal TV Show Fan Convention and Art Coll...,"$10,275.00","$14,653.00",successful,US,USD,1488464683,1485872683,...,79,True,film & video/television,143,185.48,film & video,television,1/31/2017,3/2/2017,2017
2,2,Charlie teaser completion,Completion fund for post-production for teaser...,$500.00,$525.00,successful,GB,GBP,1455555083,1454691083,...,35,True,film & video/television,105,15.0,film & video,television,2/5/2016,2/15/2016,2016
3,3,Unsure/Positive: A Dramedy Series About Life w...,We already produced the *very* beginning of th...,"$10,000.00","$10,390.00",successful,US,USD,1407414107,1404822107,...,150,True,film & video/television,104,69.27,film & video,television,7/8/2014,8/7/2014,2014
4,4,Party Monsters,19th century’s most notorious literary charact...,"$44,000.00","$54,116.28",successful,US,USD,1450555279,1447963279,...,284,True,film & video/television,123,190.55,film & video,television,11/19/2015,12/19/2015,2015


In [4]:
# Check data types
kickstarter_df.dtypes

id                            int64
name                         object
blurb                        object
 goal                        object
pledged                      object
outcomes                     object
country                      object
currency                     object
deadline                      int64
launched_at                   int64
staff_pick                     bool
backers_count                 int64
spotlight                      bool
Category and Subcategory     object
Percentage Funded             int64
Average Donation            float64
Parent Category              object
Subcategory                  object
Date Created Conversion      object
Date Ended Conversion        object
Years                         int64
dtype: object

In [5]:
# drop dollar signs from pledged column
kickstarter_df['pledged'] = kickstarter_df['pledged'].str.replace('$', '')

In [6]:
# drop dollar signs from goal column
kickstarter_df[' goal '] = kickstarter_df[' goal '].str.replace('$', '')

In [7]:
kickstarter_df.head()

Unnamed: 0,id,name,blurb,goal,pledged,outcomes,country,currency,deadline,launched_at,...,backers_count,spotlight,Category and Subcategory,Percentage Funded,Average Donation,Parent Category,Subcategory,Date Created Conversion,Date Ended Conversion,Years
0,0,GIRLS STATE a new musical comedy TV project,"In this new TV show ""All Politics is Vocal"" as...",8500.0,11633.0,successful,US,USD,1437620400,1434931811,...,182,True,film & video/television,137,63.92,film & video,television,6/22/2015,7/23/2015,2015
1,1,FannibalFest Fan Convention,A Hannibal TV Show Fan Convention and Art Coll...,10275.0,14653.0,successful,US,USD,1488464683,1485872683,...,79,True,film & video/television,143,185.48,film & video,television,1/31/2017,3/2/2017,2017
2,2,Charlie teaser completion,Completion fund for post-production for teaser...,500.0,525.0,successful,GB,GBP,1455555083,1454691083,...,35,True,film & video/television,105,15.0,film & video,television,2/5/2016,2/15/2016,2016
3,3,Unsure/Positive: A Dramedy Series About Life w...,We already produced the *very* beginning of th...,10000.0,10390.0,successful,US,USD,1407414107,1404822107,...,150,True,film & video/television,104,69.27,film & video,television,7/8/2014,8/7/2014,2014
4,4,Party Monsters,19th century’s most notorious literary charact...,44000.0,54116.28,successful,US,USD,1450555279,1447963279,...,284,True,film & video/television,123,190.55,film & video,television,11/19/2015,12/19/2015,2015


In [8]:
# drop commas from goal and pledged columns
kickstarter_df[' goal '] = kickstarter_df[' goal '].str.replace(',', '')
kickstarter_df['pledged'] = kickstarter_df['pledged'].str.replace(',', '')

In [9]:
# Remove spaces from goal column name
kickstarter_df.rename(columns={' goal ' : 'goal'}, inplace=True)

In [10]:
# drop blurb and categorical columns
kickstarter_df.drop('blurb', axis=1, inplace=True)
kickstarter_df.drop('Category and Subcategory', axis=1, inplace=True)
kickstarter_df.drop('Parent Category', axis=1, inplace=True)
kickstarter_df.drop('Subcategory', axis=1, inplace=True)

In [11]:
kickstarter_df.head()

Unnamed: 0,id,name,goal,pledged,outcomes,country,currency,deadline,launched_at,staff_pick,backers_count,spotlight,Percentage Funded,Average Donation,Date Created Conversion,Date Ended Conversion,Years
0,0,GIRLS STATE a new musical comedy TV project,8500.0,11633.0,successful,US,USD,1437620400,1434931811,False,182,True,137,63.92,6/22/2015,7/23/2015,2015
1,1,FannibalFest Fan Convention,10275.0,14653.0,successful,US,USD,1488464683,1485872683,False,79,True,143,185.48,1/31/2017,3/2/2017,2017
2,2,Charlie teaser completion,500.0,525.0,successful,GB,GBP,1455555083,1454691083,False,35,True,105,15.0,2/5/2016,2/15/2016,2016
3,3,Unsure/Positive: A Dramedy Series About Life w...,10000.0,10390.0,successful,US,USD,1407414107,1404822107,False,150,True,104,69.27,7/8/2014,8/7/2014,2014
4,4,Party Monsters,44000.0,54116.28,successful,US,USD,1450555279,1447963279,False,284,True,123,190.55,11/19/2015,12/19/2015,2015


In [12]:
# drop id
kickstarter_df.drop('id', axis=1, inplace=True)

In [14]:
# drop pct funded column to remove the simple equation from the model
kickstarter_df.drop('Percentage Funded', axis=1, inplace=True)

In [15]:
# recheck data types
kickstarter_df.dtypes

name                        object
goal                        object
pledged                     object
outcomes                    object
country                     object
currency                    object
deadline                     int64
launched_at                  int64
staff_pick                    bool
backers_count                int64
spotlight                     bool
Average Donation           float64
Date Created Conversion     object
Date Ended Conversion       object
Years                        int64
dtype: object

In [16]:
# Convert goal and pledged to float
g = kickstarter_df['goal']
kickstarter_df.goal = pd.to_numeric(g)

In [17]:
p = kickstarter_df['pledged']
kickstarter_df.pledged = pd.to_numeric(p)

In [18]:
# check types
kickstarter_df.dtypes

name                        object
goal                       float64
pledged                    float64
outcomes                    object
country                     object
currency                    object
deadline                     int64
launched_at                  int64
staff_pick                    bool
backers_count                int64
spotlight                     bool
Average Donation           float64
Date Created Conversion     object
Date Ended Conversion       object
Years                        int64
dtype: object

In [19]:
# drop null values
kickstarter_df.dropna()

Unnamed: 0,name,goal,pledged,outcomes,country,currency,deadline,launched_at,staff_pick,backers_count,spotlight,Average Donation,Date Created Conversion,Date Ended Conversion,Years
0,GIRLS STATE a new musical comedy TV project,8500.0,11633.00,successful,US,USD,1437620400,1434931811,False,182,True,63.92,6/22/2015,7/23/2015,2015
1,FannibalFest Fan Convention,10275.0,14653.00,successful,US,USD,1488464683,1485872683,False,79,True,185.48,1/31/2017,3/2/2017,2017
2,Charlie teaser completion,500.0,525.00,successful,GB,GBP,1455555083,1454691083,False,35,True,15.00,2/5/2016,2/15/2016,2016
3,Unsure/Positive: A Dramedy Series About Life w...,10000.0,10390.00,successful,US,USD,1407414107,1404822107,False,150,True,69.27,7/8/2014,8/7/2014,2014
4,Party Monsters,44000.0,54116.28,successful,US,USD,1450555279,1447963279,False,284,True,190.55,11/19/2015,12/19/2015,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4109,Jack the Lad,500.0,0.00,failed,GB,GBP,1448805404,1446209804,False,0,False,0.00,10/30/2015,11/29/2015,2015
4110,Take Tartuffe to Edinburgh Fringe Festival!,300.0,86.00,failed,GB,GBP,1469113351,1463929351,False,6,False,14.33,5/22/2016,7/21/2016,2016
4111,REBORN IN LOVE,3000.0,94.00,failed,US,USD,1424747740,1422155740,False,6,False,15.67,1/25/2015,2/24/2015,2015
4112,"A Great New Controversial Play - ""The Divide"".",2500.0,1.00,failed,IE,EUR,1456617600,1454280186,False,1,False,1.00,1/31/2016,2/28/2016,2016


In [20]:
# convert True/False values to boolean
kickstarter_df[['staff_pick', 'spotlight']] = (kickstarter_df[['staff_pick', 'spotlight']] == True).astype(int)

In [21]:
kickstarter_df.head()

Unnamed: 0,name,goal,pledged,outcomes,country,currency,deadline,launched_at,staff_pick,backers_count,spotlight,Average Donation,Date Created Conversion,Date Ended Conversion,Years
0,GIRLS STATE a new musical comedy TV project,8500.0,11633.0,successful,US,USD,1437620400,1434931811,0,182,1,63.92,6/22/2015,7/23/2015,2015
1,FannibalFest Fan Convention,10275.0,14653.0,successful,US,USD,1488464683,1485872683,0,79,1,185.48,1/31/2017,3/2/2017,2017
2,Charlie teaser completion,500.0,525.0,successful,GB,GBP,1455555083,1454691083,0,35,1,15.0,2/5/2016,2/15/2016,2016
3,Unsure/Positive: A Dramedy Series About Life w...,10000.0,10390.0,successful,US,USD,1407414107,1404822107,0,150,1,69.27,7/8/2014,8/7/2014,2014
4,Party Monsters,44000.0,54116.28,successful,US,USD,1450555279,1447963279,0,284,1,190.55,11/19/2015,12/19/2015,2015


In [22]:
#drop other unnecessary text columns
kickstarter_df.drop('name', axis=1, inplace=True)
kickstarter_df.drop('Date Created Conversion', axis=1, inplace=True)
kickstarter_df.drop('Date Ended Conversion', axis=1, inplace=True)

In [23]:
kickstarter_df.head()

Unnamed: 0,goal,pledged,outcomes,country,currency,deadline,launched_at,staff_pick,backers_count,spotlight,Average Donation,Years
0,8500.0,11633.0,successful,US,USD,1437620400,1434931811,0,182,1,63.92,2015
1,10275.0,14653.0,successful,US,USD,1488464683,1485872683,0,79,1,185.48,2017
2,500.0,525.0,successful,GB,GBP,1455555083,1454691083,0,35,1,15.0,2016
3,10000.0,10390.0,successful,US,USD,1407414107,1404822107,0,150,1,69.27,2014
4,44000.0,54116.28,successful,US,USD,1450555279,1447963279,0,284,1,190.55,2015


In [24]:
kickstarter_df = kickstarter_df[kickstarter_df.outcomes != 'live']

In [25]:
kickstarter_df = kickstarter_df[kickstarter_df.outcomes != 'canceled']

In [26]:
# check for nulls
kickstarter_df.isnull().sum().sum()

0

In [27]:
#Define target variable
target= 'outcomes'

In [28]:
# Create features
X = kickstarter_df.drop(columns='outcomes')
X = pd.get_dummies(X)

# Create target
y = kickstarter_df.loc[:, target].copy()


In [29]:
X.describe()

Unnamed: 0,goal,pledged,deadline,launched_at,staff_pick,backers_count,spotlight,Average Donation,Years,country_AT,...,currency_DKK,currency_EUR,currency_GBP,currency_HKD,currency_MXN,currency_NOK,currency_NZD,currency_SEK,currency_SGD,currency_USD
count,3715.0,3715.0,3715.0,3715.0,3715.0,3715.0,3715.0,3715.0,3715.0,3715.0,...,3715.0,3715.0,3715.0,3715.0,3715.0,3715.0,3715.0,3715.0,3715.0,3715.0
mean,30743.11,11691.71,1418824000.0,1415960000.0,0.148587,121.64603,0.588156,79.929612,2014.386003,0.001346,...,0.002692,0.041184,0.153701,0.000538,0.002153,0.001884,0.002153,0.004307,0.000269,0.739704
std,270106.0,58882.4,48333790.0,48479480.0,0.355729,654.595678,0.492233,138.241661,1.552394,0.036667,...,0.05182,0.198743,0.360711,0.023199,0.046361,0.043373,0.046361,0.065494,0.016407,0.438855
min,1.0,0.0,1249932000.0,1242533000.0,0.0,0.0,0.0,0.0,2009.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2000.0,214.5,1402470000.0,1399895000.0,0.0,5.0,0.0,27.145,2014.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5000.0,1908.0,1427310000.0,1424281000.0,0.0,28.0,1.0,53.34,2015.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,15000.0,6300.5,1454433000.0,1452008000.0,0.0,82.5,1.0,91.22,2016.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,10000000.0,2344135.0,1489536000.0,1488823000.0,1.0,26457.0,1.0,3000.0,2017.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [30]:
# Check the balance of the target values
y.value_counts()

successful    2185
failed        1530
Name: outcomes, dtype: int64

In [31]:
#Split the data for training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [32]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import StandardScaler

#Create a StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

brf_model = BalancedRandomForestClassifier(n_estimators = 100, random_state = 1)
brf_model = brf_model.fit(X_train_scaled, y_train)

y_pred = brf_model.predict(X_test_scaled)

In [33]:
#from sklearn.metrics import accuracy_score on training set
y_pred_train = brf_model.predict(X_train_scaled)
balanced_accuracy_score(y_train, y_pred_train)

1.0

In [34]:
# Calculated the balanced accuracy score on test set
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

1.0

In [35]:
# Display the confusion matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,392,0
Actual 1,0,537


In [36]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     failed       1.00      1.00      1.00      1.00      1.00      1.00       392
 successful       1.00      1.00      1.00      1.00      1.00      1.00       537

avg / total       1.00      1.00      1.00      1.00      1.00      1.00       929



In [37]:
importances = sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)
importances

[(0.5361612837299523, 'spotlight'),
 (0.16300095449634544, 'backers_count'),
 (0.14124328694767493, 'pledged'),
 (0.05638612719592101, 'Average Donation'),
 (0.053413201987878155, 'goal'),
 (0.013726626252612034, 'launched_at'),
 (0.011340385923171688, 'staff_pick'),
 (0.010797098866611388, 'deadline'),
 (0.005113309327372972, 'Years'),
 (0.0012959284301771934, 'currency_GBP'),
 (0.0010343237056213764, 'country_GB'),
 (0.0009300889827082525, 'currency_USD'),
 (0.0008809931253104612, 'country_US'),
 (0.0006248925874543884, 'currency_EUR'),
 (0.00045947273362908467, 'country_NL'),
 (0.0004288680614054219, 'currency_CAD'),
 (0.0003871464005802391, 'country_AU'),
 (0.0003823112828322262, 'currency_AUD'),
 (0.00036751973394308693, 'country_CA'),
 (0.0002954826059156006, 'country_MX'),
 (0.0002752517987934019, 'country_DE'),
 (0.00024693609782461886, 'currency_MXN'),
 (0.000234060827850118, 'country_IT'),
 (0.0001703023723795907, 'country_ES'),
 (0.00011014939266322208, 'country_IE'),
 (0.00