# Importing Libraries 

In [1]:
import pandas as pd
import numpy as np

# Visualization Libraies
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix

# Loading in the Dataset 
source: https://www.kaggle.com/kemical/kickstarter-projects/notebooks?sortBy=dateRun&group=upvoted&pageSize=20&datasetId=4104

In [2]:
# To view all the columns 
pd.set_option('display.max_columns', None)

df = pd.read_csv('kickstarter.csv', 
                parse_dates=['deadline', 'launched'])
df.head(5)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


Ideas:
- delete ID
- feature engineering (deadline and launched)
- Consider top five countries
- Only consider failed or successful and make it a binary classifictaion
- Will keep all the 15 main categories
- For category we could pass it to the ordinal encoding 

# EDA

In [3]:
df.head(10)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.0
6,1000023410,Support Solar Roasted Coffee & Green Energy! ...,Food,Food,USD,2014-12-21,1000.0,2014-12-01 18:30:44,1205.0,successful,16,US,1205.0,1205.0,1000.0
7,1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Drinks,Food,USD,2016-03-17,25000.0,2016-02-01 20:05:12,453.0,failed,40,US,453.0,453.0,25000.0
8,1000034518,SPIN - Premium Retractable In-Ear Headphones w...,Product Design,Design,USD,2014-05-29,125000.0,2014-04-24 18:14:43,8233.0,canceled,58,US,8233.0,8233.0,125000.0
9,100004195,STUDIO IN THE SKY - A Documentary Feature Film...,Documentary,Film & Video,USD,2014-08-10,65000.0,2014-07-11 21:55:48,6240.57,canceled,43,US,6240.57,6240.57,65000.0


In [4]:
total_nan = df.isna().sum().sort_values(ascending=False)
percentage_nan = (total_nan / df.shape[0]) * 100
tabel = pd.concat([total_nan, percentage_nan], axis=1, keys=['Total NAN', 'Percentage of NAN'])
tabel

Unnamed: 0,Total NAN,Percentage of NAN
usd pledged,3797,1.002744
name,4,0.001056
usd_goal_real,0,0.0
usd_pledged_real,0,0.0
country,0,0.0
backers,0,0.0
state,0,0.0
pledged,0,0.0
launched,0,0.0
goal,0,0.0


In [5]:
df.state.value_counts()

failed        197719
successful    133956
canceled       38779
undefined       3562
live            2799
suspended       1846
Name: state, dtype: int64

In [6]:
# filtering out the dataset for binary target variable - failed / successful
df = df.loc[(df['state'] == 'failed') | (df['state'] == 'successful')]
df.state.value_counts()

failed        197719
successful    133956
Name: state, dtype: int64

In [7]:
# Imbalance classification but not worrisome
successful = 131490/(192871+131490)
successful

0.4053816580908309

In [8]:
# # changing the target variable to 0 and 1 
# df['state'] = df['state'].map({'failed':0, 'successful':1})
# df['state'].value_counts()

## Deleting verticals that we don't need 
- Dropping USD pledged and pledged as usd_pledged_real has the same information
- Dropping country as currency and country are highly correlated
- Dropping goal as we have goal converted in USD as a vertical
- Backers seems to be the leak

In [9]:
df = df.drop(['usd pledged', 'pledged', 'country', 'goal', 'ID', 'backers'], axis=1) 
df.shape

(331675, 9)

In [10]:
# dropping 4 nan values in name
df.dropna(axis=0, inplace=True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 331672 entries, 0 to 378660
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   name              331672 non-null  object        
 1   category          331672 non-null  object        
 2   main_category     331672 non-null  object        
 3   currency          331672 non-null  object        
 4   deadline          331672 non-null  datetime64[ns]
 5   launched          331672 non-null  datetime64[ns]
 6   state             331672 non-null  object        
 7   usd_pledged_real  331672 non-null  float64       
 8   usd_goal_real     331672 non-null  float64       
dtypes: datetime64[ns](2), float64(2), object(5)
memory usage: 25.3+ MB


## Checking high cardinality 

In [12]:
cols = ['category', 'main_category', 'currency']

for col in cols:
    print(f"{col} has {df[col].nunique()} unique values")

category has 159 unique values
main_category has 15 unique values
currency has 14 unique values


In [13]:
# Either we can delete category verticals or ordinal encode it!
df.category.value_counts()

Product Design     18680
Documentary        14523
Music              12633
Tabletop Games     11744
Shorts             11394
                   ...  
Residencies           68
Letterpress           46
Chiptune              33
Literary Spaces       19
Taxidermy             10
Name: category, Length: 159, dtype: int64

In [14]:
# Lets consider top 5 currencies and delete the rest 
df.currency.value_counts()

USD    261509
GBP     29475
EUR     14378
CAD     12375
AUD      6621
SEK      1510
MXN      1411
NZD      1274
DKK       929
CHF       652
NOK       584
HKD       477
SGD       454
JPY        23
Name: currency, dtype: int64

In [15]:
# Filtering out top 5 currencies 
df = df.loc[(df['currency'] == 'USD') | (df['currency'] == 'GBP') | (df['currency'] == 'EUR') \
      | (df['currency'] == 'CAD') | (df['currency'] == 'AUD')]

In [16]:
# changing the currency names into countries so it will be easier to ask a user for input
df['currency'] = df['currency'].map({
                'USD':'USA',
                'GBP':'UK',
                'EUR':'Europe',
                'CAD':'Canada',
                'AUD': 'Australia'})
df.currency.value_counts()

USA          261509
UK            29475
Europe        14378
Canada        12375
Australia      6621
Name: currency, dtype: int64

In [17]:
# dropping Category 
df = df.drop('category', axis=1)

In [18]:
# reset index as we deleted a few rows when we filtered
df.reset_index(drop=True, inplace=True)

# Feature Engineering 
- calculating the length of the campaign from deadline and launched 
- calculating the total number of words in name 

In [19]:
# making a new column length_days
df['length_days'] = (df['deadline'] - df['launched']).dt.days + 1

# deleting deadline and launched cols
df = df.drop(['deadline', 'launched'], axis=1)

In [20]:
# Total number of words in each row in name
df['name'] = df['name'].str.split().str.len()

In [21]:
df.head(25)

Unnamed: 0,name,main_category,currency,state,usd_pledged_real,usd_goal_real,length_days
0,6,Publishing,UK,failed,0.0,1533.95,59
1,8,Film & Video,USA,failed,2421.0,30000.0,60
2,3,Film & Video,USA,failed,220.0,45000.0,45
3,7,Music,USA,failed,1.0,5000.0,30
4,3,Food,USA,successful,52375.0,50000.0,35
5,8,Food,USA,successful,1205.0,1000.0,20
6,8,Food,USA,failed,453.0,25000.0,45
7,4,Publishing,Canada,failed,0.0,2406.39,30
8,4,Music,USA,successful,12700.0,12500.0,30
9,3,Crafts,USA,failed,0.0,5000.0,30


# Baseline Model 

In [22]:
# we need our model to beat this score
baseline = 1 - successful
baseline

0.5946183419091691

# Split Dataset

In [23]:
# Feature Matrix and Target Variable
X = df.drop('state', axis=1)
y = df['state']

In [24]:
# Splitting the dataset into train and test dataset 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=105)

In [25]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(259486, 6)
(64872, 6)
(259486,)
(64872,)


# Pipeline 

In [26]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [27]:
# lets look at numerical attributes for simple imputer with median 
num_attribs = X_train.select_dtypes(exclude='object')
num_attribs.columns

Index(['name', 'usd_pledged_real', 'usd_goal_real', 'length_days'], dtype='object')

In [28]:
# lets look at categorical attributes for simple imputer with 'most_frequent'
cat_attribs = X_train.select_dtypes(include='object')
cat_attribs.columns

Index(['main_category', 'currency'], dtype='object')

In [29]:
# Using median as the strategy for Simple Imputer to predict NaN values considering the ouliers in the data
num_pipeline = make_pipeline(
                            SimpleImputer(strategy='median'),
                            StandardScaler()
)
# Adding the Custom Transformer to impute using 'most_frequent' strategy and giving out an output as a dataframe instead of an array
cat_pipeline = make_pipeline(
                            SimpleImputer(strategy='most_frequent'),
                            OneHotEncoder(handle_unknown='ignore')
)

In [30]:
# generating a list of categorical and numerical columns to pass it in the column transformer
cat_attributes = list(cat_attribs)
num_attributes = list(num_attribs)

# putting two pipelines together using ColumnTransformer
final_pipeline = ColumnTransformer([
                            ('num_pipeline', num_pipeline, num_attributes),
                            ('cat_pipeline', cat_pipeline, cat_attributes)
])

In [31]:
# Fit and trasnform on X_train
X_train_transformed = final_pipeline.fit_transform(X_train)

# Trasnform on X_test
X_test_transformed = final_pipeline.transform(X_test)

In [32]:
print(X_train_transformed.shape)
print(X_test_transformed.shape)

(259486, 24)
(64872, 24)


# Model Selection

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


forest_clf = RandomForestClassifier(n_jobs=-1, random_state=105)

forest_clf.fit(X_train_transformed,y_train)

y_pred_train = forest_clf.predict(X_train_transformed)


In [34]:
score = accuracy_score(y_train, y_pred_train)
score

1.0

In [35]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(forest_clf,
                         X_train_transformed,
                         y_train,
                         scoring='accuracy',
                         cv=10,
                         n_jobs=-1)

scores

array([0.99583799, 0.99633897, 0.99630044, 0.99506725, 0.99641605,
       0.99599214, 0.99576075, 0.99591491, 0.9961076 , 0.99595345])

In [36]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=10000)

lr.fit(X_train_transformed, y_train)

y_pred_train = lr.predict(X_train_transformed)

score = accuracy_score(y_train, y_pred_train)
score


0.8954124692661647

In [37]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(lr,
                         X_train_transformed,
                         y_train,
                         scoring='accuracy',
                         cv=7)

scores.mean()

0.8906607745190184

In [38]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()

gb.fit(X_train_transformed, y_train)

y_pred_train = gb.predict(X_train_transformed)

score = accuracy_score(y_train, y_pred_train)
score


0.9923040164016556

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(gb,
                         X_train_transformed,
                         y_train,
                         scoring='accuracy',
                         cv=7)

scores.mean()

In [40]:
X_test_transformed = final_pipeline.transform(X_test)

y_test_pred = gb.predict(X_test_transformed)

score = accuracy_score(y_test, y_test_pred)
score

0.9917221605623382

In [41]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

      failed       1.00      0.99      0.99     38659
  successful       0.98      1.00      0.99     26213

    accuracy                           0.99     64872
   macro avg       0.99      0.99      0.99     64872
weighted avg       0.99      0.99      0.99     64872

