In [None]:
# !pip install catboost

In [None]:
import pandas as pd
import sys
# from google.colab import files
# import io 
import re
import datetime
import ast
import json

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

df = pd.read_csv("Kickstarter.csv")

"""
The dataset can be found here:
https://webrobots.io/kickstarter-datasets/
https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2021-06-17T03_20_03_179Z.zip
"""

# Load The Dataset

In [None]:
file_path = "Kickstarter.csv"


def clean_data(file_path):
    df = pd.read_csv(file_path)
    
    def change_to_time_series(item):
        item = datetime.datetime.fromtimestamp(item)
        return item
    
    # change the 'created-at', 'deadline', 'state_changed_at' columns
    # to time series columns
    time_series_columns = ['created_at', 'deadline', 'state_changed_at', 'launched_at']
    for column in time_series_columns:
        df[column] = df[column].apply(change_to_time_series)
    
    # dropping columns with all null values
    all_null_columns = df.isnull().sum() == len(df)

    to_drop = ['friends', 'is_backing', 'is_starred', 'permissions']
    
    df = df.drop(columns=to_drop, axis=1)
    
    df = df.drop(columns=["currency_symbol",
                          "country_displayable_name",
                          "creator",
                          "country",
                          "urls",
                          "source_url",
                          "currency_trailing_code",
                          "fx_rate",
                          "id",
                          "name",
                          "disable_communication",
                          "photo", # has url for photos might be useful later 
                          "usd_type", 
                          'converted_pledged_amount',
                          'pledged',
                          'usd_exchange_rate',
                          'static_usd_rate',
                          'currency', # dropping due to have pledged in USD
                          'current_currency'  # dropping due to have pledged in USD
                         ], axis=1)
    
    def extract_dictionary_info(item):
        """ Use the apply method with the column name.
        Takes in dictionary in string form,
        converts it into a dictionary, and
        returns info_type requested in a new column.
        """
        my_dic = ast.literal_eval(item)
        return my_dic

    cat_df = df['category'].apply(extract_dictionary_info).apply(pd.Series)
    cat_df['parent_name'].loc[(cat_df.parent_name.isnull() == True)& (cat_df['name'] == 'Dance')] = 'Dance'
    cat_df['parent_name'].loc[(cat_df.parent_name.isnull() == True)& (cat_df['name'] == 'Photography')] = 'Photography'
    cat_df['parent_name'].loc[(cat_df.parent_name.isnull() == True)& (cat_df['name'] == 'Games')] = 'Games'
    
    df['category_2'] = cat_df.parent_name    
    
    le = preprocessing.LabelEncoder()
    df["sub_categories"] = le.fit_transform(df["category_2"])
    # 2 is Flim and Video 
    # 0 is Dance
    # 3 is Games
    # 1 is Fashion
    
    df.drop(columns=['category', 'category_2'], inplace=True)
      
    
    # location
    df.dropna(inplace=True)
    loc_df = df['location'].apply(json.loads).apply(pd.Series)
    df['location_2'] = loc_df.country
    df.drop(columns='location', inplace=True)
    df["sub_location"] = le.fit_transform(df["location_2"])
    # Need to find locations from the integer and add here
    df.drop(columns=['location_2'], inplace=True)

    
    # in dictionary format need to work on OR text data
    drop_for_now = ['blurb', 'profile', 'slug']
    df.drop(columns=drop_for_now, inplace=True)
    
    
    # creating our target variable, we had the four sub-categories:
    # successful, failed, canceled, live
    # turn it into a binary variable
    df['target'] = 0
    df['target'].loc[df.state == 'successful'] = 1
    df.drop(columns='state', inplace=True)
    
    # changing boolean to integers
    df['spotlight_2'] = 0
    df['spotlight_2'].loc[df.spotlight == True] = 1

    df['staff_pick_2'] = 0
    df['staff_pick_2'].loc[df.staff_pick == True] = 1

    df['is_starrable_2'] = 0
    df['is_starrable_2'].loc[df.is_starrable == True] = 1
    df.drop(columns=['spotlight', 'staff_pick', 'is_starrable', "spotlight_2", "backers_count"],
           inplace=True)

    # removing the launched_at and state_changed_at time series columns
    # add at your discretion
    time_series = ['created_at', 'deadline','state_changed_at', 'launched_at', "usd_pledged"] 
    # Drop USD pledged for leakage, duh 
    df.drop(columns=time_series, inplace=True)

    return df

In [None]:
# Reading in the data and passing through the cleaning function
df = clean_data(file_path)
df.info()

In [None]:
df.corr()

# Split the Data

In [None]:
# Split the data to avoid leakage
x = df.drop(columns=['target'])
y = df['target']

In [None]:
# Create a training and validation set
# Smaller test size is acceptable due to the amount of data available
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=.1, random_state=42)

print('x_train:', len(X_train))
print('x_val:', len(X_val))
print('y_train:',y_train.shape)
print('y_val:', y_val.shape)

## Set Model Baseline

In [None]:
baseline = y_train.value_counts(normalize = True).max()
print("The baseline score for our dataset is:", baseline)

# Explore Models

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, ReLU
from tensorflow.keras.callbacks import TensorBoard

from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# required for compatibility bewteen sklearn and keras
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

## Configure a Neural Network ##

In [None]:
# Define input dimensions to avoid hidden layer issues
input_dims = X_train.shape[1]
input_dims

## Create Model Function ##
This function should help you save time on creating more models.

In [None]:
# Function to create model, required for KerasClassifier
def create_model(units = 32, optimizer = "adam", activation = "sigmoid"):
    """"
    Returns a complied keras model 
    
    Parameters
    ----------
    units: int 
        number of neruons/nodes/units to use in each hidden layer
        
    Returns
    -------
    model: keras object 
    """

    model = Sequential()
    model.add(Dense(units = 64, input_dim = input_dims, activation = activation))
    model.add(Dense(units = 32, activation = activation))
    model.add(Dense(units = 10, activation = activation))   
    model.add(Dense(1, activation="sigmoid")) # USE SIGMOID FOR BINARY CLASSIFICATION
    model.compile(loss="binary_crossentropy",
                    optimizer = "adam",
                    metrics=["accuracy"])
    return model

In [None]:
# Instantiate a base model for grid search
base_model = KerasClassifier(build_fn = create_model)

### Perform a Grid Search To Optimize Our Model 

In [None]:
# define the grid search parameters
param_grid = {'batch_size': [128, 64, 32],
              'epochs': [25, 10 , 5],
              'units':[128, 64, 32],
              "optimizer": ["sigmoid", "adam", "relu"]
              }

In [None]:
# Create Grid Search
grid = GridSearchCV(estimator = base_model,
                    param_grid = param_grid, 
                    n_jobs=-2, 
                    verbose=1, 
                    cv=3)
grid_result = grid.fit(X_train, y_train)

# Report Results
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean}, Stdev: {stdev} with: {param}")

In [None]:
# Assign the best model 
best_nn_model = grid_result.best_estimator_
# Confirm the models params
best_nn_model.get_params()

In [None]:
    best_nn_model.fit(X_train, 
                        y_train, 
                        validation_data = (X_val,y_val),
                        verbose=2)

### Check NN score

In [None]:
best_nn_model_val_score = best_nn_model.score(X_val,y_val)
print("The best NN model validation score is:", best_nn_model_val_score)

# Create A Classifier Model 
Compare this model with our NN

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

In [None]:
clf_model = make_pipeline(
        RandomForestClassifier(
        random_state = 42))


## Perform Gridsearch to Optimize Model

In [None]:
# define the grid search parameters
param_grid = {
            "randomforestclassifier__max_depth": range(5,40,5),
            "randomforestclassifier__n_estimators": range(25,125,25),
            "randomforestclassifier__max_features": ["auto", "sqrt", "log2"]}

In [None]:
grid = GridSearchCV(estimator = clf_model,
                    param_grid = param_grid, 
                    n_jobs = -2, 
                    verbose = 1, 
                    cv = 3)

grid_result = grid.fit(X_train, y_train)

In [None]:
# Report Results
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean}, Stdev: {stdev} with: {param}")

In [None]:
# Assign the best model 
best_clf_model = grid_result.best_estimator_
# Confirm the models params
best_clf_model.get_params()

### Check the CLF score

In [None]:
best_clf_model_val_score = best_clf_model.score(X_val,y_val)
print("The best CLF model validation score is:", best_clf_model_val_score)

# Create Logistic Regressor Model 
Compare this model with our NN and CLF Models

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
lr_model = make_pipeline(
        LogisticRegression(
        random_state = 42))

## Optimize Logistic Regressor

In [38]:
# define the grid search parameters
param_grid = {
            "logisticregression__solver": ["lbfgs", "liblinear", "sag", "saga"]
}

In [39]:
grid = GridSearchCV(estimator = lr_model,
                    param_grid = param_grid, 
                    n_jobs = -2, 
                    verbose = 1, 
                    cv = 3)

grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [40]:
# Report Results
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean}, Stdev: {stdev} with: {param}")

Best: 0.7182756527018822 using {'logisticregression__solver': 'lbfgs'}
Means: 0.7182756527018822, Stdev: 0.010551394776552982 with: {'logisticregression__solver': 'lbfgs'}
Means: 0.7170613236187006, Stdev: 0.009705141082662915 with: {'logisticregression__solver': 'liblinear'}
Means: 0.310261080752884, Stdev: 0.00085866032930973 with: {'logisticregression__solver': 'sag'}
Means: 0.310261080752884, Stdev: 0.00085866032930973 with: {'logisticregression__solver': 'saga'}


In [41]:
# Assign the best model 
best_lr_model = grid_result.best_estimator_
# Confirm the models params
best_lr_model.get_params()

{'memory': None,
 'steps': [('logisticregression', LogisticRegression(random_state=42))],
 'verbose': False,
 'logisticregression': LogisticRegression(random_state=42),
 'logisticregression__C': 1.0,
 'logisticregression__class_weight': None,
 'logisticregression__dual': False,
 'logisticregression__fit_intercept': True,
 'logisticregression__intercept_scaling': 1,
 'logisticregression__l1_ratio': None,
 'logisticregression__max_iter': 100,
 'logisticregression__multi_class': 'auto',
 'logisticregression__n_jobs': None,
 'logisticregression__penalty': 'l2',
 'logisticregression__random_state': 42,
 'logisticregression__solver': 'lbfgs',
 'logisticregression__tol': 0.0001,
 'logisticregression__verbose': 0,
 'logisticregression__warm_start': False}

In [43]:
best_lr_model_val_score = best_lr_model.score(X_val,y_val)
print("The best lr model validation score is:", best_lr_model_val_score)

The best lr model validation score is: 0.7404371584699454


# Compare Model Scores

In [44]:
print("The best NN model validation score is:", best_nn_model_val_score)
print("The best CLF model validation score is:", best_clf_model_val_score)
print("The best lr model validation score is:", best_lr_model_val_score)

The best NN model validation score is: 0.9098360538482666
The best CLF model validation score is: 0.9426229508196722
The best lr model validation score is: 0.7404371584699454


In [None]:
#TODO Compare Precision and Recall with Confusion Matrix

# Explore The Best Model's Features

In [None]:
#TODO Create Feature importance

In [None]:
#TODO Visualize Feature Importances

In [None]:
#TODO Create Shapley

# Save The Best Model

In [None]:
import joblib
joblib.dump(#TODO Insert best model here, "my_h5_model.h5")

In [None]:
# # Save the entire model as a SavedModel.
# !mkdir -p saved_model
# clf_model.save('my_h5_model.h5') 