![Kickstarer](./images/kickstarter.png)

---
Imports
---

In [None]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import MinMaxScaler


import json
from sklearn.dummy import DummyClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split



import warnings
warnings.filterwarnings('ignore')

RSEED = 42069

In [None]:
# Import the .csv files and concat them into one dataframe
original_dataframe = pd.concat(map(pd.read_csv, glob.glob('data/data-2/*.csv')))
# Reset the indices
original_dataframe.reset_index(drop=True, inplace=True)

In [None]:
# Set a working dataframe, so that we don't have to wait 10s it to import again if we want to start fresh
df = original_dataframe

---
EDA - Part 1
---

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
# Only a very limited amount of suspended projects (drop), canceled projects will be treated as though they failed
df['state'].value_counts()

In [None]:
# Check for duplicate projects and store them in a table
dups = df.groupby(df.id.tolist()).size().reset_index().rename(columns={0:'count'})
# Sum the final col of that table, and subtract the number of culprits:
dups['count'].sum() - dups.shape[0]

---
Data Cleaning
---

In [None]:
# Drop features which will not be needed for further analysis
dropped_features = ['blurb', 'currency_symbol', 'backers_count', 'is_backing', 'permissions', 'is_starred', 'source_url',
                    'slug', 'name', 'static_usd_rate', 'profile', 'friends', 'spotlight', 'is_starrable', 'photo', 'pledged', 'usd_type',
                    'fx_rate', 'location', 'creator', 'currency_trailing_code','current_currency', 'created_at', 'urls', 'disable_communication', 'usd_pledged' ]
df = df.drop(dropped_features, axis=1)

In [None]:
# Built array which contains live projects for later use 
array_live = ['live']
live_projects = df.loc[df['state'].isin(array_live)]

# Filter and concat. for target variable
array_notlive = ['successful', 'failed', 'canceled']
df = df.loc[df['state'].isin(array_notlive)]
df.replace('canceled','failed', inplace=True)

In [None]:
# Replace successful and failed entries
df.replace(['successful','failed'],[1,0], inplace=True)

In [None]:
# Sort dataframe by 'date_changed_at' so that we will keep the entry that was most recently updated
df.sort_values('state_changed_at')
# Remove duplicates
duplicates = df.duplicated(subset='id', keep='last')
df = df[~duplicates]

---
Feature Engineering
---

In [None]:
df['category'] = df['category'].apply(lambda x: json.loads(x)['slug'])

In [None]:
df['category'] = df['category'].apply(lambda x: x.split('/',)[0])

In [None]:
# Generate new column with readable timeformat
df['launched_at_new'] = pd.to_datetime(df['launched_at'], unit='s')
df['deadline_new'] = pd.to_datetime(df['deadline'], unit='s')
df['state_changed_at_new'] = pd.to_datetime(df['state_changed_at'], unit='s')

In [None]:
# Create new feature 'duration' that displays wheather the project timespan was more or less than 30 days
df = df.eval('duration = deadline - launched_at')
df['duration'] = ['over' if x > 2592000 else 'under' for x in df['duration']]

In [None]:
# Add new column 'time' that displays the time from project launch to project end
df.eval('time = state_changed_at_new - launched_at_new', inplace=True)
# Convert to days
df['time'] = df['time'].apply(lambda x: pd.Timedelta(x).days)

In [None]:
# Change dates to weekend(1) or weekday(0)
def change_time(dataframe, column_list):
    for column in column_list:
        dataframe[column] = [1 if x >= 6 else 0 for x in pd.to_datetime(dataframe[column], unit='s').dt.weekday]
    return dataframe

In [None]:
times_lst = ['launched_at', 'deadline', 'state_changed_at']
change_time(df, times_lst)

---
EDA - Part 2
---

##### Plots

In [None]:
# Which data needs to be plotted categorical and which numerical
categorical = ['country','currency', 'staff_pick', 'category','duration']
numerical = ['usd_pledged', 'goal', 'converted_pledged_amount']

In [None]:
def bar_plot(df, column):
    """Generates barplots of categorical data

    Args:
        df (pd dataframe): Dataframe
        column (object): list of names of columns which should be plotted
    """
    # get feature
    for i in column:
        varValue = df[i].value_counts()

        plt.figure(figsize = (12,3))
        plt.bar(varValue.index, varValue, color = '#87c442', edgecolor = 'black')
        plt.xticks(varValue.index, varValue.index.values)
        plt.ylabel("Frequency")
        plt.title(i.capitalize())
        plt.xticks(rotation = 90)
        plt.show()

##### Categorical Data

In [None]:
bar_plot(df, categorical)

##### Numerical Data

In [None]:
# Outlier analysis
# We have some values in goal which are unrealistically high
# cutoff at 100000, we are focusing on small to average kickstarter projects
df = df.query('goal < 100000')

In [None]:
plt.figure(figsize = (12,5))
plt.hist(df['goal'], bins = None, facecolor = '#87c442', edgecolor = 'black');

##### Misc

In [None]:
# Drop id (not needed anymore) and converted pledged amount
df.drop('id', axis = 1, inplace = True)
df.drop('converted_pledged_amount', axis = 1, inplace = True)

#### One-hot encoding

In [None]:
# One hot encode all categorical data (country, currency, staff_pick, categories, disable_communication, duration) boolean values might 
# Replace True False with strings, otherwise one-hot encoding doesnt work
df['staff_pick'].replace([True,False],['t','f'], inplace=True)


one_hot_featurelist = ['country', 'currency', 'staff_pick', 'category', 'duration']
one_hot = pd.get_dummies(df[one_hot_featurelist])
df.drop(one_hot_featurelist, axis = 1, inplace=True)
df = df.join(one_hot)

#### Scalerize

In [None]:
def scale_columns(df, column):
    """Function that scales the data with a min_max scaler

    Args:
        df (dataframe): Dataframe
        column (object): Name or list of names including the columns which should be normalized

    Returns:
        Dataframe object: Returns the dataframe including the normalized columns
    """
    scaler = MinMaxScaler()
    for i in column:
        scaler.fit(df[[i]])
        df[i] = scaler.transform(df[[i]])
    
    return df

In [None]:
# Standardize numerical data
numerical = ['goal']
df = scale_columns(df, numerical)

#### Dummy Classifier and Baseline Model

In [None]:
# Drop features which will not be needed for further analysis
dropped_features = ['launched_at_new','deadline_new','state_changed_at_new']
df = df.drop(dropped_features, axis=1)

In [None]:
# Set x and y
X = df.drop('state', axis = 1)
y = df['state']

In [None]:
dummy_1 = DummyClassifier(strategy='stratified', random_state=RSEED, constant=None)
dummy_1.fit(X,y)

dummy_2 = DummyClassifier(strategy='most_frequent', random_state=RSEED, constant=None)
dummy_2.fit(X,y)

y_pred_1 = dummy_1.predict(X)
y_pred_2 = dummy_2.predict(X)

In [None]:
f1_score(y, y_pred_1)
f1_score(y, y_pred_2)

In [None]:
confusion_matrix(y, y_pred_1)

#### Model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.3, random_state=RSEED)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from xgboost import XGBClassifier
from tqdm import tqdm
import time
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

models = list()

#here is the list of the models we'll try 
models.append(LogisticRegression())
models.append(DecisionTreeClassifier())
models.append(RandomForestClassifier())
models.append(GradientBoostingClassifier())
models.append(AdaBoostClassifier(DecisionTreeClassifier(),learning_rate=0.1))
models.append(KNeighborsClassifier())
models.append(XGBClassifier())

In [None]:
kfold = StratifiedKFold(n_splits=5)

In [None]:
cv_results = []
cv_names = []
cv_times = []

for model in tqdm(models):
    start_time = time.time()
    print('go:', model)
    cv_results.append(cross_val_score(model, X_train, y = y_train, scoring = "f1", cv = kfold, n_jobs=4, verbose=5))
    cv_names.append(model.__class__.__name__)
    end_time = time.time()
    total_time = round(end_time - start_time ,2)
    cv_times.append(total_time)
    print('end:', model)

cv_means = []
cv_std = []

for cv_result in cv_results:
    cv_means.append(cv_result.mean())
    cv_std.append(cv_result.std())
    

cv_res = pd.DataFrame({"CrossValMeans":cv_means,"CrossValerrors": cv_std,"Algorithm":cv_names, "Time needed for training":cv_times})

g = sns.barplot("CrossValMeans","Algorithm",data = cv_res,orient = "h",**{'xerr':cv_std})
g.set_xlabel("Average F1 score")
g = g.set_title("K-fold Cross validation average F1 score")

In [None]:
cv_res['criterion'] = cv_res.CrossValMeans - cv_res.CrossValerrors/2
cv_res.sort_values(by='criterion', ascending=False)
# We select XGBClassifier, LogisticRegression, and RandomForestClassifier for hyperparameter tuning

In [60]:
### XGB classifier

XGB = XGBClassifier()

max_depth = [1,2,4,8,10]
min_child_weight = np.linspace(1, 10, 5, endpoint=True) 

gamma = np.linspace(0.5, 5, 5, endpoint=True)
subsample = np.linspace(0.5, 1, 5, endpoint=True)
colsample_bytree = np.linspace(0.5, 1, 5, endpoint=True)

XGB_param_grid = {
        'min_child_weight': min_child_weight,
        'gamma': gamma,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'max_depth': max_depth
        }


gsXGB = HalvingGridSearchCV(estimator = XGB, 
                    param_grid = XGB_param_grid, cv=kfold, scoring="f1", n_jobs= 4, verbose = 1)

gsXGB.fit(X_train,y_train)

XGB_best = gsXGB.best_estimator_
print(XGB_best.get_params())

# Best score
gsXGB.best_score_

n_iterations: 8
n_required_iterations: 8
n_possible_iterations: 8
min_resources_: 54
max_resources_: 118616
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 3125
n_resources: 54
Fitting 5 folds for each of 3125 candidates, totalling 15625 fits


0it [29:47, ?it/s]


----------
iter: 1
n_candidates: 1042
n_resources: 162
Fitting 5 folds for each of 1042 candidates, totalling 5210 fits
----------
iter: 2
n_candidates: 348
n_resources: 486
Fitting 5 folds for each of 348 candidates, totalling 1740 fits
----------
iter: 3
n_candidates: 116
n_resources: 1458
Fitting 5 folds for each of 116 candidates, totalling 580 fits
----------
iter: 4
n_candidates: 39
n_resources: 4374
Fitting 5 folds for each of 39 candidates, totalling 195 fits
----------
iter: 5
n_candidates: 13
n_resources: 13122
Fitting 5 folds for each of 13 candidates, totalling 65 fits
----------
iter: 6
n_candidates: 5
n_resources: 39366
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 7
n_candidates: 2
n_resources: 118098
Fitting 5 folds for each of 2 candidates, totalling 10 fits
{'objective': 'binary:logistic', 'use_label_encoder': False, 'base_score': 0.5, 'booster': 'gbtree', 'callbacks': None, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytre

0.7580658969814261

In [61]:
# RFC Parameters tunning 
RFC = RandomForestClassifier()

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1400, num = 4)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 20, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [ 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [ 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

## Search grid for optimal parameters
rf_param_grid = {"max_depth": max_depth,
              "max_features": max_features,
              "min_samples_split": min_samples_split,
              "min_samples_leaf": min_samples_leaf,
              "bootstrap": bootstrap,
              "n_estimators" :n_estimators,
              "criterion": ["gini"]}
              

gsRFC = HalvingGridSearchCV(RFC,param_grid = rf_param_grid, cv=kfold, scoring="f1", n_jobs= 4, verbose = 1‚)

gsRFC.fit(X_train,y_train)

RFC_best = gsRFC.best_estimator_

print(RFC_best.get_params())


# Best score
gsRFC.best_score_

KeyboardInterrupt: 

In [56]:
# Logistic Regression Parameters tunning
LR = LogisticRegression( )

# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(-2, 2, 20)


lr_param_grid = {'penalty': penalty, 'C': C }

clf = GridSearchCV(LogisticRegression(), lr_param_grid)

gsLR = GridSearchCV(LR, param_grid = lr_param_grid, cv=kfold, scoring="f1", n_jobs= 4, verbose = -1)

gsLR.fit(X_train,y_train)

LR_best = gsLR.best_estimator_

# View best hyperparameters
print('Best Penalty:', LR_best.get_params()['penalty'])
print('Best C:', LR_best.get_params()['C'])

# Best score
gsLR.best_score_


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Penalty: l2
Best C: 8.858667904100823


0.736831411902355