In [30]:
import pandas as pd
import numpy as np
import time

import ipywidgets as widgets

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, confusion_matrix

In [7]:
#Train/Test split
def split(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2, 
    random_state=2023,
    stratify=y
    ) 
    return X_train, X_test, y_train, y_test

In [8]:
def select_cols_to_remove(df):
    # Create a list of column names
    column_names = df.columns.tolist()

    # Create the select menu
    select_cols = widgets.Select(
        options=column_names,
        rows=5,
        layout=widgets.Layout(width='50%')
    )

    # Create the button to add the selected column to a list
    add_button = widgets.Button(description='Add')
    remove_cols = []

    def add_col_to_list(b):
        col_name = select_cols.value
        if col_name not in remove_cols:
            remove_cols.append(col_name)
        print('Selected columns to remove:', remove_cols)

    add_button.on_click(add_col_to_list)

    # Display the select menu and button
    print("Select columns to remove:")
    display(select_cols)
    display(add_button)

    return remove_cols


In [36]:
def run_model(X_train, y_train, X_test, y_test, transformer=None, model=None, params=None, cv=None):
    # Timing how long the model takes to run
    start_time = time.time()
    
    # Create a pipeline with the transformer and model
    if transformer is None:
        pipe = Pipeline(steps=[('model', model)])
    else:
        pipe = Pipeline(steps=[('transformer', transformer), ('model', model)])
    
    # Use GridSearchCV to test different hyperparameters
    if params is None:
        grid = GridSearchCV(pipe, cv=cv)
    else:
        grid = GridSearchCV(pipe, params, cv=cv)

    # Fit the model
    grid.fit(X_train, y_train)
    
    # Get the predictions
    y_pred = grid.predict(X_test)
    
    # Print the classification report and confusion matrix
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    
    # Print the best score and parameters
    print(f"Best score: {grid.best_score_}")
    print(f"Best params: {grid.best_params_}")
    
    # Print the time taken to run the model
    end_time = time.time()
    print(f"Time taken to run: {round((end_time - start_time)/60,1)} minutes")

## Start here

In [37]:
#Import the CSV
df = pd.read_csv('Data/Indypygo_model.csv')

___________
If we want to remove some features. If not, please skip

In [38]:
# IF NEEDED: Call the function to select columns to remove
cols_to_remove = select_cols_to_remove(df)

Select columns to remove:


Select(layout=Layout(width='50%'), options=('tagline', 'title', 'goal_usd', 'australia', 'canada', 'switzerlan…

Button(description='Add', style=ButtonStyle())

Selected columns to remove: ['tagline']
Selected columns to remove: ['tagline', 'title']


In [39]:
# Create a new DataFrame with the remaining columns
df = df.drop(cols_to_remove, axis=1)

_________________
Here:

In [40]:
#Separate into X and Y
X = df.drop('is_success', axis=1)
y = df['is_success']

In [27]:
#train/test split
X_train, X_test, y_train, y_test = split(X,y)

In [33]:
rfc = RandomForestClassifier()

params = {'model__n_estimators': [100, 500, 1000],
          'model__class_weight': [None, {0: 1, 1: 5}]}

run_model(X_train, y_train, X_test, y_test, model=rfc, params=params)

              precision    recall  f1-score   support

           0       0.82      0.90      0.86      3207
           1       0.46      0.31      0.37       917

    accuracy                           0.77      4124
   macro avg       0.64      0.60      0.61      4124
weighted avg       0.74      0.77      0.75      4124

[[2873  334]
 [ 631  286]]
Best score: 0.7735546045432294
Best params: {'model__class_weight': None, 'model__n_estimators': 1000}
Time taken to run: 6.7 minutes


In [58]:
rfc = RandomForestClassifier()
pt = PowerTransformer()

params = {'model__n_estimators': [10, 50, 100],
          'model__class_weight': [None, {0: 1, 1: 5}]}

run_model(X_train, y_train, X_test, y_test, model=rfc, transformer=pt, params=params)

  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np

              precision    recall  f1-score   support

           0       0.82      0.90      0.86      3207
           1       0.48      0.31      0.38       917

    accuracy                           0.77      4124
   macro avg       0.65      0.61      0.62      4124
weighted avg       0.75      0.77      0.75      4124

[[2895  312]
 [ 630  287]]
Best score: 0.7737362388698195
Best params: {'model__class_weight': None, 'model__n_estimators': 50}
Time taken to run: 0.6 minutes


In [57]:
lr = LogisticRegression()

params = {'model__penalty': ['none', 'l2']}

run_model(X_train, y_train, X_test, y_test, model=lr, params=params)

              precision    recall  f1-score   support

           0       0.78      1.00      0.87      3207
           1       0.00      0.00      0.00       917

    accuracy                           0.78      4124
   macro avg       0.39      0.50      0.44      4124
weighted avg       0.60      0.78      0.68      4124

[[3207    0]
 [ 917    0]]
Best score: 0.7776161105842574
Best params: {'model__penalty': 'none'}
Time taken to run: 0.0 minutes


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [61]:
from xgboost import XGBClassifier
xgb = XGBClassifier()

params = {'model__max_depth': [10, 20],
          'model__learning_rate': [0.1, 0.01],
          'model__n_estimators': [100, 200]}

run_model(X_train, y_train, X_test, y_test, model=xgb, params=params, cv=5)

              precision    recall  f1-score   support

           0       0.85      0.95      0.90      3207
           1       0.72      0.42      0.53       917

    accuracy                           0.83      4124
   macro avg       0.78      0.69      0.71      4124
weighted avg       0.82      0.83      0.82      4124

[[3054  153]
 [ 532  385]]
Best score: 0.8435185258373498
Best params: {'model__learning_rate': 0.1, 'model__max_depth': 10, 'model__n_estimators': 200}
Time taken to run: 3.3 minutes
