In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer

In [99]:
df = pd.read_csv('Data/IndiePyGo_nulldropped.csv')

In [100]:
df_model = df.drop(['category', 'year_end', 'month_end', 'day_end', 'funded_percent'], axis=1)

In [52]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20618 entries, 0 to 20617
Data columns (total 62 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tagline             20618 non-null  object 
 1   title               20618 non-null  object 
 2   goal_usd            20618 non-null  float64
 3   australia           20618 non-null  int64  
 4   canada              20618 non-null  int64  
 5   switzerland         20618 non-null  int64  
 6   denmark             20618 non-null  int64  
 7   western_europe      20618 non-null  int64  
 8   great_britain       20618 non-null  int64  
 9   hong_kong           20618 non-null  int64  
 10  norway              20618 non-null  int64  
 11  sweden              20618 non-null  int64  
 12  singapore           20618 non-null  int64  
 13  united_states       20618 non-null  int64  
 14  education           20618 non-null  int64  
 15  productivity        20618 non-null  int64  
 16  ener

In [None]:
#df_model.to_csv('Data/Indypygo_model.csv', index=False)

In [None]:
#df_model = df_model.drop(['tagline', 'title'], axis=1)

In [None]:
df_model['text'] = df_model['title'] + ' ' + df_model['tagline']

In [101]:
df_model = df_model.drop(['tagline', 'title'], axis=1)

In [None]:
X = df_model.drop('is_success', axis=1)
y = df_model['is_success']

In [12]:
# removes location and text
X = df_model.iloc[:, [2] + list(range(14,53)) + list(range(55,61))]
y = df_model['is_success']

In [105]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2, 
    random_state=2023,
    stratify=df_model.is_success
)


In [91]:
class_weights = {0: 1, 1: 5}

In [25]:
# Create an instance of Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=1000, random_state=42, class_weight=class_weights)

# Fit the model on the training data
rfc.fit(X_train, y_train)

# Predict on the test data
y_pred = rfc.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.85      0.84      3207
           1       0.40      0.33      0.36       917

    accuracy                           0.74      4124
   macro avg       0.61      0.59      0.60      4124
weighted avg       0.72      0.74      0.73      4124



In [None]:
X = df_model.iloc[:, [2] + list(range(14,60))]
y = df_model['is_success']

In [None]:
X

In [None]:
# Define a preprocessor for the text data
text_preprocessor = TfidfVectorizer()

# Define a preprocessor for the numerical data
num_preprocessor = PowerTransformer()

# Combine the preprocessor for the text data and numerical data using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_preprocessor, 'text'),
        ('num', num_preprocessor, X.columns.difference(['text']))
    ])

In [None]:
# Define the pipeline with the preprocessor and random forest classifier
clf = Pipeline([
     ('preprocessor', preprocessor),
     ('Random Forest', RandomForestClassifier())         
])

# Fit the pipeline on the training data
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

In [30]:
# everything but text
X = df_model.drop('is_success', axis=1)
y = df_model['is_success']

In [33]:
# Create an instance of Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=1000, random_state=42, class_weight=class_weights)

# Fit the model on the training data
rfc.fit(X_train, y_train)

# Predict on the test data
y_pred = rfc.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85      3207
           1       0.45      0.33      0.38       917

    accuracy                           0.76      4124
   macro avg       0.63      0.61      0.62      4124
weighted avg       0.74      0.76      0.75      4124



In [65]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20618 entries, 0 to 20617
Data columns (total 60 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   goal_usd            20618 non-null  float64
 1   australia           20618 non-null  int64  
 2   canada              20618 non-null  int64  
 3   switzerland         20618 non-null  int64  
 4   denmark             20618 non-null  int64  
 5   western_europe      20618 non-null  int64  
 6   great_britain       20618 non-null  int64  
 7   hong_kong           20618 non-null  int64  
 8   norway              20618 non-null  int64  
 9   sweden              20618 non-null  int64  
 10  singapore           20618 non-null  int64  
 11  united_states       20618 non-null  int64  
 12  education           20618 non-null  int64  
 13  productivity        20618 non-null  int64  
 14  energy_greentech    20618 non-null  int64  
 15  wellness            20618 non-null  int64  
 16  comi

In [66]:
#remove times
X = df_model.iloc[:, list(range(0,40))]
y = df_model['is_success']

In [68]:
# Create an instance of Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=1000, random_state=42, class_weight=class_weights)

# Fit the model on the training data
rfc.fit(X_train, y_train)

# Predict on the test data
y_pred = rfc.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.57      0.68      3207
           1       0.30      0.66      0.41       917

    accuracy                           0.59      4124
   macro avg       0.58      0.61      0.55      4124
weighted avg       0.73      0.59      0.62      4124



In [88]:
#remove days
X = df_model.iloc[:, list(range(0,52)) ]
y = df_model['is_success']

In [75]:
X

Unnamed: 0,goal_usd,australia,canada,switzerland,denmark,western_europe,great_britain,hong_kong,norway,sweden,...,mar,apr,may,jun,jul,aug,sep,oct,nov,dec
0,5000.0000,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1200.0000,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1200.0000,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,2000.0000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,55000.0000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20613,9338.0000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
20614,3697.5021,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
20615,3346.0297,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
20616,2000.0000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [72]:
# Create an instance of Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=1000, random_state=42, class_weight=class_weights)

# Fit the model on the training data
rfc.fit(X_train, y_train)

# Predict on the test data
y_pred = rfc.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.82      0.82      3207
           1       0.40      0.42      0.41       917

    accuracy                           0.73      4124
   macro avg       0.61      0.62      0.62      4124
weighted avg       0.73      0.73      0.73      4124



In [77]:
# Create an instance of Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=1000, random_state=42)

# Fit the model on the training data
rfc.fit(X_train, y_train)

# Predict on the test data
y_pred = rfc.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.85      3207
           1       0.46      0.36      0.41       917

    accuracy                           0.76      4124
   macro avg       0.64      0.62      0.63      4124
weighted avg       0.75      0.76      0.75      4124



In [78]:
#remove months
X = df_model.iloc[:, list(range(0,40)) + list(range(53, 60))]
y = df_model['is_success']

In [80]:
# Create an instance of Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=1000, random_state=42)

# Fit the model on the training data
rfc.fit(X_train, y_train)

# Predict on the test data
y_pred = rfc.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      3207
           1       0.49      0.35      0.41       917

    accuracy                           0.78      4124
   macro avg       0.66      0.63      0.64      4124
weighted avg       0.75      0.78      0.76      4124



In [83]:
# Create an instance of Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=5000, random_state=42, class_weight=class_weights)

# Fit the model on the training data
rfc.fit(X_train, y_train)

# Predict on the test data
y_pred = rfc.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85      3207
           1       0.45      0.40      0.42       917

    accuracy                           0.76      4124
   macro avg       0.64      0.63      0.63      4124
weighted avg       0.75      0.76      0.75      4124



In [89]:
X

Unnamed: 0,goal_usd,australia,canada,switzerland,denmark,western_europe,great_britain,hong_kong,norway,sweden,...,mar,apr,may,jun,jul,aug,sep,oct,nov,dec
0,5000.0000,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1200.0000,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1200.0000,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,2000.0000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,55000.0000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20613,9338.0000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
20614,3697.5021,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
20615,3346.0297,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
20616,2000.0000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [98]:
# Define the pipeline with the preprocessor and random forest classifier

clf = Pipeline([
     ('powertransformer', PowerTransformer()),
     ('Random Forest', RandomForestClassifier(n_estimators=500, random_state=42, class_weight=class_weights))         
])

# Fit the pipeline on the training data
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

  loglike = -n_samples / 2 * np.log(x_trans.var())


              precision    recall  f1-score   support

           0       0.83      0.85      0.84      3207
           1       0.43      0.39      0.41       917

    accuracy                           0.75      4124
   macro avg       0.63      0.62      0.63      4124
weighted avg       0.74      0.75      0.75      4124



In [102]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20618 entries, 0 to 20617
Data columns (total 60 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   goal_usd            20618 non-null  float64
 1   australia           20618 non-null  int64  
 2   canada              20618 non-null  int64  
 3   switzerland         20618 non-null  int64  
 4   denmark             20618 non-null  int64  
 5   western_europe      20618 non-null  int64  
 6   great_britain       20618 non-null  int64  
 7   hong_kong           20618 non-null  int64  
 8   norway              20618 non-null  int64  
 9   sweden              20618 non-null  int64  
 10  singapore           20618 non-null  int64  
 11  united_states       20618 non-null  int64  
 12  education           20618 non-null  int64  
 13  productivity        20618 non-null  int64  
 14  energy_greentech    20618 non-null  int64  
 15  wellness            20618 non-null  int64  
 16  comi

In [103]:
# removes location and text
X = df_model.iloc[:, [0] + list(range(11,52))]
y = df_model['is_success']

Unnamed: 0,goal_usd,united_states,education,productivity,energy_greentech,wellness,comics,fashion_wearables,video_games,photography,...,mar,apr,may,jun,jul,aug,sep,oct,nov,dec
0,5000.0000,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1200.0000,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1200.0000,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,2000.0000,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,55000.0000,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20613,9338.0000,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
20614,3697.5021,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
20615,3346.0297,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
20616,2000.0000,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [106]:
# Create an instance of Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=500, random_state=42, class_weight=class_weights)

# Fit the model on the training data
rfc.fit(X_train, y_train)

# Predict on the test data
y_pred = rfc.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.84      0.83      3207
           1       0.41      0.38      0.39       917

    accuracy                           0.74      4124
   macro avg       0.62      0.61      0.61      4124
weighted avg       0.73      0.74      0.74      4124

