In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn import preprocessing
%matplotlib inline
sns.set_style('white')

In [2]:
df = pd.read_csv('/Users/markespina/Downloads/kickstarter-projects/ks-projects-201801.csv')

In [3]:
# creature binary outcome variable
df['outcome'] = df['usd_pledged_real'] >= df['usd_goal_real']

# recode bool to numeric
df['outcome'] = np.where(df['outcome'] == True, 1, 0)
df['country'] = np.where(df['country'] == 'US', 1, 0)

In [71]:
# filter out categorical features
features = [f for f in df.columns if df[f].dtype != 'object']

#remove redudant/unecessary features
features.remove('goal')
features.remove('pledged')
features.remove('ID')
features.remove('usd pledged')
features.remove('usd_pledged_real')

In [72]:
features

['backers', 'country', 'usd_goal_real', 'outcome']

In [5]:
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso

In [73]:
# creature additional feature sets
categories = pd.get_dummies(df.main_category)
more_features = pd.concat([df[features], categories], axis=1)

even_more_features = pd.get_dummies(df.category)
even_more_features = pd.concat([df[features], even_more_features], axis=1)

In [75]:
# set X, y and additional feature sets

y = df[features.pop()]
X = df[features]


X2 = more_features.drop('outcome', axis=1)
X3 = even_more_features.drop('outcome', axis=1)

In [65]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X, y)

logr = LogisticRegression()
fit = logr.fit(X_train, y_train)
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = logr.predict(X_test)

print('\n Accuracy by project outcome')
print(pd.crosstab(pred_y_sklearn, y_test))

print('\n Percentage accuracy')
print(logr.score(X_test, y_test))

print(cross_val_score(logr, X_test, y_test, cv=10))

Coefficients
[[ 0.03680844 -0.00032972 -0.00022858]]
[-0.00047613]

 Accuracy by project outcome
outcome      0      1
row_0                
0        54665   2999
1         5758  31244

 Percentage accuracy
0.9074958274354045
[0.90758344 0.90927334 0.90874525 0.9065075  0.90724699 0.9100993
 0.90988802 0.90312698 0.90988802 0.90502852]


In [43]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X2, y)

logr = LogisticRegression()
fit = logr.fit(X_train, y_train)
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = logr.predict(X_test)

print('\n Accuracy by project outcome')
print(pd.crosstab(pred_y_sklearn, y_test))

print('\n Percentage accuracy')
print(logr.score(X_test, y_test))

print(cross_val_score(logr, X_test, y_test, cv=10))

Coefficients
[[ 0.044594   -0.0096738  -0.00026849 -0.00106148 -0.00055946 -0.00080628
   0.00011121 -0.00107879 -0.00137504 -0.00043788 -0.00082849 -0.00256236
  -0.00033112 -0.00056253 -0.00064473 -0.00243291 -0.00088507  0.00028423]]
[-0.01317069]

 Accuracy by project outcome
outcome      0      1
row_0                
0        54960   2916
1         5371  31419

 Percentage accuracy
0.912460651131346
[0.90716096 0.90546108 0.91053132 0.91053132 0.91105947 0.91421931
 0.91126136 0.91347982 0.90893725 0.90883161]


Logististic regression performs pretty well with R2 over 90% and seem to be consisent over 10 folds
Adding some additional features barely increased accuracy
Theme seems to be a bit some class imbalance in the sample let's see if SMOTE will improve anything

In [77]:
from imblearn.over_sampling import SMOTE 
smote = SMOTE(random_state=0)
X_balanced, y_balanced = smote.fit_sample(X,y)

In [78]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced)
logr = LogisticRegression()
fit = logr.fit(X_train, y_train)
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = logr.predict(X_test)

print('\n Accuracy by project outcome')
print(pd.crosstab(pred_y_sklearn, y_test))

print('\n Percentage accuracy')
print(logr.score(X_test, y_test))



Coefficients
[[ 0.04544106 -0.00017497 -0.0002194 ]]
[-0.0002953]

 Accuracy by project outcome
col_0      0      1
row_0              
0      52515   3458
1       7661  57176

 Percentage accuracy
0.9079629169770714


In [42]:
print(str(round((2880/966476),4)*100) + "% original false positive rate")
print(str(round((5674/966476),4)*100) + "% original false negative rate")
print(str(round((3561/966476),4)*100) + "% SMOTE false positive rate")
print(str(round((7711/966476),4)*100) + "% SMOTE false negative rate")

"""Using Smote, model performed worse with an increase in both type 1 and type 2 errors"""

0.3% original false positive rate
0.59% original false negative rate
0.37% SMOTE false positive rate
0.8% SMOTE false negative rate


'Using Smote, model performed worse with an increase in both type 1 and type 2 errors'

In [87]:
# let's try rigde with our original feature set and added feature set
from sklearn import linear_model
X_train, X_test, y_train, y_test = train_test_split(X, y)
ridge_regr = linear_model.RidgeClassifier(alpha=5, fit_intercept=False)

ridge_regr.fit(X_train, y_train)
print('\nR² for the model with few features:')
print(ridge_regr.score(X_test, y_test))
origparams = ridge_regr.coef_[0]
print(origparams)

X_train, X_test, y_train, y_test = train_test_split(X2, y) # x2 has additional category columns
ridge_regr = linear_model.RidgeClassifier(alpha=5, fit_intercept=False)

ridge_regr.fit(X_train, y_train)
print('\nR² for the model with more features:')
print(ridge_regr.score(X_test, y_test))
newparams = ridge_regr.coef_[0]
print(newparams)


R² for the model with few features:
0.6198740836203072
[ 1.22263263e-04 -2.59628400e-01 -2.15207974e-08]

R² for the model with more features:
0.6742970020915641
[ 1.31290463e-04  8.35515614e-02 -1.85201065e-08 -2.47804223e-01
  8.82596215e-03 -5.85463237e-01  1.61925916e-01 -3.65650036e-01
 -5.67412762e-01 -3.21278688e-01 -5.69850526e-01 -3.75389040e-01
 -6.44074875e-01 -9.35957941e-02 -4.58422503e-01 -4.52809508e-01
 -6.59356483e-01  1.35836374e-01]


While rigde performs far worse than logistic regression, unlike logistic regression the model exprienced a increase in predictive accuracy with the more complex model let's run it one more time with more features

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X3, y) # x3 has even more category columns
ridge_regr = linear_model.RidgeClassifier(alpha=10, fit_intercept=False)

ridge_regr.fit(X_train, y_train)
print('\nParameter estimates for the model with many features:')
print(ridge_regr.score(X_test, y_test))
newparams = ridge_regr.coef_[0]
print(newparams)


Parameter estimates for the model with many features:
0.6858428580482961
[ 1.30340741e-04  6.30648254e-02 -1.61219916e-08 -3.59745172e-01
 -6.16587579e-01 -3.53995156e-01 -7.32665067e-01 -4.89827378e-01
 -5.23704845e-01  2.27812758e-01 -6.47539040e-01 -9.19478427e-01
 -5.62352787e-01 -2.32290155e-01 -6.36586082e-02 -4.85127610e-01
 -6.17411486e-01 -1.25869482e-01 -2.48412537e-01 -1.47924163e-01
 -8.01408114e-01 -1.99557346e-01 -3.63939895e-01 -5.84850955e-01
  3.37781481e-01 -3.26975614e-01  2.00536946e-01 -2.53153661e-01
  1.09928191e-01 -6.30671745e-02 -6.15128396e-01 -3.63052115e-01
 -5.50246238e-01  2.04005142e-01 -7.41543070e-01 -5.46212641e-01
 -5.84694719e-01 -7.23408019e-01 -1.27877463e-01  2.41498097e-01
 -3.78922466e-01 -4.77244404e-01 -3.31712889e-01 -2.69426901e-01
 -5.56719250e-01 -4.08827036e-01 -6.33544884e-01 -6.63559869e-01
 -2.40606888e-01 -4.72822162e-01 -2.22168095e-01 -6.46108049e-01
 -3.64845850e-01 -6.74292745e-01 -6.18985399e-01 -5.37074322e-01
  7.08342770e-02

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number5.133450e-17
  overwrite_a=True).T


Predictive accuracy is maxing out at 68%, lets try lasso

In [68]:
lass = linear_model.Lasso(alpha=.1)
X_train, X_test, y_train, y_test = train_test_split(X, y)
lassfit = lass.fit(X_train, y_train)
print('R² for the model with few features:')
print(lass.score(X_train, y_train))
origparams = np.append(lassfit.coef_, lassfit.intercept_)
print('\nParameter estimates for the model with few features:')
print(origparams)

X_train, X_test, y_train, y_test = train_test_split(X2, y)
# Larger number of parameters.
lassBig = linear_model.Lasso(alpha=.1)
lassBig.fit(X_train, y_train)
print('\nR² for the model with many features:')
print(lassBig.score(X_train, y_train))
origparams = np.append(lassBig.coef_, lassBig.intercept_)
print('\nParameter estimates for the model with many features:')
print(origparams)

R² for the model with few features:
0.01581156564264463

Parameter estimates for the model with few features:
[ 6.23546148e-05  0.00000000e+00 -9.41714309e-09  3.55926523e-01]

R² for the model with many features:
0.017204468414754426

Parameter estimates for the model with many features:
[ 6.84554788e-05  0.00000000e+00 -1.00057328e-08  0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00  3.54634665e-01]


## Conclusion
Of the three regression models , Logistic Regression preformed the best with an accurracy rate of 90% (with the simplest feature set to boot!)
Rigde seems to benefit the most from a heavier feature set
Lasso Performed the worse with the lowest accuracy and little benefit from more complex feature sets

