## Experiments 

In [2]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.cross_validation import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier



In [3]:
# load dataset
# The location defaults to the original author's path but can be overridden with
# the DATA_CSV_PATH environment variable so the notebook also runs elsewhere.
DATA_PATH = os.environ.get(
    'DATA_CSV_PATH',
    'C:/Users/THINKPAD/Desktop/task/ai-interview-code/data.csv',
)
raw_data = pd.read_csv(DATA_PATH)

# remove incomplete entries: dropna() drops every row that has a missing value
# in any column (it does not inspect the 'state' label itself)
data = raw_data.dropna()

# keep only projects with a definitive outcome. Selecting the two wanted labels
# explicitly excludes 'live', 'canceled' and 'suspended' as before, and also any
# other uncertain label (e.g. 'undefined') that survives the dropna() above.
FINAL_STATES = ['failed', 'successful']
data = data[data['state'].isin(FINAL_STATES)]

# optional: encode the target as integers instead of strings
# classMap = {'successful': 1, 'failed': 0}
# data['state'] = data['state'].map(classMap)

In [4]:
# preview the first five rows of the cleaned dataset
data.head(n=5)


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.0


In [None]:
# count how many samples fall into each 'state' class
state_counts = data.groupby('state').size()
print(state_counts)

In [6]:
# report the (rows, columns) dimension of the input data
n_rows, n_cols = data.shape
print((n_rows, n_cols))

# summary statistics of the numeric columns
summary_stats = data.describe()
print(summary_stats)

(378661, 15)
                 ID          goal       pledged        backers   usd pledged  \
count  3.786610e+05  3.786610e+05  3.786610e+05  378661.000000  3.748640e+05   
mean   1.074731e+09  4.908079e+04  9.682979e+03     105.617476  7.036729e+03   
std    6.190862e+08  1.183391e+06  9.563601e+04     907.185035  7.863975e+04   
min    5.971000e+03  1.000000e-02  0.000000e+00       0.000000  0.000000e+00   
25%    5.382635e+08  2.000000e+03  3.000000e+01       2.000000  1.698000e+01   
50%    1.075276e+09  5.200000e+03  6.200000e+02      12.000000  3.947200e+02   
75%    1.610149e+09  1.600000e+04  4.076000e+03      56.000000  3.034090e+03   
max    2.147476e+09  1.000000e+08  2.033899e+07  219382.000000  2.033899e+07   

       usd_pledged_real  usd_goal_real  
count      3.786610e+05   3.786610e+05  
mean       9.058924e+03   4.545440e+04  
std        9.097334e+04   1.152950e+06  
min        0.000000e+00   1.000000e-02  
25%        3.100000e+01   2.000000e+03  
50%        6.243300e

In [4]:
# information of every feature (every column): prints each column's name,
# non-null count and dtype, followed by the dtype totals and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 331462 entries, 0 to 378660
Data columns (total 15 columns):
ID                  331462 non-null int64
name                331462 non-null object
category            331462 non-null object
main_category       331462 non-null object
currency            331462 non-null object
deadline            331462 non-null object
goal                331462 non-null float64
launched            331462 non-null object
pledged             331462 non-null float64
state               331462 non-null object
backers             331462 non-null int64
country             331462 non-null object
usd pledged         331462 non-null float64
usd_pledged_real    331462 non-null float64
usd_goal_real       331462 non-null float64
dtypes: float64(5), int64(2), object(8)
memory usage: 40.5+ MB


In [40]:
# select the numeric candidate columns for feature analysis
# x holds the input (feature) variables, y holds the target variable ('state')
# Columns are picked by name rather than by position so the selection stays
# correct even if the column order of the CSV changes.
candidate_features = ['goal', 'pledged', 'backers',
                      'usd pledged', 'usd_pledged_real', 'usd_goal_real']
x, y = data[candidate_features].values, data['state'].values

# hold out 30% of the samples; the fixed random_state keeps the split reproducible.
# train_test_split is taken from model_selection, the same module the
# cross-validation code further down uses (sklearn.cross_validation was removed
# from scikit-learn in 0.20).
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.3, random_state=0)

# use Random Forest model to find the most related features to the target
forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=1)
forest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [49]:
# show the importance of selected features, most important first
importances = forest.feature_importances_
# labels of the columns the forest was fitted on (same positions as used for x)
fea_labels = data.columns[[6,8,10,12,13,14]]
assert len(fea_labels) == len(importances), "feature labels do not match the fitted forest"

# feature positions sorted by decreasing importance
indices = np.argsort(importances)[::-1]
# Iterate over the forest's own importances instead of range(x_train.shape[1]):
# x_train is reassigned later in the notebook with a different number of
# columns, which would silently truncate this ranking when the cell is re-run.
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, fea_labels[idx], importances[idx]))

 1) usd_pledged_real               0.231361
 2) pledged                        0.221913
 3) backers                        0.191015
 4) usd_goal_real                  0.151379
 5) goal                           0.130362
 6) usd pledged                    0.073971


In [4]:
# materialise the cleaned frame as a NumPy array and report its shape
array = data.values
print(np.shape(array))

(331462, 15)


In [4]:
# divide our dataset into training set and testing set, and select the features according to importance
# NOTE(review): 'usd_pledged_real' and 'backers' look like values that are only
# known once a project has finished; together with the goal they may almost
# determine 'state'. Confirm they are available at prediction time before
# relying on the scores reported below.

selected_features = ['usd_pledged_real', 'usd_goal_real', 'backers']
x = data[selected_features].values
y = data['state'].values

# perform scaling if using SVC:
# robust_scaler = RobustScaler()
# x = robust_scaler.fit_transform(x)

# 70/30 split with a fixed seed. train_test_split is taken from model_selection,
# the same module the cross-validation cell uses (sklearn.cross_validation was
# removed from scikit-learn in 0.20).
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.3, random_state=0)
print(x)
print(y)

[[  0.00000000e+00   1.53395000e+03   0.00000000e+00]
 [  2.42100000e+03   3.00000000e+04   1.50000000e+01]
 [  2.20000000e+02   4.50000000e+04   3.00000000e+00]
 ..., 
 [  2.00000000e+01   1.50000000e+04   1.00000000e+00]
 [  2.00000000e+02   1.50000000e+04   6.00000000e+00]
 [  5.24000000e+02   2.00000000e+03   1.70000000e+01]]
['failed' 'failed' 'failed' ..., 'failed' 'failed' 'failed']


In [5]:
# building the classification models and comparing their brief performance
# (10-fold cross-validated accuracy on the training split)

models = [
    ('LR', LogisticRegression()),
    ('DT', DecisionTreeClassifier()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('SVC', LinearSVC()),
    ('NB', GaussianNB()),
]
names = []
results = []

# One splitter shared by all models, so every model is scored on identical folds.
# random_state is deliberately not passed: KFold only uses it when shuffle=True,
# and recent scikit-learn versions raise a ValueError when it is set without
# shuffling. The folds remain deterministic because nothing is shuffled here.
kfold = model_selection.KFold(n_splits=10)

for name, model in models:
    cv_result = model_selection.cross_val_score(
        model, x_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_result)
    names.append(name)
    # mean accuracy across the folds, standard deviation in parentheses
    info = "%s: %f (%f)" % (name, cv_result.mean(), cv_result.std())
    print(info)

LR: 0.999823 (0.000089)
DT: 0.999198 (0.000181)
LDA: 0.606694 (0.002701)
KNN: 0.999629 (0.000134)
SVC: 0.999483 (0.001296)
NB: 0.707951 (0.002804)


In [27]:
# Linear SVC model (SVC is not considered here due to the time cost)
# time.process_time() brackets construction, training and prediction to report
# the processing time of the model

cpu_start = time.process_time()
svc = LinearSVC().fit(x_train, y_train)
predictions = svc.predict(x_test)
cpu_end = time.process_time()

print(cpu_end - cpu_start, 's')
print(accuracy_score(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))

2.6875 s
0.999919548668
             precision    recall  f1-score   support

     failed       1.00      1.00      1.00     59307
 successful       1.00      1.00      1.00     40132

avg / total       1.00      1.00      1.00     99439



In [28]:
# K Nearest Neighbors model
# time.process_time() brackets construction, training and prediction

cpu_start = time.process_time()
knn = KNeighborsClassifier().fit(x_train, y_train)
predictions = knn.predict(x_test)
cpu_end = time.process_time()

print(cpu_end - cpu_start, 's')
print(accuracy_score(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))

4.640625 s
0.99959774334
             precision    recall  f1-score   support

     failed       1.00      1.00      1.00     59307
 successful       1.00      1.00      1.00     40132

avg / total       1.00      1.00      1.00     99439



In [7]:
# Decision Tree model
# random_state is fixed because the tree randomly permutes the features at each
# split; without a seed the fitted model (which is the one saved to disk later
# in the notebook) can differ from run to run.

t1 = time.process_time()
DT = DecisionTreeClassifier(random_state=0)
DT.fit(x_train, y_train)
predictions = DT.predict(x_test)
t2 = time.process_time()
# processing time spent on construction, training and prediction
print(t2 - t1, 's')
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

1.28125 s
0.999265881596
             precision    recall  f1-score   support

     failed       1.00      1.00      1.00     59307
 successful       1.00      1.00      1.00     40132

avg / total       1.00      1.00      1.00     99439



In [30]:
# Naive Bayes model
# time.process_time() brackets construction, training and prediction

cpu_start = time.process_time()
NB = GaussianNB().fit(x_train, y_train)
predictions = NB.predict(x_test)
cpu_end = time.process_time()

print(cpu_end - cpu_start, 's')
print(accuracy_score(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))

0.703125 s
0.707609690363
             precision    recall  f1-score   support

     failed       0.67      0.99      0.80     59307
 successful       0.93      0.30      0.45     40132

avg / total       0.78      0.71      0.66     99439



In [31]:
# Logistic Regression model
# time.process_time() brackets construction, training and prediction

cpu_start = time.process_time()
LR = LogisticRegression().fit(x_train, y_train)
predictions = LR.predict(x_test)
cpu_end = time.process_time()

print(cpu_end - cpu_start, 's')
print(accuracy_score(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))

2.3125 s
0.999859210169
             precision    recall  f1-score   support

     failed       1.00      1.00      1.00     59307
 successful       1.00      1.00      1.00     40132

avg / total       1.00      1.00      1.00     99439



In [32]:
# Linear Discriminant Analysis model
# time.process_time() brackets construction, training and prediction

cpu_start = time.process_time()
LDA = LinearDiscriminantAnalysis().fit(x_train, y_train)
predictions = LDA.predict(x_test)
cpu_end = time.process_time()

print(cpu_end - cpu_start, 's')
print(accuracy_score(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))

1.34375 s
0.606864509901
             precision    recall  f1-score   support

     failed       0.60      1.00      0.75     59307
 successful       0.98      0.03      0.05     40132

avg / total       0.76      0.61      0.47     99439



## Conclusion Notes ##

### Feature Analysis

* First, we put our focus on several features that can possibly affect the final state:

 __backers, pledged (usd pledged and usd_pledged_real), and goal (usd_goal_real)__
 
 
* Based on the data preprocessing and feature importance ranking shown above, we get:
```
 1) usd_pledged_real                0.231361
 2) pledged                         0.221913
 3) backers                         0.191015
 4) usd_goal_real                   0.151379
 5) goal                            0.130362
 6) usd pledged                     0.073971
 ```
 
 I choose three features, 'usd_pledged_real', 'backers' and 'usd_goal_real', as the major features for classification.
 
### Model Evaluation
 
 * Several commonly used models (Logistic Regression, Decision Tree, Linear Discriminant Analysis, K Nearest Neighbors, Naive Bayes and a linear SVM, LinearSVC) are selected for comparison.
 * For this application case, we need algorithms with high accuracy and high precision. We focus on precision because the main cost comes from false positives (projects predicted as successful that fail in the end). On these criteria, Linear SVC, KNN, DT and LR are the possible candidates.
 * According to the prediction performance and processing time, Decision Tree (DT) is chosen as the desirable model for this case.
 
### Model Output

* Finally, the selected model is trained on the complete dataset and saved to a file using 'joblib'.

In [8]:
# final training with the whole dataset
DT.fit(x, y)

# output and save the trained model; the target folder is created first so that
# joblib.dump does not fail when 'my_prediction/' does not exist yet
MODEL_PATH = 'my_prediction/trained_model.m'
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(DT, MODEL_PATH)


['my_prediction/trained_model.m']

In [10]:
# you may test the model with other testing sets here
# clf = joblib.load('my_prediction/trained_model.m')
# predictions = clf.predict(x)
# print(classification_report(y, predictions))