In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the Kickstarter projects export, parsing both timestamp columns as datetimes.
ks = pd.read_csv(
    '../datasets/ks-projects-201801.csv',
    parse_dates=['deadline', 'launched'],
)
ks.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


 - We will keep only the projects that are not `live`
 - Assign `outcome` as a new column: 1 if `state == "successful"`, else 0
 - Extract `hour`, `day`, `month` and `year` from the `launched` timestamp
 - Label encode our categorical features (`category`, `currency`, `country`)

In [12]:
# Keep every project that is no longer running, i.e. drop rows still marked "live".
ks = ks[ks['state'] != 'live']

In [17]:
# Binary target: 1 for successful projects, 0 for every other state.
ks = ks.assign(outcome=ks['state'].eq('successful').astype(int))

In [18]:
# Derive calendar features from the launch timestamp, added in the order
# hour, day, month, year.
ks = ks.assign(**{
    part: getattr(ks['launched'].dt, part)
    for part in ('hour', 'day', 'month', 'year')
})

In [20]:
# Preview the first rows, now including the outcome, hour, day, month and year columns.
ks.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,outcome,hour,day,month,year
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0,12,11,8,2015
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0,4,2,9,2017
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0,0,12,1,2013
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,0,3,17,3,2012
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,0,8,4,7,2015


In [24]:
# Integer-encode each categorical column. One LabelEncoder instance is reused
# and refit for every column, so after this cell it holds the last column's fit.
cat_features = ['category', 'currency', 'country']
encoder = LabelEncoder()
encoded = pd.DataFrame(
    {col: encoder.fit_transform(ks[col]) for col in cat_features},
    index=ks.index,
)

# Model table: numeric features and the target, joined (on the index) with the
# encoded categoricals.
data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
baseline_data = ks[data_cols].join(encoded)

In [25]:
# Preview the model table built from the numeric columns and the encoded categoricals.
baseline_data.head()

Unnamed: 0,goal,hour,day,month,year,outcome,category,currency,country
0,1000.0,12,11,8,2015,0,108,5,9
1,30000.0,4,2,9,2017,0,93,13,22
2,45000.0,0,12,1,2013,0,93,13,22
3,5000.0,3,17,3,2012,0,90,13,22
4,19500.0,8,4,7,2015,0,55,13,22


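Now we split our `baseline_data` into `train`, `valid` and `test` sets. The split is positional (no shuffling): by default the last 10% of rows form the test set and the 10% before that form the validation set.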

In [29]:
import lightgbm as lgb
from sklearn import metrics

def get_data_splits(dataframe, valid_fraction=0.1):
    """
    Split a dataframe positionally into train, validation and test sets.

    Rows are not shuffled: the final ``valid_fraction`` of the rows becomes the
    test set, the ``valid_fraction`` of rows just before that becomes the
    validation set, and every earlier row goes to the training set.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to split, already in the desired order. Any object supporting
        ``len`` and slice indexing works.
    valid_fraction : float, default 0.1
        Fraction of the rows used for the validation set; the test set gets
        the same fraction.

    Returns
    -------
    tuple
        ``(train, valid, test)`` slices of ``dataframe``.

    Raises
    ------
    ValueError
        If ``valid_fraction`` is negative.
    """
    if valid_fraction < 0:
        raise ValueError(f"valid_fraction must be non-negative, got {valid_fraction!r}")

    n_rows = len(dataframe)
    valid_size = int(n_rows * valid_fraction)

    # Use non-negative boundaries rather than negative slice indices: when
    # valid_size is 0 (tiny frame or valid_fraction=0), `-0` is just 0, which
    # would leave train empty and put every row into test.
    test_start = max(n_rows - valid_size, 0)
    valid_start = max(n_rows - 2 * valid_size, 0)

    train = dataframe[:valid_start]
    valid = dataframe[valid_start:test_start]
    test = dataframe[test_start:]

    return train, valid, test

In [32]:
def train_model(train, valid):
    """
    Train a LightGBM binary classifier and report its AUC on the validation set.

    1. Creates LightGBM datasets, using every column except 'outcome' as
       features and 'outcome' as the label.
    2. Trains a LightGBM model for at most 1000 boosting rounds, stopping
       early once the validation AUC has not improved for 10 rounds.
    3. Prints the ROC AUC of the trained model on the valid dataset.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain an 'outcome' column.
    valid : pandas.DataFrame
        Validation rows with the same columns as ``train``.

    Returns
    -------
    The trained booster returned by ``lgb.train``.
    """
    feature_cols = train.columns.drop('outcome')

    dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

    param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc', 'seed': 7}

    # Early stopping and per-round logging are requested through callbacks:
    # the `early_stopping_rounds` / `verbose_eval` keyword arguments of
    # lgb.train were removed in LightGBM 4.0. Releases that predate
    # `log_evaluation` expose the logging callback as `print_evaluation`.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
    callbacks = [lgb.early_stopping(stopping_rounds=10), log_evaluation(period=1)]

    print('Training model...')

    bst = lgb.train(param, dtrain, num_boost_round=1000, valid_sets=[dvalid],
                    callbacks=callbacks)

    valid_pred = bst.predict(valid[feature_cols])
    valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)

    print(f"Validation AUC score: {valid_score: .4f}")

    return bst

Train the model and evaluate it on the validation set

In [33]:
# Split with the default fraction; the held-out test portion is not used here
# and is bound to `_`.
train, valid, _ = get_data_splits(baseline_data)
# Fit on the training rows and print the AUC on the validation rows.
bst = train_model(train, valid)

Training model...
[1]	valid_0's auc: 0.694205
Training until validation scores don't improve for 10 rounds
[2]	valid_0's auc: 0.697948
[3]	valid_0's auc: 0.700725
[4]	valid_0's auc: 0.702387
[5]	valid_0's auc: 0.706414
[6]	valid_0's auc: 0.708354
[7]	valid_0's auc: 0.709729
[8]	valid_0's auc: 0.711345
[9]	valid_0's auc: 0.711938
[10]	valid_0's auc: 0.71335
[11]	valid_0's auc: 0.714963
[12]	valid_0's auc: 0.716481
[13]	valid_0's auc: 0.71696
[14]	valid_0's auc: 0.717515
[15]	valid_0's auc: 0.719399
[16]	valid_0's auc: 0.721109
[17]	valid_0's auc: 0.722065
[18]	valid_0's auc: 0.723151
[19]	valid_0's auc: 0.724676
[20]	valid_0's auc: 0.725117
[21]	valid_0's auc: 0.725586
[22]	valid_0's auc: 0.726378
[23]	valid_0's auc: 0.72806
[24]	valid_0's auc: 0.728584
[25]	valid_0's auc: 0.729776
[26]	valid_0's auc: 0.730433
[27]	valid_0's auc: 0.730665
[28]	valid_0's auc: 0.731105
[29]	valid_0's auc: 0.732173
[30]	valid_0's auc: 0.73257
[31]	valid_0's auc: 0.732986
[32]	valid_0's auc: 0.733731
[33]	v