In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the Kickstarter projects export; both date columns are parsed as datetimes
# so the `.dt` accessor can be used on them later.
ks = pd.read_csv(
    '../datasets/ks-projects-201801.csv',
    parse_dates=['deadline', 'launched'],
)
ks.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


 - We will take only those projects that are not live
 - Assign outcome as new column, 1 if `state == successful` else 0
 - Extract `hour`, `day`, `month` and `year` from the `launched` timestamp column
 - Label encode our categorical features

In [3]:
# Keep every project whose state is anything other than "live".
ks = ks.loc[ks['state'].ne('live')]

In [4]:
# Binary target: 1 for successful projects, 0 for every other state.
ks = ks.assign(outcome=ks['state'].eq('successful').astype(int))

In [5]:
# Derive calendar features from the launch timestamp; the tuple order fixes
# the order in which the new columns are appended.
launched_parts = ('hour', 'day', 'month', 'year')
ks = ks.assign(**{part: getattr(ks['launched'].dt, part) for part in launched_parts})

In [6]:
# Preview the frame to confirm the outcome, hour, day, month and year columns were appended.
ks.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,outcome,hour,day,month,year
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0,12,11,8,2015
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0,4,2,9,2017
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0,0,12,1,2013
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,0,3,17,3,2012
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,0,8,4,7,2015


In [7]:
cat_features = ['category', 'currency', 'country']
encoder = LabelEncoder()

# Integer-encode each categorical column in turn (the encoder is re-fit per column).
# The result carries ks's index so it lines up row-for-row when joined back.
encoded = pd.DataFrame(
    {col: encoder.fit_transform(ks[col]) for col in cat_features},
    index=ks.index,
)

# Model-ready frame: numeric/date features and the target, plus the encoded categoricals.
data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
numeric_part = ks[data_cols]
baseline_data = numeric_part.join(encoded)

In [8]:
# Preview the baseline frame: goal, launch-date parts, outcome and the label-encoded categoricals.
baseline_data.head()

Unnamed: 0,goal,hour,day,month,year,outcome,category,currency,country
0,1000.0,12,11,8,2015,0,108,5,9
1,30000.0,4,2,9,2017,0,93,13,22
2,45000.0,0,12,1,2013,0,93,13,22
3,5000.0,3,17,3,2012,0,90,13,22
4,19500.0,8,4,7,2015,0,55,13,22


Now we split our `baseline_data` into `train`, `validate` and `test` sets

In [9]:
import lightgbm as lgb
from sklearn import metrics

def get_data_splits(dataframe, valid_fraction=0.1):
    """
    Splits a dataframe in train, valid and test datasets based on the valid_fraction
    parameter.

    Rows are not shuffled; the split is purely positional. The last
    ``valid_fraction`` of the rows becomes the test set, the ``valid_fraction``
    of rows immediately before it becomes the validation set, and all earlier
    rows form the training set.

    Parameters
    ----------
    dataframe : pandas.DataFrame (or any object supporting ``len()`` and slicing)
        The data to split.
    valid_fraction : float, default 0.1
        Fraction of rows assigned to the validation set and, separately, to
        the test set.

    Returns
    -------
    tuple
        ``(train, valid, test)`` slices of ``dataframe``.

    Raises
    ------
    ValueError
        If ``valid_fraction`` is negative.
    """
    if valid_fraction < 0:
        raise ValueError(f"valid_fraction must be non-negative, got {valid_fraction!r}")

    n_rows = len(dataframe)
    valid_size = int(n_rows * valid_fraction)

    # Slice with explicit non-negative bounds rather than negative offsets:
    # when valid_size rounds down to 0, `[:-0]` is empty and `[-0:]` is the
    # whole frame, which would put every row in the test set and none in train.
    valid_end = max(n_rows - valid_size, 0)
    train_end = max(n_rows - 2 * valid_size, 0)

    train = dataframe[:train_end]
    valid = dataframe[train_end:valid_end]
    test = dataframe[valid_end:]

    return train, valid, test

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [10]:
def train_model(train, valid):
    """
    Trains a LightGBM binary classifier and reports its validation AUC.

    1. Creates LightGBM datasets from `train` and `valid`, using every column
       except 'outcome' as a feature and 'outcome' as the label
    2. Trains a LightGBM model for up to 1000 boosting rounds, stopping early
       when the validation metric has not improved for 10 rounds
    3. Prints the ROC AUC score of the model on the valid dataset

    Returns the trained booster.
    """
    feature_cols = train.columns.drop('outcome')

    dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

    param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc', 'seed': 7}

    print('Training model...')

    # Early stopping is requested through a callback: the `early_stopping_rounds`
    # and `verbose_eval` keyword arguments of lgb.train() were removed in
    # LightGBM 4.0. verbose=False silences the callback's own messages.
    stop_early = lgb.early_stopping(stopping_rounds=10, verbose=False)
    bst = lgb.train(param, dtrain, num_boost_round=1000, valid_sets=[dvalid],
                    callbacks=[stop_early])

    valid_pred = bst.predict(valid[feature_cols])
    valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)

    print(f"Validation AUC score: {valid_score: .4f}")

    return bst

Actually Train & Evaluate

In [11]:
# Baseline run on the label-encoded features only; the test split is discarded (`_`).
train, valid, _ = get_data_splits(baseline_data)
bst = train_model(train, valid)

Training model...
Validation AUC score:  0.7467


### Count Encoding

In [12]:
import category_encoders as ce
cat_features = ['category', 'currency', 'country']
count_enc = ce.CountEncoder()
# The encoder is fit on (and applied to) every row of `ks`, i.e. before any
# train/valid/test split is made.
count_encoded = count_enc.fit_transform(ks[cat_features])

In [13]:
# Preview the first three rows of the count-encoded categorical columns.
count_encoded.head(3)

Unnamed: 0,category,currency,country
0,1362,33853,33393
1,5174,293624,290887
2,5174,293624,290887


In [14]:
# Attach the count-encoded columns next to the label-encoded ones; the
# "_count" suffix keeps the two versions of each column distinct.
count_features = count_encoded.add_suffix("_count")
data = baseline_data.join(count_features)
data.head(3)

Unnamed: 0,goal,hour,day,month,year,outcome,category,currency,country,category_count,currency_count,country_count
0,1000.0,12,11,8,2015,0,108,5,9,1362,33853,33393
1,30000.0,4,2,9,2017,0,93,13,22,5174,293624,290887
2,45000.0,0,12,1,2013,0,93,13,22,5174,293624,290887


In [15]:
# Re-split and retrain on `data`, which adds the *_count columns to the baseline features.
train, valid, test = get_data_splits(data)
bst = train_model(train, valid)

Training model...
Validation AUC score:  0.7486


### Target Encoding

In [16]:
import category_encoders as ce

cat_features = ['category', 'currency', 'country']
target_enc = ce.TargetEncoder(cols=cat_features)

# Split first, then fit on the training rows only, so the encoder never
# sees the validation or test targets.
train, valid, _ = get_data_splits(data)
target_enc.fit(train[cat_features], train['outcome'])

# Encode each split with the train-fitted encoder and tag the new columns
# with a _target suffix before joining them onto the split they came from.
train_target = target_enc.transform(train[cat_features]).add_suffix('_target')
valid_target = target_enc.transform(valid[cat_features]).add_suffix('_target')
train = train.join(train_target)
valid = valid.join(valid_target)

train.head()

Unnamed: 0,goal,hour,day,month,year,outcome,category,currency,country,category_count,currency_count,country_count,category_target,currency_target,country_target
0,1000.0,12,11,8,2015,0,108,5,9,1362,33853,33393,0.36019,0.357122,0.361636
1,30000.0,4,2,9,2017,0,93,13,22,5174,293624,290887,0.384615,0.373392,0.376631
2,45000.0,0,12,1,2013,0,93,13,22,5174,293624,290887,0.384615,0.373392,0.376631
3,5000.0,3,17,3,2012,0,90,13,22,15647,293624,290887,0.412655,0.373392,0.376631
4,19500.0,8,4,7,2015,0,55,13,22,10054,293624,290887,0.302625,0.373392,0.376631


In [17]:
# Retrain using the splits that now also carry the *_target columns.
bst = train_model(train, valid)

Training model...
Validation AUC score:  0.7491


### CatBoost Encoding
This is similar to target encoding in that it's based on the target probability for a given value. However, with CatBoost encoding, the target probability for each row is calculated only from the rows before it.

In [18]:
import category_encoders as ce

cat_features = ['category', 'currency', 'country']

target_enc = ce.CatBoostEncoder(cols=cat_features)

# first we split the dataset (the test split is discarded here)
train, valid, _ = get_data_splits(data)

# train the encoder only on the train set
target_enc.fit(train[cat_features], train['outcome'])

# transform cat features in both train and valid sets, add a `_cb` suffix and join them back
# NOTE(review): transform() is called without the target for the train split as well.
# Per the category_encoders docs, the ordered ("rows before it") statistic appears to be
# applied only when y is supplied (fit_transform / transform(X, y)) - confirm this is intended.
train = train.join(target_enc.transform(train[cat_features]).add_suffix('_cb'))
valid = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_cb'))

# train and evaluate the lgbm model
bst = train_model(train, valid)


Training model...
Validation AUC score:  0.7492
