In [6]:
# Standard library: only needed for the working-directory check below
import os

# Render the value of every bare expression in a cell, not just the final one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Directory against which relative paths (e.g. the CSV read later) are resolved
os.getcwd()

'C:\\Users\\kalya\\Python\\Kaggle micro courses\\Feature Engineering'

In [4]:
import pandas as pd

# Read the Kickstarter projects file; both date columns are parsed on load so
# that datetime accessors (.dt) are available on them afterwards.
ks = pd.read_csv(
    'ks-projects-201801.csv',
    parse_dates=['deadline', 'launched'],
)
ks.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


We get the outcome from the `state` column. To predict it, we can use features such as the project's category, currency, funding goal, country, and the time at which it was launched.

**Preparing target column**

In [9]:
# Inspect the distinct project states before turning them into a model target
ks['state'].unique()

# How many projects (non-null IDs) fall into each state
ks.groupby('state')['ID'].count()

array(['failed', 'canceled', 'successful', 'live', 'undefined',
       'suspended'], dtype=object)

state
canceled       38779
failed        197719
live            2799
successful    133956
suspended       1846
undefined       3562
Name: ID, dtype: int64

In [10]:
# NOTE: this cell holds a bare string literal, not a comment. It is evaluated as
# an expression, so its repr is echoed as the cell's output. The text describes
# the simplification applied to the `state` column in the next cell.
'''Data cleaning isn't the current focus, so we will simplify this example by

Dropping projects that are "live"
Counting "successful" states as outcome = 1
Combining every other state as outcome = 0'''

'Data cleaning isn\'t the current focus, so we will simplify this example by\n\nDropping projects that are "live"\nCounting "successful" states as outcome = 1\nCombining every other state as outcome = 0'

In [11]:
# Discard still-running ("live") projects, then derive the binary target in one
# chain: outcome is 1 for "successful" projects and 0 for every remaining state.
ks = (
    ks
    .query('state != "live"')
    .assign(outcome=lambda frame: (frame['state'] == 'successful').astype(int))
)

In [12]:
# Converting timestamps
"""Convert the launched feature into categorical features we can use in a model. Since I loaded in the columns as timestamp data, 
I access date and time values through the .dt attribute on the timestamp column"""

# One new integer column per calendar component of the launch timestamp,
# added in the order hour, day, month, year.
ks = ks.assign(**{
    part: getattr(ks.launched.dt, part)
    for part in ('hour', 'day', 'month', 'year')
})
ks.head()

'Convert the launched feature into categorical features we can use in a model. Since I loaded in the columns as timestamp data, \nI access date and time values through the .dt attribute on the timestamp column'

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,outcome,hour,day,month,year
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0,12,11,8,2015
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0,4,2,9,2017
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0,0,12,1,2013
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,0,3,17,3,2012
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,0,8,4,7,2015


**Prepping categorical variables**

In [13]:
"""Now for the categorical variables -- category, currency, and country -- I'll need to convert them into integers so our model
can use the data. For this I'll use scikit-learn's LabelEncoder. This assigns an integer to each value of the categorical
feature and replaces those values with the integers."""

from sklearn.preprocessing import LabelEncoder

# String-valued columns that have to become integer codes
cat_features = ['category', 'currency', 'country']
encoder = LabelEncoder()

# The single encoder is re-fitted on each column in turn, so every column
# receives its own independent set of integer codes.
encoded = ks[cat_features].apply(lambda column: encoder.fit_transform(column))
encoded.head()

"Now for the categorical variables -- category, currency, and country -- I'll need to convert them into integers so our model\ncan use the data. For this I'll use scikit-learn's LabelEncoder. This assigns an integer to each value of the categorical\nfeature and replaces those values with the integers."

Unnamed: 0,category,currency,country
0,108,5,9
1,93,13,22
2,93,13,22
3,90,13,22
4,55,13,22


In [21]:
# Build the modelling frame: the numeric / date-part columns and the target are
# taken from ks, and the label-encoded categoricals are attached via an index
# join (ks and encoded share the same index).
data = ks.loc[:, ['goal', 'hour', 'day', 'month', 'year', 'outcome']].join(encoded)
data.head()
type(data)

Unnamed: 0,goal,hour,day,month,year,outcome,category,currency,country
0,1000.0,12,11,8,2015,0,108,5,9
1,30000.0,4,2,9,2017,0,93,13,22
2,45000.0,0,12,1,2013,0,93,13,22
3,5000.0,3,17,3,2012,0,90,13,22
4,19500.0,8,4,7,2015,0,55,13,22


pandas.core.frame.DataFrame

**Creating training, validation, and test splits**

In [16]:
"""We need to create data sets for training, validation, and testing. We'll use a fairly simple approach and split the data
using slices. We'll use 10% of the data as a validation set, 10% for testing, and the other 80% for training."""

valid_fraction = 0.1
n_rows = len(data)
valid_size = int(n_rows * valid_fraction)

# Use explicit positive cut points instead of negative offsets. With negative
# offsets a valid_size of 0 (fewer than 10 rows) turns `data[:-0]` into an empty
# training set and `data[-0:]` into a test set holding every row.
train_end = n_rows - 2 * valid_size
valid_end = n_rows - valid_size

# Positional, order-preserving slices: first ~80% train, next ~10% valid, last ~10% test
train = data.iloc[:train_end]
valid = data.iloc[train_end:valid_end]
test = data.iloc[valid_end:]

# The three pieces must partition the rows exactly (no overlap, nothing dropped)
assert len(train) + len(valid) + len(test) == n_rows

# Sanity check: the share of successful projects should be similar in every split
for each in [train, valid, test]:
    print(f"Outcome fraction = {each.outcome.mean():.4f}")

"We need to create data sets for training, validation, and testing. We'll use a fairly simple approach and split the data\nusing slices. We'll use 10% of the data as a validation set, 10% for testing, and the other 80% for training."

Outcome fraction = 0.3570
Outcome fraction = 0.3539
Outcome fraction = 0.3542


**Training a LightGBM model**

In [19]:
"""We will be using a LightGBM model. This is a tree-based model that typically provides the best performance,
even compared to XGBoost. It's also relatively fast to train. We won't do hyperparameter optimization because that isn't the
goal of this course. So, our models won't be the absolute best performance you can get. But you'll still see model performance
improve as we do feature engineering."""

import lightgbm as lgb

# Every column except the target is a model input
feature_cols = train.columns.drop('outcome')

# Wrap the training and validation splits in LightGBM datasets
dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

# Binary classification evaluated with AUC; no hyperparameter tuning is done here
param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc'}
num_round = 1000

# Train for at most num_round rounds with early stopping (10 rounds) monitored
# on dvalid and per-round logging switched off.
# NOTE(review): these keyword arguments match the LightGBM 2.3.1 API this
# notebook installs; newer major versions are believed to expect callbacks
# instead -- confirm before upgrading.
bst = lgb.train(
    param,
    dtrain,
    num_round,
    valid_sets=[dvalid],
    early_stopping_rounds=10,
    verbose_eval=False,
)

"We will be using a LightGBM model. This is a tree-based model that typically provides the best performance,\neven compared to XGBoost. It's also relatively fast to train. We won't do hyperparameter optimization because that isn't the\ngoal of this course. So, our models won't be the absolute best performance you can get. But you'll still see model performance\nimprove as we do feature engineering."

In [18]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-2.3.1-py2.py3-none-win_amd64.whl (544 kB)
Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.1


**Making predictions & evaluating the model**

In [20]:
"""Let's make predictions on the test set with the model and see how well it performs. An important thing to remember is that
you can overfit to the validation data. This is why we need a test set that the model never sees until the final evaluation."""

from sklearn import metrics

# Score the held-out test rows with the trained booster ...
ypred = bst.predict(test[feature_cols])

# ... and compare those scores with the true outcomes using ROC AUC
score = metrics.roc_auc_score(y_true=test['outcome'], y_score=ypred)

print(f"Test AUC score: {score}")

"Let's make predictions on the test set with the model and see how well it performs. An important thing to remember is that\nyou can overfit to the validation data. This is why we need a test set that the model never sees until the final evaluation."

Test AUC score: 0.747615303004287


In [23]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.2.2-py2.py3-none-any.whl (80 kB)
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [28]:
import category_encoders as ce

# The same three categorical columns that were label-encoded earlier
cat_features = ['category', 'currency', 'country']

# Target encoder restricted to those columns
target_enc = ce.TargetEncoder(cols=cat_features)
type(target_enc)

# Fit on the training split only, using its categorical columns and its target
target_enc.fit(X=train[cat_features], y=train['outcome'])

category_encoders.target_encoder.TargetEncoder

TargetEncoder(cols=['category', 'currency', 'country'], drop_invariant=False,
              handle_missing='value', handle_unknown='value',
              min_samples_leaf=1, return_df=True, smoothing=1.0, verbose=0)