# Lecture 2: Introduction to Feature Engineering

## Setup

In [228]:
import numpy as np 
import pandas as pd
import random
from random import choices
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing

In [229]:
# Load the raw H&M tables from ../data.
articles = pd.read_csv('../data/articles.csv')
customers = pd.read_csv('../data/customers.csv')
sample_submission = pd.read_csv('../data/sample_submission.csv')
# Backward-compatible alias: this table was originally bound to a misspelled name.
sample_submisison = sample_submission
transactions = pd.read_csv('../data/transactions_train.csv')

## The H&M Dataset

In [4]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int64 
 1   product_code                  105542 non-null  int64 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int64 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  int64 
 7   graphical_appearance_name     105542 non-null  object
 8   colour_group_code             105542 non-null  int64 
 9   colour_group_name             105542 non-null  object
 10  perceived_colour_value_id     105542 non-null  int64 
 11  perceived_colour_value_name   105542 non-null  object
 12  perceived_colour_master_id    105542 non-null  int64 
 13 

In [5]:
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   customer_id             1371980 non-null  object 
 1   FN                      476930 non-null   float64
 2   Active                  464404 non-null   float64
 3   club_member_status      1365918 non-null  object 
 4   fashion_news_frequency  1355971 non-null  object 
 5   age                     1356119 non-null  float64
 6   postal_code             1371980 non-null  object 
dtypes: float64(3), object(4)
memory usage: 73.3+ MB


In [6]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   t_dat             object 
 1   customer_id       object 
 2   article_id        int64  
 3   price             float64
 4   sales_channel_id  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 1.2+ GB


In [7]:
# Join every transaction with the attributes of its customer and of its article.
# Inner joins: transactions without a matching customer/article row are dropped.
# NOTE(review): `X` is not referenced again in the visible part of this notebook;
# on the full dataset this join is memory-hungry, so consider skipping this cell.
X = transactions.merge(customers, how='inner', on='customer_id')
X = X.merge(articles, how='inner', on='article_id')

### Creating Samples 
If you would rather work with samples instead of the whole dataset (e.g. while prototyping your code), you can use the code below:

In [9]:
# Adapted from: https://www.kaggle.com/code/paweljankiewicz/hm-create-dataset-samples
# For each sampling fraction: draw that share of the customers, keep only their
# transactions and only the articles occurring in those transactions, then write
# the three tables to ../data/*_sample{sample_repr}.csv.gz.
for sample_repr, sample in [("01", 0.001), ("1", 0.01), ("5", 0.05)]:
    print(sample)
    # Fixed random_state so that re-running this cell regenerates the same sample files.
    customers_sample = customers.sample(int(customers.shape[0]*sample), replace=False, random_state=42)
    customers_sample_ids = set(customers_sample["customer_id"])
    transactions_sample = transactions[transactions["customer_id"].isin(customers_sample_ids)]
    articles_sample_ids = set(transactions_sample["article_id"])
    articles_sample = articles[articles["article_id"].isin(articles_sample_ids)]
    customers_sample.to_csv(f"../data/customers_sample{sample_repr}.csv.gz", index=False)
    transactions_sample.to_csv(f"../data/transactions_sample{sample_repr}.csv.gz", index=False)
    articles_sample.to_csv(f"../data/articles_sample{sample_repr}.csv.gz", index=False)

0.001
0.01
0.05


In [10]:
# Read back the smallest sample (suffix "01", i.e. the 0.001 fraction of customers).
articles_sample = pd.read_csv('../data/articles_sample01.csv.gz')
customers_sample = pd.read_csv('../data/customers_sample01.csv.gz')
transactions_sample = pd.read_csv('../data/transactions_sample01.csv.gz')

In [11]:
customers_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371 entries, 0 to 1370
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   customer_id             1371 non-null   object 
 1   FN                      471 non-null    float64
 2   Active                  454 non-null    float64
 3   club_member_status      1366 non-null   object 
 4   fashion_news_frequency  1350 non-null   object 
 5   age                     1360 non-null   float64
 6   postal_code             1371 non-null   object 
dtypes: float64(3), object(4)
memory usage: 75.1+ KB


In [12]:
transactions_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30285 entries, 0 to 30284
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   t_dat             30285 non-null  object 
 1   customer_id       30285 non-null  object 
 2   article_id        30285 non-null  int64  
 3   price             30285 non-null  float64
 4   sales_channel_id  30285 non-null  int64  
 5   Ordered           30285 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 1.4+ MB


## A Simplified Task: Binary Classification

In this assignment, we'll consider a simplified task: predict whether a user ordered an item or not. We provide a baseline logistic regression model below, but haven't done much feature preprocessing or engineering!

### Creating the Dataset

In [230]:
# If you'd rather use a sample, uncomment the following code:
# transactions = transactions_sample
# customers = customers_sample
# articles = articles_sample

In [231]:
# Every row in `transactions` is a purchase that really happened: label it as a positive (1).
transactions['Ordered'] = 1

In [232]:
# What's happening here?
# We're creating negative samples. I.e. we're creating transactions that didn't actually occur.
# Price lookup: one row per article, holding the price of its first occurrence in `transactions`.
# Selecting the two needed columns explicitly keeps this working even if `transactions`
# carries additional columns.
transaction_articles = transactions[['article_id', 'price']].drop_duplicates(subset=['article_id'])
# Sampling random users, articles and sales channel.
# One negative is drawn per real transaction, so both classes end up the same size:
random.seed(42)
n_negatives = len(transactions)
customer_ids = customers['customer_id'].unique().tolist()
rand_customers = choices(customer_ids, k=n_negatives)
article_ids = articles['article_id'].unique().tolist()
rand_articles = choices(article_ids, k=n_negatives)
rand_channels = choices([1, 2], k=n_negatives)
# Inserting the random users, articles and channels in a transactions_rand dataframe.
# The dates (t_dat) are kept from the real transactions; the real price is dropped here
# and replaced by the sampled article's price in the merge below:
transactions_rand = transactions.drop(columns=['price'])
transactions_rand['customer_id'] = rand_customers
transactions_rand['article_id'] = rand_articles
transactions_rand['sales_channel_id'] = rand_channels
# Attach an actual price to every random article (takes a while on the full dataset).
# This is a left merge: a sampled article that never occurs in `transactions` gets a NaN price.
# validate='many_to_one' makes the merge fail loudly instead of duplicating rows if the
# lookup ever contained an article more than once.
transactions_rand = transactions_rand.merge(
    transaction_articles, on='article_id', how='left', validate='many_to_one'
)
transactions_rand['Ordered'] = 0

In [233]:
# Result:
transactions_rand.head()

Unnamed: 0,t_dat,customer_id,article_id,sales_channel_id,Ordered,price
0,2018-09-20,a3be039dc4d68548eb226632e04f0e416a506cf8f093d2...,625877003,2,0,0.026627
1,2018-09-20,06700bc2885980aab8135d9361ebd92fc20611302cfcc5...,658223002,2,0,0.033881
2,2018-09-20,46912c337e710d870acbc985206d0240f1796aa7b9f987...,715247001,2,0,0.020322
3,2018-09-20,393a0770d70bbfc1306fd8aaa01639226e326f439b96f7...,516000071,1,0,0.016932
4,2018-09-20,bc8df8bd2c22d849460e2a8b287b93b796b4d0c2094f31...,889319001,2,0,0.050831


In [234]:
# Stack the real (positive) and fabricated (negative) transactions, then enrich
# every row with its customer's and its article's attributes (inner joins):
transactions = (
    pd.concat([transactions, transactions_rand])
    .merge(customers, how="inner", on='customer_id')
    .merge(articles, how="inner", on='article_id')
)

In [235]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63576648 entries, 0 to 63576647
Data columns (total 36 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   t_dat                         object 
 1   customer_id                   object 
 2   article_id                    int64  
 3   price                         float64
 4   sales_channel_id              int64  
 5   Ordered                       int64  
 6   FN                            float64
 7   Active                        float64
 8   club_member_status            object 
 9   fashion_news_frequency        object 
 10  age                           float64
 11  postal_code                   object 
 12  product_code                  int64  
 13  prod_name                     object 
 14  product_type_no               int64  
 15  product_type_name             object 
 16  product_group_name            object 
 17  graphical_appearance_no       int64  
 18  graphical_appearance

### Basic Preprocessing
Some very basic preprocessing.

In [236]:
# Only a handful of columns are kept for the baseline.
# All the others are dropped here: use them in your engineering tasks!
baseline_columns = ['customer_id', 'age', 'article_id', 'sales_channel_id', 'price', 'Ordered']
transactions_processed = transactions[baseline_columns].copy()
transactions_processed.head()

Unnamed: 0,customer_id,age,article_id,sales_channel_id,price,Ordered
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,24.0,663713001,2,0.050831,1
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,24.0,663713001,2,0.050831,1
2,1385e5f3a2d3dbd21237f91faf81254a6d96de31b07b0b...,25.0,663713001,2,0.050831,1
3,1385e5f3a2d3dbd21237f91faf81254a6d96de31b07b0b...,25.0,663713001,2,0.050831,1
4,217bd772e946a013afd4482a4faa7bd1949c1b4b9a0632...,35.0,663713001,2,0.050831,0


In [237]:
# Does it make sense to label encode?
# Label encoding the customer and article IDs:
customer_encoder = preprocessing.LabelEncoder()
article_encoder = preprocessing.LabelEncoder()

In [238]:
# Replace the raw customer / article IDs by the integer codes learned by each encoder.
# NOTE: this overwrites the ID columns in place. Re-running the cell would re-fit the
# encoders on the already-encoded integers, after which inverse_transform no longer
# returns the original IDs. Re-create transactions_processed before running it again.
transactions_processed['customer_id'] = customer_encoder.fit_transform(transactions_processed['customer_id'])
transactions_processed['article_id'] = article_encoder.fit_transform(transactions_processed['article_id'])

In [239]:
# If you want to go back to the original encoding:
customer_encoder.inverse_transform([2])

array(['000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318'],
      dtype=object)

In [240]:
transactions_processed.head()

Unnamed: 0,customer_id,age,article_id,sales_channel_id,price,Ordered
0,2,24.0,40179,2,0.050831,1
1,2,24.0,40179,2,0.050831,1
2,104641,25.0,40179,2,0.050831,1
3,104641,25.0,40179,2,0.050831,1
4,179831,35.0,40179,2,0.050831,0


In [241]:
# Can you come up with a NaN strategy that makes sense for each column in the dataset?
# For the baseline, every missing value (whatever the column) is simply replaced by 0;
# the last line confirms that no NaN is left:
transactions_processed = transactions_processed.fillna(value=0)
transactions_processed.isna().to_numpy().any()

False

In [242]:
# Does it make sense to one-hot encode?
# One-hot-encoding sales_channel_id (one 0/1 column per channel).
# dtype is pinned because the get_dummies default changed from uint8 to bool in pandas 2.0;
# with it, the indicator columns are 0/1 integers on every pandas version.
transactions_processed = pd.get_dummies(transactions_processed, columns=['sales_channel_id'], dtype='uint8')

In [243]:
transactions_processed.head()

Unnamed: 0,customer_id,age,article_id,price,Ordered,sales_channel_id_1,sales_channel_id_2
0,2,24.0,40179,0.050831,1,0,1
1,2,24.0,40179,0.050831,1,0,1
2,104641,25.0,40179,0.050831,1,0,1
3,104641,25.0,40179,0.050831,1,0,1
4,179831,35.0,40179,0.050831,0,0,1


In [244]:
# Creating a Train / Test Split:
# features = every column except the label, target = 'Ordered'; 10% is held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    transactions_processed.drop(columns='Ordered'),
    transactions_processed['Ordered'],
    test_size=0.10,
    random_state=42,
)

In [245]:
X_train.head()

Unnamed: 0,customer_id,age,article_id,price,sales_channel_id_1,sales_channel_id_2
50994939,611343,43.0,59247,0.025407,0,1
37464744,550827,38.0,68787,0.030492,1,0
56984818,529348,25.0,30111,0.005068,0,1
17084271,902421,26.0,22112,0.028847,0,1
24488913,23646,34.0,67655,0.025407,0,1


In [246]:
y_train.head()

50994939    1
37464744    0
56984818    0
17084271    1
24488913    1
Name: Ordered, dtype: int64

## Baseline Model

In [247]:
# Fit the baseline logistic regression on the training split.
# Will take a few minutes to run, if you're using the whole dataset:
baseline = LogisticRegression(random_state=42).fit(X_train, y_train)

In [248]:
baseline.predict_proba(X_test)

array([[0.5086429 , 0.4913571 ],
       [0.48976052, 0.51023948],
       [0.48813922, 0.51186078],
       ...,
       [0.49463131, 0.50536869],
       [0.49886859, 0.50113141],
       [0.49621575, 0.50378425]])

In [249]:
y_test

23488218    1
10759766    1
38409516    1
24628014    0
46965121    0
           ..
30568173    1
34746913    0
46100043    0
5582956     0
35870758    1
Name: Ordered, Length: 6357665, dtype: int64

In [250]:
# Mean Accuracy:
baseline.score(X_test, y_test)
# As you can see, the accuracy is ~0.51. In other words, the classifier correctly predicts whether a customer did or didn't buy an item only 51% of the time (barely better than chance on this balanced dataset).
# Can you improve this baseline logistic regression model by doing better preprocessing and generating new features?

0.51008727260716

In [251]:
# Classification Metrics:
predictions = baseline.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.51      0.39      0.44   3179718
           1       0.51      0.63      0.56   3177947

    accuracy                           0.51   6357665
   macro avg       0.51      0.51      0.50   6357665
weighted avg       0.51      0.51      0.50   6357665



In [252]:
predictions

array([0, 1, 1, ..., 1, 1, 1])

## Assignment: Feature engineering
**TODO:** 
- In groups (of 2-3 students), think about (a few) features that can be engineered (preprocess and generate new features). Divide the work!
- Do these engineered features improve the baseline model?
- Add your thoughts & results to a slide deck for discussion next week (again, 1 slide per person).
