# Lecture 2: Introduction to Feature Engineering

## Setup

In [None]:
import numpy as np 
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing

In [None]:
# Load the four raw CSV files from ../data into DataFrames.
articles = pd.read_csv('../data/articles.csv')
customers = pd.read_csv('../data/customers.csv')
# NOTE(review): the variable name is spelled "submisison"; left unchanged in case other cells reference it.
sample_submisison = pd.read_csv('../data/sample_submission.csv')
transactions = pd.read_csv('../data/transactions_train.csv')

## The H&M Dataset

In [None]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int64 
 1   product_code                  105542 non-null  int64 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int64 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  int64 
 7   graphical_appearance_name     105542 non-null  object
 8   colour_group_code             105542 non-null  int64 
 9   colour_group_name             105542 non-null  object
 10  perceived_colour_value_id     105542 non-null  int64 
 11  perceived_colour_value_name   105542 non-null  object
 12  perceived_colour_master_id    105542 non-null  int64 
 13 

In [None]:
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   customer_id             1371980 non-null  object 
 1   FN                      476930 non-null   float64
 2   Active                  464404 non-null   float64
 3   club_member_status      1365918 non-null  object 
 4   fashion_news_frequency  1355971 non-null  object 
 5   age                     1356119 non-null  float64
 6   postal_code             1371980 non-null  object 
dtypes: float64(3), object(4)
memory usage: 73.3+ MB


In [None]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   t_dat             object 
 1   customer_id       object 
 2   article_id        int64  
 3   price             float64
 4   sales_channel_id  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 1.2+ GB


In [None]:
# X = transactions.merge(customers, how='inner', on='customer_id')
# X = X.merge(articles, how='inner', on='article_id')

### Creating Samples 
If you would rather work with samples instead of the whole dataset (e.g. while prototyping your code), you can use the code below:

In [None]:
# Adapted from: https://www.kaggle.com/code/paweljankiewicz/hm-create-dataset-samples
# This extracts three sampled datasets, containing 0.1%, 1% and 5% of all users and their transactions, and the associated articles.
# A fixed random_state makes the written sample files identical on every run, so
# results obtained on a sample can be reproduced after a kernel restart.
for sample_repr, sample in [("01", 0.001), ("1", 0.01), ("5", 0.05)]:
    print(sample)
    # Draw the customers first; transactions and articles are then restricted to
    # those customers so that the three sampled tables stay consistent.
    customers_sample = customers.sample(int(customers.shape[0]*sample), replace=False, random_state=42)
    customers_sample_ids = set(customers_sample["customer_id"])
    transactions_sample = transactions[transactions["customer_id"].isin(customers_sample_ids)]
    articles_sample_ids = set(transactions_sample["article_id"])
    articles_sample = articles[articles["article_id"].isin(articles_sample_ids)]
    customers_sample.to_csv(f"../data/customers_sample{sample_repr}.csv.gz", index=False)
    transactions_sample.to_csv(f"../data/transactions_sample{sample_repr}.csv.gz", index=False)
    articles_sample.to_csv(f"../data/articles_sample{sample_repr}.csv.gz", index=False)

0.001
0.01
0.05


In [None]:
# Load the sample files with suffix "1" (the 1% customer sample written by the loop above).
articles_sample = pd.read_csv('../data/articles_sample1.csv.gz')
customers_sample = pd.read_csv('../data/customers_sample1.csv.gz')
transactions_sample = pd.read_csv('../data/transactions_sample1.csv.gz')

In [None]:
customers_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13719 entries, 0 to 13718
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   customer_id             13719 non-null  object 
 1   FN                      4755 non-null   float64
 2   Active                  4638 non-null   float64
 3   club_member_status      13659 non-null  object 
 4   fashion_news_frequency  13568 non-null  object 
 5   age                     13524 non-null  float64
 6   postal_code             13719 non-null  object 
dtypes: float64(3), object(4)
memory usage: 750.4+ KB


In [None]:
transactions_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 313094 entries, 0 to 313093
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   t_dat             313094 non-null  object 
 1   customer_id       313094 non-null  object 
 2   article_id        313094 non-null  int64  
 3   price             313094 non-null  float64
 4   sales_channel_id  313094 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 11.9+ MB


## A Simplified Task: Binary Classification

The task of predicting which 12 items users are most likely to buy in the next week is difficult to translate to a traditional classification machine learning setting. 
To obtain the 12 items a user is most likely to buy, we need to make predictions for all items (or the ones selected by a baseline) and select the 12 that have the highest predicted scores.

In this assignment, we'll consider a simplified task: Predict whether a user ordered a single item or not, based on the features of the user and the item. 
We provide a baseline logistic regression model below, but haven't done much feature preprocessing or engineering!
Initially, it is always best to focus your efforts on getting your features in the right shape and setting up the right validation scheme and baselines.
Once you are sure that your features add value and your validation scheme is correct, then you typically move on to trying more elaborate models.

### Creating the Dataset

In [None]:
# The rest of the notebook works on the sampled data: these assignments rebind the
# main names to the sample frames. Comment them out to use the whole dataset instead.
transactions = transactions_sample
customers = customers_sample
articles = articles_sample

In [None]:
# Every row at this point is a real transaction, so all of them are labelled as positives.
transactions['ordered'] = 1

The problem setting is an example of a "PU learning" problem, i.e. only positives are labeled, everything else is unlabeled (and can be either positive or negative). 
Of course, we cannot train a classifier with just positive samples: The classifier will just learn that everything is positive.
Therefore, we need to manually generate negative samples.

Below, we use a simple random negative sampling strategy.
We want to create a balanced dataset, meaning that we have just as many positives as negatives.
This makes sure that the classifier will not benefit from predicting the positive/negative class more often than the other.
Realistically, the amount of positive samples is of course many times smaller than the amount of unlabeled, possibly negative instances.


If you want to try your hand at a more complex negative sampling strategy, you may want to check out this blog as a starting point: https://medium.com/mlearning-ai/overview-negative-sampling-on-recommendation-systems-230a051c6cd7.



In [None]:
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,ordered
0,2018-09-20,02f10c94c36c9f8d06ca2fccb131f8579cbabd1e3fb90c...,552826001,0.008458,2,1
1,2018-09-20,02f10c94c36c9f8d06ca2fccb131f8579cbabd1e3fb90c...,631744003,0.006763,2,1
2,2018-09-20,02f10c94c36c9f8d06ca2fccb131f8579cbabd1e3fb90c...,651244001,0.006763,2,1
3,2018-09-20,02f10c94c36c9f8d06ca2fccb131f8579cbabd1e3fb90c...,621381011,0.033881,2,1
4,2018-09-20,02f10c94c36c9f8d06ca2fccb131f8579cbabd1e3fb90c...,592172007,0.006763,2,1


In [None]:
# What's happening here?
# We're creating negative samples, i.e. transactions that didn't actually occur.
# First, we need to know which interactions did occur:
# one (customer_id, article_id) tuple per distinct pair that appears in the real transactions.
positive_pairs = list(map(tuple, transactions[['customer_id', 'article_id']].drop_duplicates().values))

In [None]:
# Then we need to know what every synthetic transaction should contain: a date, a customer_id, an article_id, price, sales_channel_id. We will set ordered = 0, as these transactions didn't really occur.
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,ordered
0,2018-09-20,02f10c94c36c9f8d06ca2fccb131f8579cbabd1e3fb90c...,552826001,0.008458,2,1
1,2018-09-20,02f10c94c36c9f8d06ca2fccb131f8579cbabd1e3fb90c...,631744003,0.006763,2,1
2,2018-09-20,02f10c94c36c9f8d06ca2fccb131f8579cbabd1e3fb90c...,651244001,0.006763,2,1
3,2018-09-20,02f10c94c36c9f8d06ca2fccb131f8579cbabd1e3fb90c...,621381011,0.033881,2,1
4,2018-09-20,02f10c94c36c9f8d06ca2fccb131f8579cbabd1e3fb90c...,592172007,0.006763,2,1


In [None]:
# Extract real values: the pools that the synthetic (negative) transactions are drawn from.
real_dates = transactions["t_dat"].unique()
real_customers = transactions["customer_id"].unique()
real_articles = transactions["article_id"].unique()
real_channels = transactions["sales_channel_id"].unique()
# Lookup from article_id to a price: drop_duplicates keeps the first row seen for each
# article, so an article that sold at several prices maps to only one of them.
article_and_price = transactions[["article_id","price"]].drop_duplicates("article_id").set_index("article_id").squeeze()

In [None]:
# How many negatives do we need to sample?
# One per positive row, so that the final dataset is balanced.
num_neg_pos = transactions.shape[0]
print(num_neg_pos)

313094


In [None]:
# Sampling negatives by selecting random users, articles, dates and sales channel:
# Note: This is quite naive. Some articles may not even have been available at the date we are sampling.
# Every draw below goes through NumPy's global generator, so that is the generator
# that has to be seeded for the negatives to be reproducible. `random.seed` only
# affects Python's `random` module and has no effect on `np.random.choice`.
random.seed(42)
np.random.seed(42)

# Afterwards, we need to remove potential duplicates, so we'll sample too many.
num_neg_samples = int(num_neg_pos * 1.1)

# Sample each of the independent attributes.
neg_dates = np.random.choice(real_dates, size=num_neg_samples)
neg_articles = np.random.choice(real_articles, size=num_neg_samples)
neg_customers = np.random.choice(real_customers, size=num_neg_samples)
neg_channels = np.random.choice(real_channels, size=num_neg_samples)
# Synthetic transactions never happened, so their label is always 0.
ordered = np.zeros(num_neg_samples, dtype=int)
# Assign to every article a real price (label-based lookup by article_id).
neg_prices = article_and_price.loc[neg_articles].to_numpy()

In [None]:
# Build the frame column by column so that every attribute keeps its own dtype.
# Passing the arrays as a list of rows and transposing stores all six columns as
# `object`, which then propagates into the concatenated transactions frame.
neg_transactions = pd.DataFrame(
    dict(zip(
        transactions.columns,
        [neg_dates, neg_customers, neg_articles, neg_prices, neg_channels, ordered],
    ))
)

In [None]:
# Result:
neg_transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,ordered
0,2019-05-20,cbcbac7dccfe0a65d90a03281891187e2714eb79670e8b...,456163013,0.023712,2,0
1,2020-01-14,a68f1782c209950bdd654e024a8b6e2ceb4c73e4538424...,189626001,0.016932,2,0
2,2020-08-10,a153c59e4cc98ca9650274ef037a54486209e91de95cc9...,833512002,0.025407,2,0
3,2020-05-17,496657984c4f5166e1403f054247581279ba70a2a7ac3a...,638383001,0.050831,1,0
4,2020-07-03,ae5a434dcb8fc582936c921c5c4754f718713a3ab10f7e...,629445005,0.025407,2,0


In [None]:
neg_transactions.shape

(344403, 6)

In [None]:
# Remove random negative samples that actually coincide with positives
df = neg_transactions[
    ~neg_transactions.set_index(["customer_id", "article_id"]).index.isin(positive_pairs)
]

# Remove any excess so that we keep exactly as many negatives as positives.
# A fixed random_state keeps the chosen negatives, and therefore every metric
# computed further down, reproducible across runs.
chosen_neg_transactions = df.sample(num_neg_pos, random_state=42)

In [None]:
# Stack the positives and the sampled negatives, then attach the customer and
# article attributes. Inner joins keep only rows whose ids exist in both tables.
transactions = (
    pd.concat([transactions, chosen_neg_transactions])
    .merge(customers, how="inner", on='customer_id')
    .merge(articles, how="inner", on='article_id')
)

In [None]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 626188 entries, 0 to 626187
Data columns (total 36 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   t_dat                         626188 non-null  object 
 1   customer_id                   626188 non-null  object 
 2   article_id                    626188 non-null  object 
 3   price                         626188 non-null  object 
 4   sales_channel_id              626188 non-null  object 
 5   ordered                       626188 non-null  object 
 6   FN                            242499 non-null  float64
 7   Active                        237603 non-null  float64
 8   club_member_status            624244 non-null  object 
 9   fashion_news_frequency        621387 non-null  object 
 10  age                           620290 non-null  float64
 11  postal_code                   626188 non-null  object 
 12  product_code                  626188 non-nul

### Basic Preprocessing
Some very basic preprocessing.

In [None]:
# I'm dropping a lot of columns, use them in your engineering tasks!
# .copy() gives an independent frame, so the edits below do not touch `transactions`.
transactions_processed = transactions[['customer_id', 'age', 'article_id', 'sales_channel_id', 'price', 'ordered']].copy()
transactions_processed.head()

Unnamed: 0,customer_id,age,article_id,sales_channel_id,price,ordered
0,02f10c94c36c9f8d06ca2fccb131f8579cbabd1e3fb90c...,32.0,552826001,2,0.008458,1
1,31450ad0f1788b63e44094b51b57558d22365ef4efb247...,22.0,552826001,1,0.008458,0
2,d5cc19b27d893b1a7937791ff453b251d49ab66952dd8b...,32.0,552826001,2,0.008458,1
3,30d422b17c18964fed0e363780873f7cefce12206e2f32...,24.0,552826001,2,0.008458,1
4,cfaee643d89a5e55f50191766e2bcbf2b38483f349b33f...,20.0,552826001,2,0.008458,1


In [None]:
# Does it make sense to label encode?
# Label encoding the customer and article IDs:
# one encoder per column, so each mapping can be inverted independently later.
customer_encoder = preprocessing.LabelEncoder()
article_encoder = preprocessing.LabelEncoder()

In [None]:
# Replace the raw ids with the integer codes learned by the encoders.
transactions_processed['customer_id'] = customer_encoder.fit_transform(transactions_processed['customer_id'])
transactions_processed['article_id'] = article_encoder.fit_transform(transactions_processed['article_id'])

In [None]:
# If you want to go back to the original encoding:
customer_encoder.inverse_transform([2])

array(['000da7cae0959d00f079d2d36f8cd7065fc91c685cd9e9b44c5f8052b03fe285'],
      dtype=object)

In [None]:
transactions_processed.head()

Unnamed: 0,customer_id,age,article_id,sales_channel_id,price,ordered
0,157,32.0,4667,2,0.008458,1
1,2664,22.0,4667,1,0.008458,0
2,11362,32.0,4667,2,0.008458,1
3,2642,24.0,4667,2,0.008458,1
4,11026,20.0,4667,2,0.008458,1


In [None]:
# Can you come up with a NaN strategy that makes sense for each column in the dataset?
# Imputing all NaN values with zeros (this includes `age`, so a missing age becomes 0):
transactions_processed = transactions_processed.fillna(0)
# Sanity check: should display False once no missing values remain.
transactions_processed.isnull().values.any()

False

In [None]:
# Does it make sense to one-hot encode?
# One-hot-encoding sales_channel_id: the column is replaced by one indicator column per distinct value.
transactions_processed = pd.get_dummies(transactions_processed, columns=['sales_channel_id'])

In [None]:
transactions_processed.head()

Unnamed: 0,customer_id,age,article_id,price,ordered,sales_channel_id_1,sales_channel_id_2
0,157,32.0,4667,0.008458,1,0,1
1,2664,22.0,4667,0.008458,0,1,0
2,11362,32.0,4667,0.008458,1,0,1
3,2642,24.0,4667,0.008458,1,0,1
4,11026,20.0,4667,0.008458,1,0,1


In [None]:
# Hold out 10% of the rows for evaluation; everything except `ordered` is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    transactions_processed.drop(columns='ordered'),
    transactions_processed['ordered'],
    test_size=0.10,
    random_state=42,
)

In [None]:
X_train.head()

Unnamed: 0,customer_id,age,article_id,price,sales_channel_id_1,sales_channel_id_2
397792,11106,27.0,36197,0.010186,1,0
110794,316,48.0,6093,0.01761,0,1
271629,8824,49.0,35282,0.042356,1,0
506217,2581,25.0,40170,0.006763,0,1
172998,7247,24.0,41323,0.016932,0,1


In [None]:
y_train.head()

397792    1
110794    1
271629    0
506217    1
172998    0
Name: ordered, dtype: int64

## Baseline Model

In [None]:
# Fitting takes a few minutes on the whole dataset.
# `fit` returns the estimator itself, so construction and training can be chained.
baseline = LogisticRegression(random_state=42).fit(X_train, y_train)

In [None]:
X_train.head()

Unnamed: 0,customer_id,age,article_id,price,sales_channel_id_1,sales_channel_id_2
397792,11106,27.0,36197,0.010186,1,0
110794,316,48.0,6093,0.01761,0,1
271629,8824,49.0,35282,0.042356,1,0
506217,2581,25.0,40170,0.006763,0,1
172998,7247,24.0,41323,0.016932,0,1


In [None]:
baseline.predict_proba(X_test)

array([[0.51761285, 0.48238715],
       [0.50581416, 0.49418584],
       [0.53122804, 0.46877196],
       ...,
       [0.48571874, 0.51428126],
       [0.50930569, 0.49069431],
       [0.49425239, 0.50574761]])

In [None]:
y_test

91575     1
320242    1
515365    0
65438     1
466368    1
         ..
276028    0
297829    1
227258    0
471010    0
117487    1
Name: ordered, Length: 62619, dtype: int64

In [None]:
# Mean Accuracy:
baseline.score(X_test, y_test)
# As you can see, the accuracy is ~0.51. In other words, the classifier correctly predicts whether a customer did or didn't buy an item 51% of the time.
# Can you improve this baseline logistic regression model by doing better preprocessing and generating new features?
# Also think about my steps! Did it make sense to include the article and customer IDs (and things like that)?

0.5097654066657085

In [None]:
# Classification Metrics:
# hard class predictions on the held-out rows, summarised per class.
predictions = baseline.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.51      0.64      0.57     31300
           1       0.51      0.38      0.44     31319

    accuracy                           0.51     62619
   macro avg       0.51      0.51      0.50     62619
weighted avg       0.51      0.51      0.50     62619



In [None]:
predictions

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

## Assignment: Feature engineering
**TODO:** 
- In groups (of 2-3 students), think about (a few) features that can be engineered (preprocess and generate new features). Divide the work!
- Do these engineered features improve the baseline model?
- Add your thoughts & results to a slide deck for discussion next week (again, 1 slide per person).


In [None]:
# Function to create a base model to test features
def create_model(df, target='ordered', test_size=0.10, random_state=42, max_iter=100):
    """Fit a logistic-regression baseline on ``df`` and print its hold-out metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the ``target`` column. Every column other than
        ``target`` is used as a predictor.
    target : str, default 'ordered'
        Name of the label column.
    test_size : float, default 0.10
        Fraction of rows held out for evaluation.
    random_state : int, default 42
        Seed passed to both the train/test split and the model, so repeated
        calls on the same frame evaluate on the same rows.
    max_iter : int, default 100
        Iteration limit for the solver. Raise it when the solver reports that
        the iteration limit was reached.

    Returns
    -------
    None
        The accuracy and the classification report are printed, not returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(target, axis=1), df[target], test_size=test_size, random_state=random_state
    )
    baseline = LogisticRegression(random_state=random_state, max_iter=max_iter)
    baseline = baseline.fit(X_train, y_train)
    print("Mean accuracy: ", baseline.score(X_test, y_test))
    print("Classification Metrics:")
    predictions = baseline.predict(X_test)
    print(classification_report(y_test, predictions))

### Improved baseline: no IDs in the model

In [None]:
# baseline model 
create_model(transactions_processed)

Mean accuracy:  0.5097654066657085
Classification Metrics:
              precision    recall  f1-score   support

           0       0.51      0.64      0.57     31300
           1       0.51      0.38      0.44     31319

    accuracy                           0.51     62619
   macro avg       0.51      0.51      0.50     62619
weighted avg       0.51      0.51      0.50     62619



In [None]:
# Same model, but without the ids.
## Including the customer and article ids makes the model worse, because
## they don't add any useful information and don't help the model generalize.
features = ['age', 'sales_channel_id_1', 'sales_channel_id_2', 'price', 'ordered']
create_model(transactions_processed[features])

Mean accuracy:  0.6021015985563487
Classification Metrics:
              precision    recall  f1-score   support

           0       0.62      0.54      0.57     31300
           1       0.59      0.67      0.63     31319

    accuracy                           0.60     62619
   macro avg       0.60      0.60      0.60     62619
weighted avg       0.60      0.60      0.60     62619



In [None]:
# Model without ids and with only one sales_channel_id - same performance
features = ['age', 'sales_channel_id_1', 'price', 'ordered']
create_model(transactions_processed[features])

Mean accuracy:  0.6021015985563487
Classification Metrics:
              precision    recall  f1-score   support

           0       0.62      0.54      0.57     31300
           1       0.59      0.67      0.63     31319

    accuracy                           0.60     62619
   macro avg       0.60      0.60      0.60     62619
weighted avg       0.60      0.60      0.60     62619



## Feature generation & test

In [None]:
transactions_processed = transactions[['customer_id', 'age', 'article_id', 'sales_channel_id', 'price', 'ordered', 't_dat', 'index_group_name']].copy()

In [None]:
# Does it make sense to label encode? - only for convenience, not as a predictor
# Label encoding the customer and article IDs:
customer_encoder = preprocessing.LabelEncoder()
article_encoder = preprocessing.LabelEncoder()

# The encoded ids are used as grouping/sorting keys in the cells below.
transactions_processed['customer_id'] = customer_encoder.fit_transform(transactions_processed['customer_id'])
transactions_processed['article_id'] = article_encoder.fit_transform(transactions_processed['article_id'])

In [None]:
### NaN values ###
# Replace missing ages with the mean age (truncated to an integer).
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form
# triggers a warning in recent pandas versions and does not modify the frame
# at all when Copy-on-Write is enabled.
transactions_processed['age'] = transactions_processed['age'].fillna(int(transactions_processed['age'].mean()))
# sales_channel_id, t_dat are always filled in; any remaining NaN becomes 0.
transactions_processed = transactions_processed.fillna(0)
# sales_channel_id normalization: map the value 2 to 0, leaving other values unchanged.
transactions_processed['sales_channel_id'] = transactions_processed['sales_channel_id'].replace(2, 0)

### Age categories 

In [None]:
# Quantile-based categories: qcut chooses the edges so that each bin holds
# roughly the same number of rows (4 quartiles and 10 deciles).
transactions_processed['age_cat4'] = pd.qcut(transactions_processed['age'], q=4,labels=[1,2,3,4])
transactions_processed['age_cat10'] = pd.qcut(transactions_processed['age'], q=10, labels=range(1,11))
# Categories with custom, hand-picked bin edges; each bin is the interval (left, right].
bin_values = [0, 20, 22, 24, 26, 28, 30, 35, 40, 45, 50, 60, 100]
transactions_processed['age_categories'] = pd.cut(transactions_processed['age'], bin_values)

In [None]:
# One-hot encode the three age binnings in a single pass. The indicator columns
# are appended in the order the source columns are listed here.
transactions_processed = pd.get_dummies(
    transactions_processed,
    columns=['age_cat4', 'age_cat10', 'age_categories'],
)

In [None]:
transactions_processed.columns

Index(['customer_id', 'age', 'article_id', 'sales_channel_id', 'price',
       'ordered', 't_dat', 'index_group_name', 'age_cat4_1', 'age_cat4_2',
       'age_cat4_3', 'age_cat4_4', 'age_cat10_1', 'age_cat10_2', 'age_cat10_3',
       'age_cat10_4', 'age_cat10_5', 'age_cat10_6', 'age_cat10_7',
       'age_cat10_8', 'age_cat10_9', 'age_cat10_10', 'age_categories_(0, 20]',
       'age_categories_(20, 22]', 'age_categories_(22, 24]',
       'age_categories_(24, 26]', 'age_categories_(26, 28]',
       'age_categories_(28, 30]', 'age_categories_(30, 35]',
       'age_categories_(35, 40]', 'age_categories_(40, 45]',
       'age_categories_(45, 50]', 'age_categories_(50, 60]',
       'age_categories_(60, 100]'],
      dtype='object')

In [None]:
# Age represented by its four quantile indicators instead of the raw value.
features = ['sales_channel_id', 'price', 'ordered'] + [f'age_cat4_{quartile}' for quartile in range(1, 5)]
create_model(transactions_processed[features])
# comparable with continuous age

Mean accuracy:  0.6041137673868954
Classification Metrics:
              precision    recall  f1-score   support

           0       0.62      0.55      0.58     31300
           1       0.59      0.66      0.62     31319

    accuracy                           0.60     62619
   macro avg       0.61      0.60      0.60     62619
weighted avg       0.61      0.60      0.60     62619



In [None]:
# Age represented by its ten decile indicators.
features = ['sales_channel_id', 'price', 'ordered'] + [f'age_cat10_{decile}' for decile in range(1, 11)]
create_model(transactions_processed[features])
# better than the previous models, which used other age representations

Mean accuracy:  0.6104696657563998
Classification Metrics:
              precision    recall  f1-score   support

           0       0.61      0.59      0.60     31300
           1       0.61      0.63      0.62     31319

    accuracy                           0.61     62619
   macro avg       0.61      0.61      0.61     62619
weighted avg       0.61      0.61      0.61     62619



In [None]:
features = ['sales_channel_id', 'price', 'ordered', 
           'age_categories_(0, 20]',
       'age_categories_(20, 22]', 'age_categories_(22, 24]',
       'age_categories_(24, 26]', 'age_categories_(26, 28]',
       'age_categories_(28, 30]', 'age_categories_(30, 35]',
       'age_categories_(35, 40]', 'age_categories_(40, 45]',
       'age_categories_(45, 50]', 'age_categories_(50, 60]',
       'age_categories_(60, 100]']
create_model(transactions_processed[features])
# model is even better with custom bins 

Mean accuracy:  0.6119228988006835
Classification Metrics:
              precision    recall  f1-score   support

           0       0.62      0.58      0.60     31300
           1       0.61      0.64      0.62     31319

    accuracy                           0.61     62619
   macro avg       0.61      0.61      0.61     62619
weighted avg       0.61      0.61      0.61     62619



## Price of purchases in the past (average, total, compared to new price) 

In [None]:
# price of purchased items: equals `price` for positives (ordered = 1) and 0 for negatives (ordered = 0)
transactions_processed['price_calc'] = transactions_processed['price'] * transactions_processed['ordered']

In [None]:
# Order the rows chronologically within each customer (in place), so that the
# cumulative sums computed afterwards run forward in time.
transactions_processed.sort_values(
    by=["customer_id", "t_dat"],
    na_position="first",
    inplace=True,
)

In [None]:
# history of purchases 
# Each feature is a running total per customer minus the row's own contribution,
# i.e. it summarises the rows that come before the current one in the sorted frame.
# NOTE(review): the frame is sorted by (customer_id, t_dat) only, so the order of
# rows sharing a customer and a date is not fixed by the sort keys - confirm that
# including some same-day rows in "previous" is intended.
# NOTE(review): these features are derived from `ordered` on the full frame, before
# create_model splits off the test rows - check for label leakage.
## cumulative sum of purchases
transactions_processed['price_csum'] = transactions_processed.groupby(['customer_id'])['price_calc'].cumsum() - transactions_processed['price_calc']
## number of purchases
transactions_processed['purchase_cnt'] = transactions_processed.groupby(['customer_id'])['ordered'].cumsum() - transactions_processed['ordered']
## number of purchases within the same index group
transactions_processed['purchase_cnt_cat'] = transactions_processed.groupby(['customer_id', 'index_group_name'])['ordered'].cumsum() - transactions_processed['ordered']

## average price of purchases
transactions_processed['price_mean'] = transactions_processed['price_csum'] / transactions_processed['purchase_cnt']
# 0 / 0 (no previous purchases) yields NaN; treat that as an average of 0.
transactions_processed['price_mean'] = transactions_processed['price_mean'].fillna(0)
## ratio between the average price of previous purchases and the item's price
transactions_processed['price_ratio'] = transactions_processed['price_mean'] / transactions_processed['price'] 

In [None]:
## Baseline model with the sum of previous purchases
features = ['sales_channel_id', 'price', 'ordered', 'age', 'price_csum']
create_model(transactions_processed[features])
# the model is significantly better

Mean accuracy:  0.6917868378607132
Classification Metrics:
              precision    recall  f1-score   support

           0       0.66      0.79      0.72     31489
           1       0.74      0.59      0.65     31130

    accuracy                           0.69     62619
   macro avg       0.70      0.69      0.69     62619
weighted avg       0.70      0.69      0.69     62619



In [None]:
## Baseline model with the mean price of previous purchases
features = ['sales_channel_id', 'price', 'ordered', 'age', 'price_mean']
create_model(transactions_processed[features])
# price_mean is a bit worse

Mean accuracy:  0.6619556364681646
Classification Metrics:
              precision    recall  f1-score   support

           0       0.67      0.64      0.66     31489
           1       0.65      0.68      0.67     31130

    accuracy                           0.66     62619
   macro avg       0.66      0.66      0.66     62619
weighted avg       0.66      0.66      0.66     62619



In [None]:
## Baseline model with the number of previous purchases
features = ['sales_channel_id', 'price', 'ordered', 'age', 'purchase_cnt']
create_model(transactions_processed[features])
# purchase_cnt improved the model

Mean accuracy:  0.69929254699053
Classification Metrics:
              precision    recall  f1-score   support

           0       0.66      0.81      0.73     31489
           1       0.76      0.58      0.66     31130

    accuracy                           0.70     62619
   macro avg       0.71      0.70      0.70     62619
weighted avg       0.71      0.70      0.70     62619



In [None]:
## Baseline model with all new features
features = ['sales_channel_id', 'price', 'ordered', 'age', 'price_csum', 'purchase_cnt', 'price_mean']
create_model(transactions_processed[features])
# the combination of all features does not improve the model

Mean accuracy:  0.6937191587217937
Classification Metrics:
              precision    recall  f1-score   support

           0       0.68      0.74      0.71     31489
           1       0.71      0.65      0.68     31130

    accuracy                           0.69     62619
   macro avg       0.70      0.69      0.69     62619
weighted avg       0.70      0.69      0.69     62619



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
## One-hot-encoding for index group name 
transactions_processed = pd.get_dummies(transactions_processed, columns=['index_group_name'])
transactions_processed.columns

Index(['customer_id', 'age', 'article_id', 'sales_channel_id', 'price',
       'ordered', 't_dat', 'age_cat4_1', 'age_cat4_2', 'age_cat4_3',
       'age_cat4_4', 'age_cat10_1', 'age_cat10_2', 'age_cat10_3',
       'age_cat10_4', 'age_cat10_5', 'age_cat10_6', 'age_cat10_7',
       'age_cat10_8', 'age_cat10_9', 'age_cat10_10', 'age_categories_(0, 20]',
       'age_categories_(20, 22]', 'age_categories_(22, 24]',
       'age_categories_(24, 26]', 'age_categories_(26, 28]',
       'age_categories_(28, 30]', 'age_categories_(30, 35]',
       'age_categories_(35, 40]', 'age_categories_(40, 45]',
       'age_categories_(45, 50]', 'age_categories_(50, 60]',
       'age_categories_(60, 100]', 'price_calc', 'price_csum', 'purchase_cnt',
       'purchase_cnt_cat', 'price_mean', 'price_ratio',
       'index_group_name_Baby/Children', 'index_group_name_Divided',
       'index_group_name_Ladieswear', 'index_group_name_Menswear',
       'index_group_name_Sport'],
      dtype='object')

In [None]:
## Baseline model with the number of purchases in categories
features = ['sales_channel_id', 'price', 'ordered', 'age', 'index_group_name_Baby/Children',
       'index_group_name_Divided', 'index_group_name_Ladieswear',
       'index_group_name_Menswear', 'index_group_name_Sport', 'purchase_cnt_cat']
create_model(transactions_processed[features])
# the categories improve the model significantly

Mean accuracy:  0.7196218400166083
Classification Metrics:
              precision    recall  f1-score   support

           0       0.68      0.83      0.75     31489
           1       0.78      0.61      0.68     31130

    accuracy                           0.72     62619
   macro avg       0.73      0.72      0.72     62619
weighted avg       0.73      0.72      0.72     62619



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Purchases in last 7/30/90 days

In [None]:
import datetime
# Parse the dates so that time-based rolling windows can be applied to them.
transactions_processed['t_dat'] = pd.to_datetime(transactions_processed['t_dat'])
# One row per (customer, day): the number of items that customer ordered on that day.
last_purchases = transactions_processed.groupby(['customer_id', 't_dat'])['ordered'].sum().reset_index()

In [None]:
# Purchases in the previous 7 / 30 / 90 days: a time-based rolling sum of `ordered`
# per customer, minus the row's own value so that only earlier days are counted.
for window_days in (7, 30, 90):
    rolling_total = (
        last_purchases.set_index('t_dat')
        .groupby('customer_id')
        .rolling(f'{window_days}D')['ordered']
        .sum()
        .reset_index()['ordered']
    )
    last_purchases[f'purchases_{window_days}'] = rolling_total - last_purchases['ordered']

# Share of the 90-day purchases made within the last 7 days; 0 / 0 (NaN) becomes 0.
last_purchases['purchases_ratio'] = (
    last_purchases['purchases_7'] / last_purchases['purchases_90']
).fillna(0)

In [None]:
# merge with all data
transactions_processed = transactions_processed.merge(
    last_purchases[['customer_id', 't_dat', 'purchases_7', 'purchases_30', 'purchases_90', 'purchases_ratio']], 
    how='inner', on=['customer_id', 't_dat']
)

In [None]:
## Baseline model with purchases in last 90 days and ratio 
features = ['sales_channel_id', 'price', 'ordered', 'age', 'purchases_ratio', 'purchases_90']
create_model(transactions_processed[features])

Mean accuracy:  0.672591386001054
Classification Metrics:
              precision    recall  f1-score   support

           0       0.64      0.79      0.71     31489
           1       0.72      0.56      0.63     31130

    accuracy                           0.67     62619
   macro avg       0.68      0.67      0.67     62619
weighted avg       0.68      0.67      0.67     62619



In [None]:
## Baseline model with purchases in last 30 days and ratio 
features = ['sales_channel_id', 'price', 'ordered', 'age', 'purchases_ratio', 'purchases_30']
create_model(transactions_processed[features])

Mean accuracy:  0.6310864114725563
Classification Metrics:
              precision    recall  f1-score   support

           0       0.64      0.61      0.63     31489
           1       0.62      0.65      0.64     31130

    accuracy                           0.63     62619
   macro avg       0.63      0.63      0.63     62619
weighted avg       0.63      0.63      0.63     62619

