# Feature engineering
Week 2

In [56]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [57]:
# Directory that holds the parquet inputs (also used when writing the result).
path = "Data/"

# Read the three source tables in one pass: transactions, customers, articles.
transactions_train, customers, articles = (
    pd.read_parquet(path + filename)
    for filename in (
        'transactions_train.parquet',
        'customers.parquet',
        'articles.parquet',
    )
)

# Preferred perceived colour group feature
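Customers often have a preference for a specific perceived colour value (for example, some people almost always wear dark clothes). For each customer we take the `perceived_colour_value_id` they purchased most often as their favourite.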


In [58]:
# Attach the perceived colour value of the purchased article to every transaction.
merged = transactions_train.merge(
    articles[['article_id', 'perceived_colour_value_id']],
    on='article_id',
)

# For every customer, pick the colour value that occurs most often among
# their purchases (value_counts sorts by frequency, idxmax returns its label).
colour_ids_by_customer = merged.groupby("customer_id")["perceived_colour_value_id"]
color = {
    customer_id: colour_ids.value_counts().idxmax()
    for customer_id, colour_ids in colour_ids_by_customer
}

# Store the result as a new customer feature; customers that do not appear
# in the mapping get NaN.
customers["favourite_colour"] = customers["customer_id"].map(color)

# Customer price deviation
We can calculate the standard deviation of the prices of the articles each customer purchased. A low deviation is potentially very useful information because it indicates that the customer shops within a narrow price range.

In [59]:
# Per-customer standard deviation of the transaction prices.
# GroupBy.std() uses the sample standard deviation (ddof=1), so a customer with
# a single transaction yields NaN; those are filled with 0.
price_sensitivity = (
    transactions_train.groupby('customer_id')['price']
    .std()
    .fillna(0)
    .rename('price_sensitivity')
    .reset_index()
)

# Left join so that every customer is kept. Dropping a previously added
# 'price_sensitivity' column first keeps this cell re-runnable (otherwise a
# second run would create price_sensitivity_x / price_sensitivity_y).
customers = pd.merge(
    customers.drop(columns=['price_sensitivity'], errors='ignore'),
    price_sensitivity,
    on='customer_id',
    how='left',
)

# Customers with no transactions have no match in the join and get value 0.
# The filled column is assigned back explicitly: calling
# fillna(inplace=True) on a selected column is chained assignment, which is
# deprecated in pandas 2.x and does not modify the frame under Copy-on-Write.
customers['price_sensitivity'] = customers['price_sensitivity'].fillna(0)

# Online channel ratio
We can calculate the ratio of online transactions for each customer. This feature might be useful because it indicates how much a customer prefers to shop online, and online shopping behaviour might be quite different from shopping behaviour in physical stores.


In [60]:
# This notebook treats sales_channel_id == 2 as the online channel.
ONLINE_CHANNEL_ID = 2

# Per-customer transaction counts: all transactions and online-only transactions.
total_transactions = transactions_train.groupby('customer_id').size()
online_transactions = (
    transactions_train[transactions_train['sales_channel_id'] == ONLINE_CHANNEL_ID]
    .groupby('customer_id')
    .size()
)

# Building the frame from both Series aligns them on customer_id; customers
# without any online purchase end up with NaN in 'online_transactions'.
feature = pd.DataFrame({
    'total_transactions': total_transactions,
    'online_transactions': online_transactions,
}).reset_index()

# Customers who never made an online purchase should get value 0.
# Assign back instead of fillna(inplace=True) on a selected column, which is
# chained assignment and does not modify the frame under Copy-on-Write.
feature['online_transactions'] = feature['online_transactions'].fillna(0)

# Calculating the ratio. total_transactions is at least 1 for every row,
# because only customers with transactions appear in the groupby result.
feature['online_ratio'] = feature['online_transactions'] / feature['total_transactions']

# Merge the online_ratio feature into the customers DataFrame.
# how='left' keeps customers that have no transactions at all; the default
# inner join would silently drop them from the output. Dropping a previously
# added 'online_ratio' column first keeps this cell re-runnable.
customers = pd.merge(
    customers.drop(columns=['online_ratio'], errors='ignore'),
    feature[['customer_id', 'online_ratio']],
    on='customer_id',
    how='left',
)

# Customers with no transactions at all have no match in the join and get value 0.
customers['online_ratio'] = customers['online_ratio'].fillna(0)

In [61]:
# Persist the customer table, including the engineered features, next to the inputs.
output_file = path + 'customers_edited.parquet'
customers.to_parquet(output_file)