In [270]:
import pandas as pd
import numpy as np
import random
import datetime
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import gc

In [271]:
articles_filename = './data/articles_sample01.csv.gz'
customers_filename = './data/customers_sample01.csv.gz'
transactions_filename = './data/transactions_sample01.csv.gz'

In [272]:
gc.collect()
articles = pd.read_csv(articles_filename)
customers = pd.read_csv(customers_filename)
transactions = pd.read_csv(transactions_filename)
customer_encoder = preprocessing.LabelEncoder()
article_encoder = preprocessing.LabelEncoder()
customers['customer_id'] = customer_encoder.fit_transform(customers['customer_id'])
articles['article_id'] = article_encoder.fit_transform(articles['article_id'])
transactions['customer_id'] = customer_encoder.transform(transactions['customer_id'])
transactions['article_id'] = article_encoder.transform(transactions['article_id'])
transactions['ordered'] = 1

In [273]:
print("taking recent slice")
n_weeks = 4
last_day = transactions['t_dat'].max()
last_day_date = [int(i) for i in last_day.split('-')]
last_day_date = datetime.date(last_day_date[0], last_day_date[1], last_day_date[2])
min_date = last_day_date - datetime.timedelta(weeks=n_weeks)
min_date = min_date.strftime('%Y-%m-%d')
recent_slice = transactions[transactions['t_dat'] >= min_date].copy()

taking recent slice


In [274]:
print("calculating user purchase count")
recent_purchases = recent_slice.groupby('customer_id')[['article_id']].count()
recent_purchases.rename(columns={'article_id': 'customer_purchase_count'}, inplace=True)
customers = customers.merge(recent_purchases, how='left', on='customer_id')

calculating user purchase count


In [275]:
print("calculating user budget")
recent_customer_spendings = recent_slice.groupby('customer_id')[['price']].mean()
recent_customer_spendings.rename(columns={'price': 'average_customer_budget'}, inplace=True)
customers = customers.merge(recent_customer_spendings, how='left', on='customer_id')
customers.head()

calculating user budget


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,customer_purchase_count,average_customer_budget
0,381,,,ACTIVE,NONE,27.0,9cda5ca5a61e9242461ec0cf8882db45128b7f4e767366...,,
1,226,1.0,1.0,ACTIVE,Regularly,54.0,8fff72abf4e5974ac1e02cac0df7dfba21b13e86e8b90d...,,
2,985,,,ACTIVE,NONE,54.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...,,
3,1286,,,ACTIVE,NONE,48.0,52c8794ad3a1113f149996b96b6b074a5de1ece3c265e4...,,
4,9,1.0,1.0,ACTIVE,Regularly,46.0,796370e7c487836e154d4824186df504b997b1a30cc5c0...,4.0,0.023288


In [276]:
print("calculating article purchase count")
recent_purchases = recent_slice.groupby('article_id')[['article_id']].count()
recent_purchases.rename(columns={'article_id': 'article_purchase_count'}, inplace=True)
articles = articles.merge(recent_purchases, how='left', on='article_id')

calculating article purchase count


In [277]:
print("calculating average article price")
mean_article_prices = transactions.groupby('article_id')[['price']].mean()
mean_article_prices.rename(columns={'price': 'average_article_price'}, inplace=True)
articles = articles.merge(mean_article_prices, how='left', on='article_id')
del recent_slice
customers = customers.fillna(0)
articles = articles.fillna(0)
gc.collect()
articles.head()

calculating average article price


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,article_purchase_count,average_article_price
0,0,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,0.0,0.007347
1,1,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,0.0,0.008136
2,2,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",0.0,0.020322
3,3,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",0.0,0.020754
4,4,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,12,Light Beige,...,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",0.0,0.025407


In [278]:
first_purchases = transactions.sort_values(by="t_dat").groupby('customer_id').first()
first_purchase_counts = first_purchases["article_id"].value_counts()
first_purchase_df = pd.DataFrame(data={'article_id':first_purchase_counts.index, 'first_purchase_count':first_purchase_counts.values})
articles = articles.merge(first_purchase_df, how="left")
articles = articles.fillna(0)

In [279]:
print("removing oldest data from transactions")
n_weeks = 20
if n_weeks >= 0:
  last_day = transactions['t_dat'].max()
  last_day_date = [int(i) for i in last_day.split('-')]
  last_day_date = datetime.date(last_day_date[0], last_day_date[1], last_day_date[2])
  min_date = last_day_date - datetime.timedelta(weeks=n_weeks)
  min_date = min_date.strftime('%Y-%m-%d')
  transactions = transactions[transactions['t_dat'] >= min_date]
gc.collect()

removing oldest data from transactions


22

Activity-based features, for each customer: determine how many times they bought in the last n weeks

In [None]:
activity_features = []
for n_weeks in range(1, 53, 4):
  col_name = "activity_" + str(n_weeks) + "_weeks"
  last_day = transactions['t_dat'].max()
  last_day_date = [int(i) for i in last_day.split('-')]
  last_day_date = datetime.date(last_day_date[0], last_day_date[1], last_day_date[2])
  min_date = last_day_date - datetime.timedelta(weeks=n_weeks)
  min_date = min_date.strftime('%Y-%m-%d')
  recent_view = transactions['t_dat'] >= min_date
  customer_counts = transactions[recent_view]['customer_id'].value_counts()
  temp_df = pd.DataFrame(data={
      'customer_id': customer_counts.index,
      col_name: customer_counts.values
  })
  customers = customers.merge(temp_df, how='left')
  customers = customers.fillna(0)
  activity_features.append(col_name)


In [281]:
print("generating negative samples")
# negative sample creation
positive_pairs = list(map(tuple, transactions[['customer_id', 'article_id']].drop_duplicates().values))

real_dates = transactions["t_dat"].unique()
real_customers = transactions["customer_id"].unique()
real_articles = transactions["article_id"].unique()
real_channels = transactions["sales_channel_id"].unique()
article_and_price = transactions[["article_id","price"]].drop_duplicates("article_id").set_index("article_id").squeeze()
num_neg_pos = transactions.shape[0]
random.seed(42)
num_neg_samples = int(num_neg_pos * 1.1)
neg_dates = np.random.choice(real_dates, size=num_neg_samples)
neg_articles = np.random.choice(real_articles, size=num_neg_samples)
neg_customers = np.random.choice(real_customers, size=num_neg_samples)
neg_channels = np.random.choice(real_channels, size=num_neg_samples)
ordered = np.array([0] * num_neg_samples)
neg_prices = article_and_price[neg_articles].values
neg_transactions = pd.DataFrame([neg_dates, neg_customers, neg_articles, neg_prices, neg_channels, ordered], index=transactions.columns).T
df = neg_transactions[
    ~neg_transactions.set_index(["customer_id", "article_id"]).index.isin(positive_pairs)
]
chosen_neg_transactions = df.sample(num_neg_pos)
transactions = pd.concat([transactions, chosen_neg_transactions])
transactions = transactions.merge(customers, how="inner", on='customer_id')
transactions = transactions.merge(articles, how="inner", on='article_id')
transactions["price_discrepancy"] = transactions["average_article_price"] - transactions["average_customer_budget"]
del neg_transactions

generating negative samples


In [282]:
print("slicing processed transactions")
feature_names = ['customer_id', 'age', 'sales_channel_id', 'article_id', 'price', 'ordered', 
            'product_type_no', 'customer_purchase_count', 'article_purchase_count', 
            'average_customer_budget', 'average_article_price', 'price_discrepancy', 
            'first_purchase_count'] + activity_features
transactions_processed = transactions[feature_names].copy()
transactions_processed.head()
del transactions
gc.collect()
transactions_processed = transactions_processed.fillna(0)
transactions_processed.isnull().values.any()
transactions_processed = pd.get_dummies(transactions_processed, columns=['sales_channel_id'])


slicing processed transactions


In [283]:
print("writing processed transactions to csv")
# write transactions_processed to csv so it can be re-used
processed_filename = "./data/transactions_processed.csv"
transactions_processed.to_csv(processed_filename, index=False)

writing processed transactions to csv


In [284]:
# if file has been generated previously, can be retrieved from cache here
transactions_processed = pd.read_csv(processed_filename)


In [285]:
print('generating global candidates')
most_popular_count = 20
popular_items = transactions_processed[transactions_processed['ordered']==1].drop_duplicates(subset='article_id')
popular_items.sort_values(by='article_purchase_count', ascending=False, inplace=True)
popular_items.drop(['customer_id', 'ordered', 'customer_purchase_count', 'average_customer_budget', 'price_discrepancy', 'age'], axis=1, inplace=True)
popular_items = popular_items.iloc[:most_popular_count]
popular_items.head()

generating global candidates


Unnamed: 0,article_id,price,product_type_no,article_purchase_count,average_article_price,first_purchase_count,activity_1_weeks,activity_5_weeks,activity_9_weeks,activity_13_weeks,...,activity_21_weeks,activity_25_weeks,activity_29_weeks,activity_33_weeks,activity_37_weeks,activity_41_weeks,activity_45_weeks,activity_49_weeks,sales_channel_id_1,sales_channel_id_2
3042,10433,0.033288,272,8.0,0.032163,0.0,4.0,14.0,14.0,14.0,...,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,0,1
2474,15540,0.033881,254,5.0,0.033506,0.0,0.0,4.0,9.0,9.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,0,1
1650,17096,0.042356,273,5.0,0.041699,0.0,1.0,1.0,8.0,14.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,0,1
3742,17024,0.042356,272,5.0,0.042356,0.0,7.0,7.0,16.0,30.0,...,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,0,1
4245,1558,0.033881,66,4.0,0.033229,2.0,8.0,59.0,100.0,108.0,...,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,0,1


In [286]:
print("training model")
X_train, X_test, y_train, y_test = train_test_split(transactions_processed.drop('ordered', axis=1), transactions_processed['ordered'], test_size=0.10, random_state=42)
# model = MLPClassifier(hidden_layer_sizes=(9, 20, 9), max_iter=300, random_state=42) #9 20 9
model = RandomForestClassifier(random_state=42)
model = model.fit(X_train, y_train)
predictions = model.predict_proba(X_test)
print("validation score:", model.score(X_test, y_test))

training model
validation score: 0.7646610814927647


In [287]:
print("generating personalised candidates")
customer_temp = customers[['customer_id', 'average_customer_budget', 'customer_purchase_count', 'age']].drop_duplicates()
candidate_pairs = customer_temp.merge(popular_items, how='cross')
candidate_pairs['price_discrepancy'] = candidate_pairs["average_article_price"] - candidate_pairs["average_customer_budget"]
candidate_pairs = candidate_pairs.reindex(columns=X_train.columns)
candidate_pairs.head()

generating personalised candidates


Unnamed: 0,customer_id,age,article_id,price,product_type_no,customer_purchase_count,article_purchase_count,average_customer_budget,average_article_price,price_discrepancy,...,activity_21_weeks,activity_25_weeks,activity_29_weeks,activity_33_weeks,activity_37_weeks,activity_41_weeks,activity_45_weeks,activity_49_weeks,sales_channel_id_1,sales_channel_id_2
0,381,27.0,10433,0.033288,272,0.0,8.0,0.0,0.032163,0.032163,...,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,0,1
1,381,27.0,15540,0.033881,254,0.0,5.0,0.0,0.033506,0.033506,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,0,1
2,381,27.0,17096,0.042356,273,0.0,5.0,0.0,0.041699,0.041699,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,0,1
3,381,27.0,17024,0.042356,272,0.0,5.0,0.0,0.042356,0.042356,...,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,0,1
4,381,27.0,1558,0.033881,66,0.0,4.0,0.0,0.033229,0.033229,...,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,0,1


In [288]:
print("predicting recommendations")
from copy import deepcopy
temp = deepcopy(candidate_pairs)
temp = temp.fillna(0)
temp[["prediction0", 'prediction']] = model.predict_proba(temp)
temp = temp[["customer_id", "article_id", "prediction"]]
temp.sort_values(['customer_id', 'prediction'], ascending=False, inplace=True)
temp["customer_id"] = customer_encoder.inverse_transform(temp['customer_id'])

predicting recommendations


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [289]:
temp.groupby('customer_id').head(12)

Unnamed: 0,customer_id,article_id,prediction
9617,fff89d57674d5fbe315d0a3deb65c1068d64622a310272...,16264,0.84
9608,fff89d57674d5fbe315d0a3deb65c1068d64622a310272...,16800,0.83
9603,fff89d57674d5fbe315d0a3deb65c1068d64622a310272...,17024,0.82
9604,fff89d57674d5fbe315d0a3deb65c1068d64622a310272...,1558,0.82
9609,fff89d57674d5fbe315d0a3deb65c1068d64622a310272...,15866,0.80
...,...,...,...
8498,003f3a1e27583a04278d8c03d6bd30446ccafad37f5601...,17006,0.62
8482,003f3a1e27583a04278d8c03d6bd30446ccafad37f5601...,17096,0.59
8480,003f3a1e27583a04278d8c03d6bd30446ccafad37f5601...,10433,0.58
8495,003f3a1e27583a04278d8c03d6bd30446ccafad37f5601...,7702,0.57


In [290]:
print("writing recommendations")
predictions_file = "predictions.csv"
recommendation_count = 12
prediction_df = temp.groupby('customer_id').head(recommendation_count)
prediction_df['article_id'] = article_encoder.inverse_transform(prediction_df['article_id'])
prediction_df['prediction'] = prediction_df['article_id'].apply(str)
prediction_df["prediction"] = "0" + prediction_df["prediction"]
prediction_df.groupby('customer_id').agg({'prediction': " ".join}).to_csv(predictions_file)

writing recommendations


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
