# Lecture 2: Introduction to Feature Engineering

## Setup

In [None]:
import numpy as np
import pandas
import pandas as pd
import random

import sklearn.preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing

In [None]:
# Load the four competition CSV files from ../data, in the order listed.
articles, customers, sample_submisison, transactions = (
    pd.read_csv(f'../data/{file_stem}.csv')
    for file_stem in ('articles', 'customers', 'sample_submission', 'transactions_train')
)

## The H&M Dataset

In [None]:
articles.info()

In [None]:
customers.info()

In [None]:
transactions.info()

In [None]:
# X = transactions.merge(customers, how='inner', on='customer_id')
# X = X.merge(articles, how='inner', on='article_id')

### Creating Samples 
If you would rather work with samples instead of the whole dataset (while prototyping your code), you can use the code below:

In [None]:
# Adapted from: https://www.kaggle.com/code/paweljankiewicz/hm-create-dataset-samples
# Writes three sampled datasets (0.1%, 1% and 5% of all customers) together with
# those customers' transactions and the articles that appear in them.
sample_fractions = {"01": 0.001, "1": 0.01, "5": 0.05}
for sample_repr, sample in sample_fractions.items():
    print(sample)

    # Draw the customers first; everything else is derived from that draw.
    n_customers = int(customers.shape[0] * sample)
    customers_sample = customers.sample(n_customers, replace=False)
    customers_sample_ids = set(customers_sample["customer_id"])

    # Keep only transactions made by the sampled customers ...
    by_sampled_customer = transactions["customer_id"].isin(customers_sample_ids)
    transactions_sample = transactions[by_sampled_customer]

    # ... and only the articles that occur in those transactions.
    articles_sample_ids = set(transactions_sample["article_id"])
    in_sampled_transactions = articles["article_id"].isin(articles_sample_ids)
    articles_sample = articles[in_sampled_transactions]

    customers_sample.to_csv(f"../data/customers_sample{sample_repr}.csv.gz", index=False)
    transactions_sample.to_csv(f"../data/transactions_sample{sample_repr}.csv.gz", index=False)
    articles_sample.to_csv(f"../data/articles_sample{sample_repr}.csv.gz", index=False)

In [None]:
# Choose which of the sample files written above to load:
# "01" (0.1% of customers), "1" (1%) or "5" (5%).
sample_tag = "5"
articles_sample = pd.read_csv(f'../data/articles_sample{sample_tag}.csv.gz')
customers_sample = pd.read_csv(f'../data/customers_sample{sample_tag}.csv.gz')
transactions_sample = pd.read_csv(f'../data/transactions_sample{sample_tag}.csv.gz')

In [None]:
customers_sample.info()

In [None]:
transactions_sample.info()

## A Simplified Task: Binary Classification

The task of predicting which 12 items users are most likely to buy in the next week is difficult to translate to a traditional classification machine learning setting. 
To obtain the 12 items a user is most likely to buy, we need to make predictions for all items (or the ones selected by a baseline) and select the 12 that have the highest predicted scores.

In this assignment, we'll consider a simplified task: Predict whether a user ordered a single item or not, based on the features of the user and the item. 
We provide a baseline logistic regression model below, but haven't done much feature preprocessing or engineering!
Initially, it is always best to focus your efforts on getting your features in the right shape and setting up the right validation scheme and baselines.
Once you are sure that your features add value and your validation scheme is correct, then you typically move on to trying more elaborate models.

### Creating the Dataset

In [None]:
# From here on the notebook works on the sampled frames loaded above.
# Comment out this cell if you'd rather use the whole dataset.
transactions, customers, articles = transactions_sample, customers_sample, articles_sample

In [None]:
transactions['ordered'] = 1

The problem setting is an example of a "PU learning" problem, i.e. only positives are labeled, everything else is unlabeled (and can be either positive or negative). 
Of course, we cannot train a classifier with just positive samples: The classifier will just learn that everything is positive.
Therefore, we need to manually generate negative samples.

Below, we use a simple random negative sampling strategy.
We want to create a balanced dataset, meaning that we have just as many positives as negatives.
This makes sure that the classifier will not benefit from predicting the positive/negative class more often than the other.
Realistically, the amount of positive samples is of course many times smaller than the amount of unlabeled, possibly negative instances.


If you want to try your hand at a more complex negative sampling strategy, you may want to check out this blog as a starting point: https://medium.com/mlearning-ai/overview-negative-sampling-on-recommendation-systems-230a051c6cd7.



In [None]:
transactions.head()

In [None]:
# What's happening here?
# We're creating negative samples, i.e. transactions that didn't actually occur.
# To do that we first need every (customer, article) pair that did occur:
purchased_pairs = transactions[['customer_id', 'article_id']].drop_duplicates()
positive_pairs = list(purchased_pairs.itertuples(index=False, name=None))

In [None]:
# Then we need to know what every synthetic transaction should contain: a date, a customer_id, an article_id, price, sales_channel_id. We will set ordered = 0, as these transactions didn't really occur.
transactions.head()

In [None]:
# Collect the distinct values observed in the real transactions; the negative
# samples are assembled from these.
real_dates, real_customers, real_articles, real_channels = (
    transactions[column].unique()
    for column in ("t_dat", "customer_id", "article_id", "sales_channel_id")
)
# One price per article (the first one encountered), indexed by article_id.
article_and_price = (
    transactions[["article_id", "price"]]
    .drop_duplicates("article_id")
    .set_index("article_id")
    .squeeze()
)

In [None]:
# How many negatives do we need to sample?
num_neg_pos = transactions.shape[0]
print(num_neg_pos)

In [None]:
# Sampling negatives by selecting random users, articles, dates and sales channel:
# Note: This is quite naive. Some articles may not even have been available at the date we are sampling.
# All draws below use np.random.choice, which reads NumPy's global generator.
# Seeding only the stdlib `random` module does not affect it, so NumPy is
# seeded as well to make the negatives reproducible.
random.seed(42)
np.random.seed(42)

# Afterwards, we need to remove potential duplicates, so we'll sample too many.
num_neg_samples = int(num_neg_pos * 1.1)

# Sample each of the independent attributes.
neg_dates = np.random.choice(real_dates, size=num_neg_samples)
neg_articles = np.random.choice(real_articles, size=num_neg_samples)
neg_customers = np.random.choice(real_customers, size=num_neg_samples)
neg_channels = np.random.choice(real_channels, size=num_neg_samples)
# Label for every synthetic transaction: not ordered.
ordered = np.zeros(num_neg_samples, dtype=int)
# Assign to every article a real price.
neg_prices = article_and_price[neg_articles].values

In [None]:
# Build the frame column by column so every attribute keeps its own dtype.
# Stacking the arrays as rows and transposing would turn all columns,
# including the price and the `ordered` label, into `object` dtype.
neg_columns = [neg_dates, neg_customers, neg_articles, neg_prices, neg_channels, ordered]
if len(neg_columns) != len(transactions.columns):
    raise ValueError(
        f"Expected {len(neg_columns)} transaction columns, found {len(transactions.columns)}"
    )
# The arrays are matched to `transactions` columns purely by position.
neg_transactions = pd.DataFrame(dict(zip(transactions.columns, neg_columns)))

In [None]:
# Result:
neg_transactions.head()

In [None]:
neg_transactions.shape

In [None]:
# Remove random negative samples that actually coincide with positives
coincides_with_positive = (
    neg_transactions.set_index(["customer_id", "article_id"]).index.isin(positive_pairs)
)
df = neg_transactions[~coincides_with_positive]

# Remove any excess. A fixed random_state keeps the chosen negatives (and
# therefore every score computed later) the same from run to run.
chosen_neg_transactions = df.sample(num_neg_pos, random_state=42)

In [None]:
# Stack the negatives under the positives, then attach the customer and
# article attributes. Inner joins drop rows without a matching customer/article.
transactions = (
    pd.concat([transactions, chosen_neg_transactions])
    .merge(customers, how="inner", on='customer_id')
    .merge(articles, how="inner", on='article_id')
)

In [None]:
transactions.info()

### Basic Preprocessing
Some very basic preprocessing.

In [None]:
# I'm dropping a lot of columns, use them in your engineering tasks!
transactions_processed = transactions[['customer_id', 'age', 'article_id', 'sales_channel_id', 'price', 'ordered']].copy()
transactions_processed.head()

In [None]:
# Does it make sense to label encode?
# Fit one label encoder per ID column (customer and article IDs):
customer_encoder = preprocessing.LabelEncoder().fit(transactions_processed['customer_id'])
article_encoder = preprocessing.LabelEncoder().fit(transactions_processed['article_id'])

In [None]:
transactions_processed['customer_id'] = customer_encoder.transform(transactions_processed['customer_id'])
transactions_processed['article_id'] = article_encoder.transform(transactions_processed['article_id'])

In [None]:
# If you want to go back to the original encoding:
customer_encoder.inverse_transform([2])

In [None]:
transactions_processed.head()

In [None]:
# Can you come up with a NaN strategy that makes sense for each column in the dataset?
# Imputing all NaN values with zeros:
transactions_processed = transactions_processed.fillna(0)
transactions_processed.isnull().values.any()

In [None]:
# Does it make sense to one-hot encode?
# One-hot-encoding sales_channel_id:
transactions_processed = pd.get_dummies(transactions_processed, columns=['sales_channel_id'])

In [None]:
transactions_processed.head()

In [None]:
# Creating a Train / Test Split (90% train, 10% test):
features = transactions_processed.drop(columns='ordered')
target = transactions_processed['ordered']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.10, random_state=42)

In [None]:
X_train.head()

In [None]:
y_train.head()

## Baseline Model

In [None]:
# Fitting takes a few minutes when the whole dataset is used:
baseline = LogisticRegression(random_state=42, n_jobs=6).fit(X_train, y_train)

In [None]:
baseline.predict_proba(X_test)

In [None]:
y_test

In [None]:
# Mean Accuracy:
baseline.score(X_test, y_test)
# As you can see, the accuracy is ~0.51. In other words, the classifier correctly predicts whether a customer did or didn't buy an item 51% of the time.
# Can you improve this baseline logistic regression model by doing better preprocessing and generating new features?
# Also think about my steps! Did it make sense to include the article and customer ids? (And things like that)

In [None]:
# Classification Metrics:
predictions = baseline.predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
predictions

## Assignment: Feature engineering
**TODO:** 
- In groups (of 2-3 students), think about (a few) features that can be engineered (preprocess and generate new features). Divide the work!
- Do these engineered features improve the baseline model?
- Add your thoughts & results to a slide deck for discussion next week (again, 1 slide per person).


# prod_name Word2Vec
Evaluation may or may not work. Copying the code to a py file works more reliably.
Reducing vector size may work (but give worse performance).


The following blocks of code were originally run in a .py file (for stability/easier debugging) and copied over to the notebook.

In [None]:
import gensim
import string
from nltk import word_tokenize
def evaluate(frame, vec_size=50, extra_vec=None):
    """
    Evaluates frame using LogisticRegression.

    Drops the ID columns, standardises the numeric feature columns, fits a
    logistic regression on a 90/10 train/test split and prints the
    classification report for the test part. Returns nothing.

    frame:     DataFrame containing 'customer_id', 'article_id', 'age',
               'price', 'ordered' and any further feature columns.
    vec_size:  number of embedding columns to standardise. They are looked up
               as 'f_0' ... 'f_{vec_size - 1}'; if 'f_0' is not a column, the
               integer labels 0 ... vec_size - 1 are used instead. Pass 0 when
               the frame has no embedding columns.
    extra_vec: optional list of additional column names to standardise.
    """
    print("Evaluating...")
    frame = frame.copy()
    seed = 42
    frame = frame.drop(['customer_id', 'article_id'], axis=1)
    scaler = preprocessing.StandardScaler()
    # scaler = preprocessing.MinMaxScaler()
    # NOTE: the scaler is fit on the whole frame before the train/test split
    # below, so statistics of the test rows influence the scaled training rows.
    frame[['age', 'price']] = scaler.fit_transform(frame[['age', 'price']])
    if vec_size > 0:
        # Indexing with range(vec_size) would look for integer column labels
        # and raise KeyError on columns that are named 'f_<i>'.
        vec_idx = [f'f_{i}' for i in range(vec_size)]
        if vec_idx[0] not in frame.columns:
            vec_idx = list(range(vec_size))
        frame[vec_idx] = scaler.fit_transform(frame[vec_idx])
    if extra_vec is not None:
        frame[extra_vec] = scaler.fit_transform(frame[extra_vec])
    print(frame)
    X_train, X_test, y_train, y_test = train_test_split(frame.drop('ordered', axis=1), frame['ordered'], test_size=0.10,
                                                        random_state=seed)
    # Will take a few minutes to run, if you're using the whole dataset:
    better = LogisticRegression(random_state=seed, n_jobs=7, verbose=False, max_iter=1000)
    better = better.fit(X_train, y_train)
    better_predictions = better.predict(X_test)
    print(classification_report(y_test, better_predictions))

In [None]:
vec_size1 = 50

print("Starting implementation Word2Vec")
# prepare transactions
# same as for the baseline
transactions_processed = transactions[
    ['customer_id', 'age', 'article_id', 'prod_name', 'sales_channel_id', 'price', 'ordered']].copy()
customer_encoder = preprocessing.LabelEncoder()
article_encoder = preprocessing.LabelEncoder()
transactions_processed['customer_id'] = customer_encoder.fit_transform(transactions_processed['customer_id'])
transactions_processed['article_id'] = article_encoder.fit_transform(transactions_processed['article_id'])
transactions_processed = transactions_processed.fillna(0)
transactions_processed = pd.get_dummies(transactions_processed, columns=['sales_channel_id'])

# setup training set for word2vec
# Each article becomes one "sentence": its product name followed by its
# categorical attributes, each kept whole as a single token.
train_frame = articles[
    ['prod_name', 'product_type_name', 'product_group_name', 'graphical_appearance_name', 'department_name',
     'index_name', 'index_group_name', 'section_name', 'garment_group_name']].drop_duplicates()
# Convert the rows to token lists directly. Joining on ',' and splitting again
# would cut any value that itself contains a comma into pieces, and the full
# product name would then be missing from the vocabulary at lookup time.
data = train_frame.astype(str).values.tolist()

# initialise and train model
model = gensim.models.Word2Vec(min_count=1,
                               vector_size=vec_size1,
                               workers=7,
                               window=3,
                               sg=0)
model.build_vocab(data)
model.train(data, total_examples=model.corpus_count, epochs=30)

# distinct product names that need a vector
p_names = transactions_processed[['prod_name']].drop_duplicates().reset_index(drop=True)
# convert all names into vectors with one lookup (row i belongs to name i);
# growing an array with np.append copies it on every iteration
names = p_names['prod_name'].tolist()
v = np.asarray(model.wv[names], dtype=float) if names else np.empty((0, vec_size1))

# make a dataframe containing name and vector
df = pd.DataFrame(v, columns=[f'f_{i}' for i in range(vec_size1)])
df = pd.concat([p_names, df], axis=1)

# merge dataframe with transactions
transactions_processed = transactions_processed.merge(df, on='prod_name')

# drop name and evaluate
transactions_processed = transactions_processed.drop(['prod_name'], axis=1)
evaluate(transactions_processed, vec_size=vec_size1)

# detailed_desc Doc2Vec

Evaluation may or may not work. Copying the code to a py file works more reliably.
Reducing vector size may work (but give worse performance).

In [None]:
vec_size2 = 50

print("Starting implementation Doc2Vec")

# prepare transactions
# same as for the baseline
transactions_processed = transactions[
    ['customer_id', 'age', 'article_id', 'prod_name', 'sales_channel_id', 'price', 'ordered', 'detail_desc']].copy()
customer_encoder = preprocessing.LabelEncoder()
article_encoder = preprocessing.LabelEncoder()
transactions_processed['customer_id'] = customer_encoder.fit_transform(transactions_processed['customer_id'])
transactions_processed['article_id'] = article_encoder.fit_transform(transactions_processed['article_id'])
transactions_processed = transactions_processed.fillna(0)
transactions_processed = pd.get_dummies(transactions_processed, columns=['sales_channel_id'])

# Translation table that turns every punctuation character into a space.
# Built once and reused for both the training texts and the inference texts.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# make training set: one lower-cased, punctuation-free token list per
# distinct (prod_name, detail_desc) pair
train_frame = articles[['prod_name', 'detail_desc']].drop_duplicates()
data2 = [word_tokenize(str(text).lower().translate(punct_to_space)) for text in train_frame['detail_desc']]

# train model
print("Starting training Doc2Vec")
model2 = gensim.models.Doc2Vec(min_count=1,
                               vector_size=vec_size2,
                               workers=7,
                               window=3)

data2 = [gensim.models.doc2vec.TaggedDocument(d, [i]) for i, d in enumerate(data2)]
model2.build_vocab(data2)
model2.train(data2, total_examples=model2.corpus_count, epochs=30)

# distinct (name, description) pairs that need a vector
p_desc = transactions_processed[['prod_name', 'detail_desc']].drop_duplicates().reset_index(drop=True)

# transform descriptions into vectors
print("Starting vectorization")
desc_vectors = []
for i, a in enumerate(p_desc['detail_desc'].values):
    desc = word_tokenize(str(a).lower().translate(punct_to_space))
    desc_vectors.append(model2.infer_vector(desc))
    if i % 1000 == 0:
        print(f"\rProcessed {i} / {p_desc.shape[0]} ({i/p_desc.shape[0]*100:.2f}%) rows", end="")
print()
# Stack once at the end; np.append inside the loop copies the array each time.
v = np.vstack(desc_vectors).astype(float) if desc_vectors else np.empty((0, vec_size2))

# make dataframe with name + desc + vector
df = pd.DataFrame(v, columns=[f'f_{i}' for i in range(vec_size2)])
df = pd.concat([p_desc, df], axis=1)

# merge dataframe with transactions
# `df` has one row per (prod_name, detail_desc) pair, so both columns are the
# join key. Joining on prod_name alone would duplicate a transaction for every
# distinct description that shares its product name.
transactions_processed = transactions_processed.merge(df, on=['prod_name', 'detail_desc'])

# drop product name and description
transactions_processed = transactions_processed.drop(['prod_name', 'detail_desc'], axis=1)

# evaluate
evaluate(transactions_processed, vec_size2)

# W2V D2V Together

Evaluation may or may not work. Copying the code to a py file works more reliably.
Reducing the vector sizes in the previous models is almost a requirement, since the combined 100-dimensional vectors will not be classified. A length of 25 for each vector seems like a good starting point.
***Requires trained models from previous parts.***

In [None]:
print("Starting implementation combination")

# prepare transactions
# same as for the baseline
transactions_processed = transactions[
    ['customer_id', 'age', 'article_id', 'prod_name', 'sales_channel_id', 'price', 'ordered', 'detail_desc']].copy()
customer_encoder = preprocessing.LabelEncoder()
article_encoder = preprocessing.LabelEncoder()
transactions_processed['customer_id'] = customer_encoder.fit_transform(transactions_processed['customer_id'])
transactions_processed['article_id'] = article_encoder.fit_transform(transactions_processed['article_id'])
transactions_processed = transactions_processed.fillna(0)
transactions_processed = pd.get_dummies(transactions_processed, columns=['sales_channel_id'])

# distinct (name, description) pairs that need a vector
p_desc = transactions_processed[['prod_name', 'detail_desc']].drop_duplicates().reset_index(drop=True)

# Translation table that turns every punctuation character into a space.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# make a vector for each product: Word2Vec name vector followed by the
# inferred Doc2Vec description vector (uses `model` and `model2` trained above)
print("Starting vectorization")
combined_vectors = []
for i, (p_name, desc) in enumerate(p_desc.values):
    desc = word_tokenize(str(desc).lower().translate(punct_to_space))
    desc_vec = model2.infer_vector(desc)
    name_vec = model.wv[p_name]
    combined_vectors.append(np.concatenate((name_vec, desc_vec), axis=None))
    if i % 1000 == 0:
        print(f"\rProcessed {i} / {p_desc.shape[0]} ({i/p_desc.shape[0]*100:.2f}%) rows", end="")
print()
# Stack once at the end; np.append inside the loop copies the array each time.
if combined_vectors:
    v = np.vstack(combined_vectors).astype(float)
else:
    v = np.empty((0, vec_size1 + vec_size2))

# make a dataframe out of the pairs and their vectors
df = pd.DataFrame(v, columns=[f'f_{i}' for i in range(vec_size1 + vec_size2)])
df = pd.concat([p_desc, df], axis=1)

# merge dataframe with transactions
# `df` has one row per (prod_name, detail_desc) pair, so both columns are the
# join key. Joining on prod_name alone would duplicate a transaction for every
# distinct description that shares its product name.
transactions_processed = transactions_processed.merge(df, on=['prod_name', 'detail_desc'])
# drop name and description
transactions_processed = transactions_processed.drop(['prod_name', 'detail_desc'], axis=1)
#evaluate
evaluate(transactions_processed, vec_size1+vec_size2)

# Recent popularity

Somewhat slow

In [None]:
# prepare transactions
# same as for the baseline
from bisect import bisect_left, bisect_right

transactions_processed = transactions[
    ['customer_id', 'age', 'article_id', 'sales_channel_id', 'price', 'ordered', 't_dat']].copy()
customer_encoder = preprocessing.LabelEncoder()
article_encoder = preprocessing.LabelEncoder()
transactions_processed['customer_id'] = customer_encoder.fit_transform(transactions_processed['customer_id'])
transactions_processed['article_id'] = article_encoder.fit_transform(transactions_processed['article_id'])
transactions_processed = transactions_processed.fillna(0)
transactions_processed = pd.get_dummies(transactions_processed, columns=['sales_channel_id'])

# process dates
transactions_processed['t_dat'] = pd.to_datetime(transactions_processed['t_dat'], format='%Y-%m-%d')
transactions_processed = transactions_processed.sort_values(by=['t_dat'])

# make dict of purchase dates: article_id -> list of purchase dates.
# Rows are visited in date order, so every list ends up sorted ascending.
purchase_dates = {}
purchases = transactions_processed[transactions_processed['ordered'] == 1]  # only keep purchases
purchases = purchases.sort_values(by=['t_dat'])
for index, (article_id, date) in enumerate(zip(purchases['article_id'], purchases['t_dat'])):
    purchase_dates.setdefault(article_id, []).append(date)
    if index % 10000 == 0:
        print(f"\rProcessed {index} / {purchases.shape[0]} rows", end="")
print()

# make a list of #recent purchases: for every row, the number of purchases of
# its article within the 7 days up to and including the row's date
one_week = pd.Timedelta(days=7)
rec_purchase_num = []
# Columns are read by name; positional indices would silently point at the
# wrong column as soon as the column list above changes.
rows = zip(transactions_processed['article_id'],
           transactions_processed['t_dat'],
           transactions_processed['ordered'])
for index, (article_id, date, ordered_flag) in enumerate(rows):
    dates = purchase_dates.get(article_id)
    if dates is not None:
        # The list is sorted, so the dates d with date - 7d <= d <= date form
        # one contiguous slice whose bounds bisect finds without a full scan.
        in_window = bisect_right(dates, date) - bisect_left(dates, date - one_week)
        # Subtract the row's own purchase (ordered == 1) so the feature does
        # not count the transaction it is attached to.
        rec_purchase_num.append(in_window - ordered_flag)
    else:
        rec_purchase_num.append(0)
    if index % 10000 == 0:
        print(f"\rProcessed {index} / {transactions_processed.shape[0]} ({index/transactions_processed.shape[0]*100:.2f}%) rows", end="")
print()

# add list to transactions
transactions_processed['rec_purchases'] = rec_purchase_num
print(transactions_processed.head(10))

# drop dates
transactions_processed = transactions_processed.drop(['t_dat'], axis=1)
# evaluate
evaluate(transactions_processed, vec_size=0, extra_vec=['rec_purchases'])

# W2V and Popularity

Evaluation may or may not work. Copying the code to a py file works more reliably.
Reducing vector size may work (but give worse performance).
Requires trained models from previous parts.

In [None]:
# The name vectors come from the Word2Vec model trained in an earlier cell, so
# their width is read from that model. A hard-coded 50 stops matching as soon
# as the model is retrained with a different vector size.
vec_size = model.wv.vector_size

from bisect import bisect_left, bisect_right

print("Starting implementation combination w2v, pop")

# prepare transactions
transactions_processed = transactions[
    ['customer_id', 'age', 'article_id', 'sales_channel_id', 'price', 'ordered', 't_dat', 'prod_name']].copy()
customer_encoder = preprocessing.LabelEncoder()
article_encoder = preprocessing.LabelEncoder()
transactions_processed['customer_id'] = customer_encoder.fit_transform(transactions_processed['customer_id'])
transactions_processed['article_id'] = article_encoder.fit_transform(transactions_processed['article_id'])
transactions_processed = transactions_processed.fillna(0)
transactions_processed = pd.get_dummies(transactions_processed, columns=['sales_channel_id'])

transactions_processed['t_dat'] = pd.to_datetime(transactions_processed['t_dat'], format='%Y-%m-%d')
transactions_processed = transactions_processed.sort_values(by=['t_dat'])

# w2v part
# generate vectors for all product names with one lookup (row i belongs to
# name i); growing an array with np.append copies it on every iteration
print("Starting vectorization")
p_names = transactions_processed[['prod_name']].drop_duplicates().reset_index(drop=True)
names = p_names['prod_name'].tolist()
v = np.asarray(model.wv[names], dtype=float) if names else np.empty((0, vec_size))
df = pd.DataFrame(v, columns=[f'f_{i}' for i in range(vec_size)])
df = pd.concat([p_names, df], axis=1)

# pop part
# make dict of purchase dates: article_id -> list of purchase dates.
# Rows are visited in date order, so every list ends up sorted ascending.
purchase_dates = {}
purchases = transactions_processed[transactions_processed['ordered'] == 1]  # only keep purchases
purchases = purchases.sort_values(by=['t_dat'])
for index, (article_id, date) in enumerate(zip(purchases['article_id'], purchases['t_dat'])):
    purchase_dates.setdefault(article_id, []).append(date)
    if index % 10000 == 0:
        print(f"\rProcessed {index} / {purchases.shape[0]} rows", end="")
print()

# make a list of #recent purchases: for every row, the number of purchases of
# its article within the 7 days up to and including the row's date
one_week = pd.Timedelta(days=7)
rec_purchase_num = []
# Columns are read by name; positional indices would silently point at the
# wrong column as soon as the column list above changes.
rows = zip(transactions_processed['article_id'],
           transactions_processed['t_dat'],
           transactions_processed['ordered'])
for index, (article_id, date, ordered_flag) in enumerate(rows):
    dates = purchase_dates.get(article_id)
    if dates is not None:
        # The list is sorted, so the dates d with date - 7d <= d <= date form
        # one contiguous slice whose bounds bisect finds without a full scan.
        in_window = bisect_right(dates, date) - bisect_left(dates, date - one_week)
        # Subtract the row's own purchase (ordered == 1) so the feature does
        # not count the transaction it is attached to.
        rec_purchase_num.append(in_window - ordered_flag)
    else:
        rec_purchase_num.append(0)
    if index % 10000 == 0:
        print(f"\rProcessed {index} / {transactions_processed.shape[0]} ({index/transactions_processed.shape[0]*100:.2f}%) rows", end="")

print()

# add recent purchases to frame
transactions_processed['rec_purchases'] = rec_purchase_num
# merge with product name vectors
transactions_processed = transactions_processed.merge(df, on='prod_name')
# remove helper columns
transactions_processed = transactions_processed.drop(['t_dat'], axis=1)
transactions_processed = transactions_processed.drop(['prod_name'], axis=1)
# evaluate
evaluate(transactions_processed, vec_size=vec_size, extra_vec=['rec_purchases'])

In [42]:
import gensim
import string
from nltk import word_tokenize
def evaluate(frame, vec_size=50, extra_vec=None):
    """
    Evaluates frame using LogisticRegression.

    Drops the ID columns, standardises the numeric feature columns, fits a
    logistic regression on a 90/10 train/test split and prints the
    classification report for the test part. Returns nothing.

    frame:     DataFrame containing 'customer_id', 'article_id', 'age',
               'price', 'ordered' and any further feature columns.
    vec_size:  number of embedding columns to standardise. They are looked up
               as 'f_0' ... 'f_{vec_size - 1}'; if 'f_0' is not a column, the
               integer labels 0 ... vec_size - 1 are used instead. Pass 0 when
               the frame has no embedding columns.
    extra_vec: optional list of additional column names to standardise.
    """
    print("Evaluating...")
    frame = frame.copy()
    seed = 42
    frame = frame.drop(['customer_id', 'article_id'], axis=1)
    scaler = preprocessing.StandardScaler()
    # scaler = preprocessing.MinMaxScaler()
    # NOTE: the scaler is fit on the whole frame before the train/test split
    # below, so statistics of the test rows influence the scaled training rows.
    frame[['age', 'price']] = scaler.fit_transform(frame[['age', 'price']])
    if vec_size > 0:
        # Indexing with range(vec_size) would look for integer column labels
        # and raise KeyError on columns that are named 'f_<i>'.
        vec_idx = [f'f_{i}' for i in range(vec_size)]
        if vec_idx[0] not in frame.columns:
            vec_idx = list(range(vec_size))
        frame[vec_idx] = scaler.fit_transform(frame[vec_idx])
    if extra_vec is not None:
        frame[extra_vec] = scaler.fit_transform(frame[extra_vec])
    print(frame)
    X_train, X_test, y_train, y_test = train_test_split(frame.drop('ordered', axis=1), frame['ordered'], test_size=0.10,
                                                        random_state=seed)
    # Will take a few minutes to run, if you're using the whole dataset:
    better = LogisticRegression(random_state=seed, n_jobs=7, verbose=False, max_iter=1000)
    better = better.fit(X_train, y_train)
    better_predictions = better.predict(X_test)
    print(classification_report(y_test, better_predictions))

In [43]:
vec_size1 = 50

print("Starting implementation Word2Vec")
# prepare transactions
# same as for the baseline
transactions_processed = transactions[
    ['customer_id', 'age', 'article_id', 'prod_name', 'sales_channel_id', 'price', 'ordered']].copy()
customer_encoder = preprocessing.LabelEncoder()
article_encoder = preprocessing.LabelEncoder()
transactions_processed['customer_id'] = customer_encoder.fit_transform(transactions_processed['customer_id'])
transactions_processed['article_id'] = article_encoder.fit_transform(transactions_processed['article_id'])
transactions_processed = transactions_processed.fillna(0)
transactions_processed = pd.get_dummies(transactions_processed, columns=['sales_channel_id'])

# setup training set for word2vec
# Each article becomes one "sentence": its product name followed by its
# categorical attributes, each kept whole as a single token.
train_frame = articles[
    ['prod_name', 'product_type_name', 'product_group_name', 'graphical_appearance_name', 'department_name',
     'index_name', 'index_group_name', 'section_name', 'garment_group_name']].drop_duplicates()
# Convert the rows to token lists directly. Joining on ',' and splitting again
# would cut any value that itself contains a comma into pieces, and the full
# product name would then be missing from the vocabulary at lookup time.
data = train_frame.astype(str).values.tolist()

# initialise and train model
model = gensim.models.Word2Vec(min_count=1,
                               vector_size=vec_size1,
                               workers=7,
                               window=3,
                               sg=0)
model.build_vocab(data)
model.train(data, total_examples=model.corpus_count, epochs=30)

# distinct product names that need a vector
p_names = transactions_processed[['prod_name']].drop_duplicates().reset_index(drop=True)
# convert all names into vectors with one lookup (row i belongs to name i);
# growing an array with np.append copies it on every iteration
names = p_names['prod_name'].tolist()
v = np.asarray(model.wv[names], dtype=float) if names else np.empty((0, vec_size1))

# make a dataframe containing name and vector
df = pd.DataFrame(v, columns=[f'f_{i}' for i in range(vec_size1)])
df = pd.concat([p_names, df], axis=1)

# merge dataframe with transactions
transactions_processed = transactions_processed.merge(df, on='prod_name')

# drop name and evaluate
transactions_processed = transactions_processed.drop(['prod_name'], axis=1)
evaluate(transactions_processed, vec_size=vec_size1)

# detailed_desc Doc2Vec

Evaluation may or may not work. Copying the code to a py file works more reliably.
Reducing vector size may work (but give worse performance).

In [44]:
vec_size2 = 50

print("Starting implementation Doc2Vec")

# prepare transactions
# same as for the baseline
transactions_processed = transactions[
    ['customer_id', 'age', 'article_id', 'prod_name', 'sales_channel_id', 'price', 'ordered', 'detail_desc']].copy()
customer_encoder = preprocessing.LabelEncoder()
article_encoder = preprocessing.LabelEncoder()
transactions_processed['customer_id'] = customer_encoder.fit_transform(transactions_processed['customer_id'])
transactions_processed['article_id'] = article_encoder.fit_transform(transactions_processed['article_id'])
transactions_processed = transactions_processed.fillna(0)
transactions_processed = pd.get_dummies(transactions_processed, columns=['sales_channel_id'])

# Translation table that turns every punctuation character into a space.
# Built once and reused for both the training texts and the inference texts.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# make training set: one lower-cased, punctuation-free token list per
# distinct (prod_name, detail_desc) pair
train_frame = articles[['prod_name', 'detail_desc']].drop_duplicates()
data2 = [word_tokenize(str(text).lower().translate(punct_to_space)) for text in train_frame['detail_desc']]

# train model
print("Starting training Doc2Vec")
model2 = gensim.models.Doc2Vec(min_count=1,
                               vector_size=vec_size2,
                               workers=7,
                               window=3)

data2 = [gensim.models.doc2vec.TaggedDocument(d, [i]) for i, d in enumerate(data2)]
model2.build_vocab(data2)
model2.train(data2, total_examples=model2.corpus_count, epochs=30)

# distinct (name, description) pairs that need a vector
p_desc = transactions_processed[['prod_name', 'detail_desc']].drop_duplicates().reset_index(drop=True)

# transform descriptions into vectors
print("Starting vectorization")
desc_vectors = []
for i, a in enumerate(p_desc['detail_desc'].values):
    desc = word_tokenize(str(a).lower().translate(punct_to_space))
    desc_vectors.append(model2.infer_vector(desc))
    if i % 1000 == 0:
        print(f"\rProcessed {i} / {p_desc.shape[0]} ({i/p_desc.shape[0]*100:.2f}%) rows", end="")
print()
# Stack once at the end; np.append inside the loop copies the array each time.
v = np.vstack(desc_vectors).astype(float) if desc_vectors else np.empty((0, vec_size2))

# make dataframe with name + desc + vector
df = pd.DataFrame(v, columns=[f'f_{i}' for i in range(vec_size2)])
df = pd.concat([p_desc, df], axis=1)

# merge dataframe with transactions
# `df` has one row per (prod_name, detail_desc) pair, so both columns are the
# join key. Joining on prod_name alone would duplicate a transaction for every
# distinct description that shares its product name.
transactions_processed = transactions_processed.merge(df, on=['prod_name', 'detail_desc'])

# drop product name and description
transactions_processed = transactions_processed.drop(['prod_name', 'detail_desc'], axis=1)

# evaluate
evaluate(transactions_processed, vec_size2)

# W2V D2V Together

Evaluation may or may not work. Copying the code to a py file works more reliably.
Reducing the vector sizes in the previous models is almost a requirement, since the combined 100-dimensional vectors will not be classified. A length of 25 for each vector seems like a good starting point.
***Requires trained models from previous parts.***

In [45]:
print("Starting implementation combination")

# prepare transactions
# same as for the baseline
transactions_processed = transactions[
    ['customer_id', 'age', 'article_id', 'prod_name', 'sales_channel_id', 'price', 'ordered', 'detail_desc']].copy()
customer_encoder = preprocessing.LabelEncoder()
article_encoder = preprocessing.LabelEncoder()
transactions_processed['customer_id'] = customer_encoder.fit_transform(transactions_processed['customer_id'])
transactions_processed['article_id'] = article_encoder.fit_transform(transactions_processed['article_id'])
transactions_processed = transactions_processed.fillna(0)
transactions_processed = pd.get_dummies(transactions_processed, columns=['sales_channel_id'])

# distinct (name, description) pairs that need a vector
p_desc = transactions_processed[['prod_name', 'detail_desc']].drop_duplicates().reset_index(drop=True)

# Translation table that turns every punctuation character into a space.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# make a vector for each product: Word2Vec name vector followed by the
# inferred Doc2Vec description vector (uses `model` and `model2` trained above)
print("Starting vectorization")
combined_vectors = []
for i, (p_name, desc) in enumerate(p_desc.values):
    desc = word_tokenize(str(desc).lower().translate(punct_to_space))
    desc_vec = model2.infer_vector(desc)
    name_vec = model.wv[p_name]
    combined_vectors.append(np.concatenate((name_vec, desc_vec), axis=None))
    if i % 1000 == 0:
        print(f"\rProcessed {i} / {p_desc.shape[0]} ({i/p_desc.shape[0]*100:.2f}%) rows", end="")
print()
# Stack once at the end; np.append inside the loop copies the array each time.
if combined_vectors:
    v = np.vstack(combined_vectors).astype(float)
else:
    v = np.empty((0, vec_size1 + vec_size2))

# make a dataframe out of the pairs and their vectors
df = pd.DataFrame(v, columns=[f'f_{i}' for i in range(vec_size1 + vec_size2)])
df = pd.concat([p_desc, df], axis=1)

# merge dataframe with transactions
# `df` has one row per (prod_name, detail_desc) pair, so both columns are the
# join key. Joining on prod_name alone would duplicate a transaction for every
# distinct description that shares its product name.
transactions_processed = transactions_processed.merge(df, on=['prod_name', 'detail_desc'])
# drop name and description
transactions_processed = transactions_processed.drop(['prod_name', 'detail_desc'], axis=1)
#evaluate
evaluate(transactions_processed, vec_size1+vec_size2)

# Recent popularity

This cell is somewhat slow to run.

In [46]:
import bisect

# Feature: for every transaction, the number of purchases of the same article
# in the 7 days up to and including the transaction date, not counting the
# transaction itself when it is a purchase.

# prepare transactions
# same as for the baseline
transactions_processed = transactions[
    ['customer_id', 'age', 'article_id', 'sales_channel_id', 'price', 'ordered', 't_dat']].copy()
customer_encoder = preprocessing.LabelEncoder()
article_encoder = preprocessing.LabelEncoder()
transactions_processed['customer_id'] = customer_encoder.fit_transform(transactions_processed['customer_id'])
transactions_processed['article_id'] = article_encoder.fit_transform(transactions_processed['article_id'])
transactions_processed = transactions_processed.fillna(0)
transactions_processed = pd.get_dummies(transactions_processed, columns=['sales_channel_id'])

# process dates
transactions_processed['t_dat'] = pd.to_datetime(transactions_processed['t_dat'], format='%Y-%m-%d')
transactions_processed = transactions_processed.sort_values(by=['t_dat'])

# make dict of purchase dates: article_id -> list of purchase dates.
# The frame is already sorted by t_dat, so every list ends up in ascending
# order, which the bisect lookups below rely on.
purchase_dates = {}
purchases = transactions_processed.loc[transactions_processed['ordered'] == 1, ['article_id', 't_dat']]
n_purchases = purchases.shape[0]
for index, (article_id, date) in enumerate(zip(purchases['article_id'], purchases['t_dat'])):
    purchase_dates.setdefault(article_id, []).append(date)
    if index % 10000 == 0:
        print(f"\rProcessed {index} / {n_purchases} rows", end="")
print()

# make a list of #recent purchases
# Columns are read by name rather than by position, so the result does not
# depend on the column order produced by get_dummies.
window = pd.Timedelta(days=7)
n_rows = transactions_processed.shape[0]
rec_purchase_num = []
rows = zip(transactions_processed['article_id'],
           transactions_processed['t_dat'],
           transactions_processed['ordered'])
for index, (article_id, date, ordered) in enumerate(rows):
    dates = purchase_dates.get(article_id)
    if dates is None:
        rec_purchase_num.append(0)
    else:
        # Count dates d with date - window <= d <= date using binary search
        # on the sorted list instead of scanning every purchase date.
        first = bisect.bisect_left(dates, date - window)
        last = bisect.bisect_right(dates, date)
        # Subtract the row's own `ordered` flag so a purchase does not count itself.
        rec_purchase_num.append(last - first - ordered)
    if index % 10000 == 0:
        print(f"\rProcessed {index} / {n_rows} ({index/n_rows*100:.2f}%) rows", end="")
print()

# add list to transactions
transactions_processed['rec_purchases'] = rec_purchase_num
print(transactions_processed.head(10))

# drop dates
transactions_processed = transactions_processed.drop(['t_dat'], axis=1)
# evaluate
evaluate(transactions_processed, vec_size=0, extra_vec=['rec_purchases'])

Long loop
Processed 1590000 / 1598761 rows
Processed 3190000 / 3197522 (99.76%) rows
         customer_id   age  article_id     price  ordered      t_dat  \
0                 14  22.0        4029  0.016932        1 2018-09-20   
381716         48660  58.0       26263  0.033881        1 2018-09-20   
1342250        67720  20.0         805  0.013542        0 2018-09-20   
1342705        33681  27.0       11446  0.022017        1 2018-09-20   
1342746        33681  27.0       13443  0.025407        1 2018-09-20   
1342771        33681  27.0       11585  0.016932        1 2018-09-20   
1861668        62179  21.0       22937  0.011847        1 2018-09-20   
2718627        52818  26.0       17134  0.016932        0 2018-09-20   
1343424        33779  50.0       21043  0.030492        1 2018-09-20   
1343525        33779  50.0       25058  0.022864        1 2018-09-20   

         sales_channel_id_1  sales_channel_id_2  rec_purchases  
0                         0                   1          

# W2V and Popularity

Evaluation does not always succeed when run inside the notebook; copying the code to a `.py` file and running it there works more reliably.
Reducing the vector size may also help, but gives worse performance.
Requires the trained models from the previous parts.

In [None]:
import bisect

# Feature set: product-name word vectors (from `model.wv`) combined with the
# 7-day recent-purchase count per article.
vec_size = 50

print("Starting implementation combination w2v, pop")

# prepare transactions
transactions_processed = transactions[
    ['customer_id', 'age', 'article_id', 'sales_channel_id', 'price', 'ordered', 't_dat', 'prod_name']].copy()
customer_encoder = preprocessing.LabelEncoder()
article_encoder = preprocessing.LabelEncoder()
transactions_processed['customer_id'] = customer_encoder.fit_transform(transactions_processed['customer_id'])
transactions_processed['article_id'] = article_encoder.fit_transform(transactions_processed['article_id'])
transactions_processed = transactions_processed.fillna(0)
transactions_processed = pd.get_dummies(transactions_processed, columns=['sales_channel_id'])

transactions_processed['t_dat'] = pd.to_datetime(transactions_processed['t_dat'], format='%Y-%m-%d')
transactions_processed = transactions_processed.sort_values(by=['t_dat'])

# w2v part
# generate vectors for all product names
print("Starting vectorization")
p_names = transactions_processed[['prod_name']].drop_duplicates().reset_index(drop=True)
# Preallocate and fill row by row; np.append in a loop copies the whole array
# on every iteration. The assignment raises if a vector's length does not
# match vec_size.
v = np.empty((p_names.shape[0], vec_size))
for i, name in enumerate(p_names['prod_name']):
    v[i] = model.wv[name]
df = pd.DataFrame(v, columns=[f'f_{k}' for k in range(vec_size)])
df = pd.concat([p_names, df], axis=1)

# pop part
# make dict of purchase dates: article_id -> list of purchase dates.
# The frame is already sorted by t_dat, so every list ends up in ascending
# order, which the bisect lookups below rely on.
purchase_dates = {}
purchases = transactions_processed.loc[transactions_processed['ordered'] == 1, ['article_id', 't_dat']]
n_purchases = purchases.shape[0]
for index, (article_id, date) in enumerate(zip(purchases['article_id'], purchases['t_dat'])):
    purchase_dates.setdefault(article_id, []).append(date)
    if index % 10000 == 0:
        print(f"\rProcessed {index} / {n_purchases} rows", end="")
print()

# make a list of #recent purchases
# Columns are read by name rather than by position, so the result does not
# depend on the column order produced by get_dummies.
window = pd.Timedelta(days=7)
n_rows = transactions_processed.shape[0]
rec_purchase_num = []
rows = zip(transactions_processed['article_id'],
           transactions_processed['t_dat'],
           transactions_processed['ordered'])
for index, (article_id, date, ordered) in enumerate(rows):
    dates = purchase_dates.get(article_id)
    if dates is None:
        rec_purchase_num.append(0)
    else:
        # Count dates d with date - window <= d <= date using binary search
        # on the sorted list instead of scanning every purchase date.
        first = bisect.bisect_left(dates, date - window)
        last = bisect.bisect_right(dates, date)
        # Subtract the row's own `ordered` flag so a purchase does not count itself.
        rec_purchase_num.append(last - first - ordered)
    if index % 10000 == 0:
        print(f"\rProcessed {index} / {n_rows} ({index/n_rows*100:.2f}%) rows", end="")

print()

# add recent purchases to frame (must happen before the merge, while the row
# order still matches rec_purchase_num)
transactions_processed['rec_purchases'] = rec_purchase_num
# merge with product name vectors
transactions_processed = transactions_processed.merge(df, on='prod_name')
# remove helper columns
transactions_processed = transactions_processed.drop(['t_dat', 'prod_name'], axis=1)
# evaluate
evaluate(transactions_processed, vec_size=vec_size, extra_vec=['rec_purchases'])