# Addressing Repeat Rent as a Feature  
In this notebook, I attempt feature engineering based on repeat rentals. However, this approach failed: the engineered features were always non-zero for positive samples and always zero for the generated negative examples, so they separate the two classes trivially instead of providing a signal the model can generalise from.

In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Root folder of the project on Drive; later cells build file paths from it.
path='/content/drive/MyDrive/RecSys_206894495'
# Execute the shared evaluation notebook.
# NOTE(review): presumably this is what defines the evaluate_* / retrieve_* helpers
# (and possibly `orders`) used further down — confirm.
%run /content/drive/MyDrive/RecSys_206894495/models/evaluate_models.ipynb

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
empty json


In [None]:
!pip install pyarrow



In [None]:
# JSON files under the project's models folder.
# NOTE(review): presumably read/written by the evaluation helpers — confirm.
file_path_outfit = f"{path}/models/val_outfit.json"
file_path_group = f"{path}/models/val_group.json"

In [None]:
#import
#warning
import warnings
warnings.filterwarnings('ignore')

#general
import pandas as pd
import numpy as np
from itertools import product
import json
from tqdm.notebook import tqdm
from datetime import datetime

#for predictions
from sklearn.ensemble import GradientBoostingClassifier as GradientBoosting
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the per-customer train/val/test split table from the project folder.
user_splits = pd.read_parquet(f"{path}/models/user_splits.parquet")

In [None]:
# Show a single row to inspect the columns of the split table.
user_splits.head(1)

Unnamed: 0,customer_id,train_outfit_ids,val_outfit_ids,test_outfit_ids,train_group,val_group,test_group,train_booking_times_start,val_booking_times_start,test_booking_times_start,train_booking_times_end,val_booking_times_end,test_booking_times_end,join_outfit_ids,join_group
0,0,"[outfit.85f26909d8334ab78f30c2fc9c73faf7, outf...",[outfit.d4b6896b1ae74cdabebfdcf948fe64e2],[outfit.9f5058295098471abdfaf0a7c74ddbfe],"[group.c79c907b6c94a9bd2005e038943ab529, group...",[group.e5d92e41f348d9c05919685917e77de9],[group.f6f0b9ebb3228aab27a79ac658c76682],"[2023-11-22T00:00:00.000000, 2023-11-24T00:00:...",[2023-11-24T00:00:00.000000],[2023-12-06T00:00:00.000000],"[2023-12-21T00:00:00.000000, 2023-12-23T00:00:...",[2023-12-23T00:00:00.000000],[2024-01-05T00:00:00.000000],"[outfit.85f26909d8334ab78f30c2fc9c73faf7, outf...","[group.c79c907b6c94a9bd2005e038943ab529, group..."


In [None]:
def create_gap_df(df, outfit_col, time_col):
    """Build one row per (customer, outfit) holding the gaps between repeated bookings.

    Args:
        df: Frame with a 'customer_id' column plus two list-valued columns.
        outfit_col: Name of the column holding each customer's outfit ids.
        time_col: Name of the column holding the matching booking times
            (paired position by position with ``outfit_col``).

    Returns:
        DataFrame with 'customer_id', 'outfit_id', 'gap_1'..'gap_k' (all gaps
        except the last one) and 'label' (the last gap, or None when the outfit
        was booked only once).
    """
    records = []

    for _, row in df.iterrows():
        customer_id = row['customer_id']

        # Collect every booking time of the same outfit, in first-seen order.
        times_by_outfit = {}
        for outfit, time in zip(row[outfit_col], row[time_col]):
            times_by_outfit.setdefault(outfit, []).append(time)

        for outfit, stamps in times_by_outfit.items():
            ordered = sorted(stamps)
            gaps = [later - earlier for earlier, later in zip(ordered, ordered[1:])]

            record = {'customer_id': customer_id, 'outfit_id': outfit}
            # Every gap but the last is a feature; the last gap is the target.
            record.update({f'gap_{pos}': gap for pos, gap in enumerate(gaps[:-1], start=1)})
            record['label'] = gaps[-1] if gaps else None
            records.append(record)

    return pd.DataFrame(records)

# Create train, val, test DataFrames of booking gaps (based on booking start times).
# NOTE(review): train_df / val_df / test_df are reassigned by the create_new_df cell
# further down without being read, so these gap frames are only used for the
# display below.
train_df = create_gap_df(user_splits, 'train_outfit_ids', 'train_booking_times_start')
val_df = create_gap_df(user_splits, 'val_outfit_ids', 'val_booking_times_start')
test_df = create_gap_df(user_splits, 'test_outfit_ids', 'test_booking_times_start')

# Show the training gaps for a visual sanity check.
print("Train DataFrame:")
display(train_df)



Train DataFrame:


Unnamed: 0,customer_id,outfit_id,label,gap_1,gap_2,gap_3
0,0,outfit.85f26909d8334ab78f30c2fc9c73faf7,NaT,NaT,NaT,NaT
1,0,outfit.b77ec4404eef405aae1833e224314586,NaT,NaT,NaT,NaT
2,3,outfit.d7bff1b799a34575a47ce0f531791c9f,60 days,NaT,NaT,NaT
3,3,outfit.542f3e3ae20d42bea4c81a418f7adb7b,NaT,NaT,NaT,NaT
4,3,outfit.2fac9258e5a64ac9a7658f54bd2056a8,NaT,NaT,NaT,NaT
...,...,...,...,...,...,...
46504,7413,outfit.78778f72774d4269abd8410a9511fdf6,NaT,NaT,NaT,NaT
46505,7413,outfit.cc2a4ea6b82044d2804ee26e593fbc00,NaT,NaT,NaT,NaT
46506,7413,outfit.17996f09f2a84c078a7b0bd718bcddd6,NaT,NaT,NaT,NaT
46507,7413,outfit.db025960e5494378bd81a48377b0a1bc,NaT,NaT,NaT,NaT


In [None]:
# Function to create a new DataFrame for a specific set (train, val, test)
def create_new_df(df, set_type):
    """Count how often each customer rented each outfit within one split.

    Args:
        df: Frame with 'customer_id' and a list-valued '<set_type>_outfit_ids' column.
        set_type: Split prefix, e.g. 'train', 'val' or 'test'.

    Returns:
        DataFrame with columns 'customer.id', 'outfit.id' and 'count', one row
        per distinct (customer, outfit) pair of that split.
    """
    column = f'{set_type}_outfit_ids'
    records = []

    for _, row in df.iterrows():
        # np.unique returns the distinct ids (sorted) together with their multiplicities.
        ids, multiplicities = np.unique(row[column], return_counts=True)
        records.extend(
            {'customer.id': row['customer_id'], 'outfit.id': oid, 'count': n}
            for oid, n in zip(ids, multiplicities)
        )

    return pd.DataFrame(records)

# Build the per-split (customer, outfit) rental-count tables.
train_df, val_df, test_df = (
    create_new_df(user_splits, split) for split in ('train', 'val', 'test')
)

In [None]:
# Function to create a new DataFrame for a specific set (train, val, test)
def create_new_df_group(df, set_type):
    """Count how often each customer rented from each group within one split.

    Args:
        df: Frame with 'customer_id' and a list-valued '<set_type>_group' column.
        set_type: Split prefix, e.g. 'train', 'val' or 'test'.

    Returns:
        DataFrame with columns 'customer.id', 'group' and 'count', one row per
        distinct (customer, group) pair of that split.
    """
    column = f'{set_type}_group'
    records = []

    for _, row in df.iterrows():
        # np.unique returns the distinct groups (sorted) together with their multiplicities.
        groups, multiplicities = np.unique(row[column], return_counts=True)
        records.extend(
            {'customer.id': row['customer_id'], 'group': grp, 'count': n}
            for grp, n in zip(groups, multiplicities)
        )

    return pd.DataFrame(records)

# Build the per-split (customer, group) rental-count tables.
train_df_group, val_df_group, test_df_group = (
    create_new_df_group(user_splits, split) for split in ('train', 'val', 'test')
)

In [None]:
# Distinct (customer, outfit) and (customer, group) pairs that appear in `orders`.
# NOTE(review): `orders` is not defined in any visible cell of this notebook —
# presumably it comes from the %run of evaluate_models.ipynb; confirm.
customers_outfits=orders[['customer.id','outfit.id']].drop_duplicates()
customer_group=orders[['customer.id','group']].drop_duplicates()

In [None]:
def add_order_columns(df, orders):
    """Attach start-to-start rental gaps to each (customer, outfit) pair of ``df``.

    Args:
        df: Frame with 'customer.id' and 'outfit.id' columns (plus any others,
            which are carried through the merge).
        orders: Order history with 'customer.id', 'outfit.id' and a datetime
            'rentalPeriod.start' column.

    Returns:
        DataFrame with one row per pair present in both ``orders`` and ``df``.
        For the i-th order of a pair (i >= 2) the column
        'gap_start_to_start_before_order_{i}' holds the number of days between
        the start of order i-1 and the start of order i. The frame is empty,
        but still has the key columns, when ``orders`` has no rows.
    """
    key_cols = ['customer.id', 'outfit.id']

    # Chronological order inside each pair, so neighbouring rows are consecutive rentals.
    orders = orders.sort_values(by=key_cols + ['rentalPeriod.start'])

    result_list = []
    for (customer, outfit), pair_orders in orders.groupby(key_cols):
        row_data = {'customer.id': customer, 'outfit.id': outfit}

        starts = pair_orders['rentalPeriod.start'].tolist()
        # Order numbering is 1-based, so the first gap belongs to order 2.
        for i, (previous_start, start) in enumerate(zip(starts, starts[1:]), 2):
            row_data[f'gap_start_to_start_before_order_{i}'] = (start - previous_start).days

        result_list.append(row_data)

    result_df = pd.DataFrame(result_list)
    if result_df.empty:
        # Without any orders the frame would have no columns and the merge below
        # would raise a KeyError; keep the key columns so it returns an empty frame.
        result_df = pd.DataFrame(columns=key_cols)

    # Keep only the pairs that are present in the original df.
    return result_df.merge(df, on=key_cols, how='inner')


In [None]:
def add_order_columns_group(df, orders):
    """Attach start-to-start rental gaps to each (customer, group) pair of ``df``.

    Args:
        df: Frame with 'customer.id' and 'group' columns (plus any others,
            which are carried through the merge).
        orders: Order history with 'customer.id', 'group' and a datetime
            'rentalPeriod.start' column.

    Returns:
        DataFrame with one row per pair present in both ``orders`` and ``df``.
        For the i-th order of a pair (i >= 2) the column
        'gap_start_to_start_before_order_{i}' holds the number of days between
        the start of order i-1 and the start of order i. The frame is empty,
        but still has the key columns, when ``orders`` has no rows.
    """
    key_cols = ['customer.id', 'group']

    # Chronological order inside each pair, so neighbouring rows are consecutive rentals.
    orders = orders.sort_values(by=key_cols + ['rentalPeriod.start'])

    result_list = []
    for (customer, group_i), pair_orders in orders.groupby(key_cols):
        row_data = {'customer.id': customer, 'group': group_i}

        starts = pair_orders['rentalPeriod.start'].tolist()
        # Order numbering is 1-based, so the first gap belongs to order 2.
        for i, (previous_start, start) in enumerate(zip(starts, starts[1:]), 2):
            row_data[f'gap_start_to_start_before_order_{i}'] = (start - previous_start).days

        result_list.append(row_data)

    result_df = pd.DataFrame(result_list)
    if result_df.empty:
        # Without any orders the frame would have no columns and the merge below
        # would raise a KeyError; keep the key columns so it returns an empty frame.
        result_df = pd.DataFrame(columns=key_cols)

    # Keep only the pairs that are present in the original df.
    return result_df.merge(df, on=key_cols, how='inner')


In [None]:
# Add the start-to-start gap columns to every split, at outfit level.
# NOTE(review): the same `orders` frame is passed for all three splits, so the gap
# values of a given pair do not depend on the split it appears in — confirm this
# is intended.
train_df = add_order_columns(train_df, orders)
val_df = add_order_columns(val_df, orders)
test_df = add_order_columns(test_df, orders)

# Same enrichment at group level.
train_df_group = add_order_columns_group(train_df_group, orders)
val_df_group = add_order_columns_group(val_df_group, orders)
test_df_group = add_order_columns_group(test_df_group, orders)

In [None]:
def add_order_columns_count_outfit(df, customer_outfit):
    """Extend ``df`` to all known (customer, outfit) pairs and label repeat rentals.

    Args:
        df: Frame with 'customer.id', 'outfit.id' and 'count' columns.
        customer_outfit: Frame listing (customer.id, outfit.id) pairs; pairs that
            are missing from ``df`` become rows with count 0.

    Returns:
        The outer merge of both frames with missing 'count' values set to 0 and
        a 'label' column that is 1 when count >= 2 and 0 otherwise (the same
        rule as the group-level variant).
    """
    # Outer merge so that every pair from either frame is kept.
    merged_df = pd.merge(customer_outfit, df, on=['customer.id', 'outfit.id'], how='outer')

    # Assign the result back: fillna(inplace=True) on a selected column does not
    # update the frame under pandas Copy-on-Write.
    merged_df['count'] = merged_df['count'].fillna(0)

    # Computed on merged_df itself so that the label lines up row by row.
    merged_df['label'] = (merged_df['count'] >= 2).astype(int)

    return merged_df

def add_order_columns_count_group(df, customer_group):
    """Extend ``df`` to all known (customer, group) pairs and label repeat rentals.

    Args:
        df: Frame with 'customer.id', 'group' and 'count' columns.
        customer_group: Frame listing (customer.id, group) pairs; pairs that are
            missing from ``df`` become rows with count 0.

    Returns:
        The outer merge of both frames with missing 'count' values set to 0 and
        a 'label' column that is 1 when count >= 2 and 0 otherwise.
    """
    # Outer merge so that every pair from either frame is kept.
    merged_df = pd.merge(customer_group, df, on=['customer.id', 'group'], how='outer')

    # Assign the result back: fillna(inplace=True) on a selected column does not
    # update the frame under pandas Copy-on-Write.
    merged_df['count'] = merged_df['count'].fillna(0)

    # Derive the label from merged_df, not from df: df's index does not match the
    # merged rows, so labels taken from df would land on the wrong rows or be NaN.
    merged_df['label'] = (merged_df['count'] >= 2).astype(int)

    return merged_df

In [None]:
# Expand every outfit-level split to all (customer, outfit) pairs seen in `orders`;
# pairs that are absent from a split get count 0.
train_df = add_order_columns_count_outfit(train_df, customers_outfits)
val_df = add_order_columns_count_outfit(val_df, customers_outfits)
test_df = add_order_columns_count_outfit(test_df, customers_outfits)

# Same expansion at group level; this variant also adds the `label` column.
train_df_group = add_order_columns_count_group(train_df_group, customer_group)
val_df_group = add_order_columns_count_group(val_df_group, customer_group)
test_df_group = add_order_columns_count_group(test_df_group, customer_group)

In [None]:
# Pool train and validation rows so the final model can be fit on both.
combined_df = pd.concat([train_df, val_df])

# Group by customer.id and outfit.id and sum the counts across the two splits.
join_df = combined_df.groupby(['customer.id', 'outfit.id'], as_index=False)['count'].sum()
# The aggregation keeps only the keys and `count`, so rebuild the target that the
# modelling cells read from join_df (repeat rental = count of at least 2).
# NOTE(review): the gap feature columns are not carried into join_df either, so
# its feature set differs from test_df's — confirm how the final fit should handle this.
join_df['label'] = (join_df['count'] >= 2).astype(int)

# Same pooling at group level.
combined_df_group = pd.concat([train_df_group, val_df_group])

# Group by customer.id and group and sum the counts across the two splits.
join_df_group = combined_df_group.groupby(['customer.id', 'group'], as_index=False)['count'].sum()
join_df_group['label'] = (join_df_group['count'] >= 2).astype(int)

In [None]:
# Fit the encoder on every (customer, outfit) pair so all ids are known categories.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(customers_outfits)


def _one_hot_outfit_pairs(frame):
    """Replace the 'customer.id' / 'outfit.id' columns of ``frame`` by their one-hot encoding.

    The encoded columns are named with ``enc.get_feature_names_out()`` so that
    every column name is a string; scikit-learn estimators reject frames whose
    column names mix integers and strings. The encoded block reuses ``frame``'s
    index so the column-wise concat lines up row by row.
    """
    key_cols = ['customer.id', 'outfit.id']
    encoded = pd.DataFrame(
        enc.transform(frame[key_cols]).toarray(),
        columns=enc.get_feature_names_out(),
        index=frame.index,
    )
    return pd.concat([encoded, frame.drop(columns=key_cols)], axis=1)


train_df = _one_hot_outfit_pairs(train_df)
val_df = _one_hot_outfit_pairs(val_df)
test_df = _one_hot_outfit_pairs(test_df)
join_df = _one_hot_outfit_pairs(join_df)

In [None]:
# Fit the encoder on every (customer, group) pair so all ids are known categories.
enc=OneHotEncoder(handle_unknown='ignore')
enc.fit(customer_group)


def _one_hot_group_pairs(frame):
    """Replace the 'customer.id' / 'group' columns of ``frame`` by their one-hot encoding.

    The key column is 'customer.id' (the name used when the group frames were
    built and when the encoder was fitted); selecting 'customer_id' raises a
    KeyError. The encoded columns are named with ``enc.get_feature_names_out()``
    so that every column name is a string, and the encoded block reuses
    ``frame``'s index so the column-wise concat lines up row by row.
    """
    key_cols = ['customer.id', 'group']
    encoded = pd.DataFrame(
        enc.transform(frame[key_cols]).toarray(),
        columns=enc.get_feature_names_out(),
        index=frame.index,
    )
    return pd.concat([encoded, frame.drop(columns=key_cols)], axis=1)


train_df_group = _one_hot_group_pairs(train_df_group)
val_df_group = _one_hot_group_pairs(val_df_group)
test_df_group = _one_hot_group_pairs(test_df_group)
join_df_group = _one_hot_group_pairs(join_df_group)

## Gradient Boosting

In [None]:
# Outfit level: the target is `label`; the features are every column except
# `count` and `label`.
y_train = train_df['label']
x_train = train_df.drop(columns=['count', 'label'])

y_val = val_df['label']
x_val = val_df.drop(columns=['count', 'label'])

y_test = test_df['label']
x_test = test_df.drop(columns=['count', 'label'])

y_join = join_df['label']
x_join = join_df.drop(columns=['count', 'label'])

In [None]:
# Group level: the target is `label`; the features are every column except
# `count` and `label`.
y_train_group = train_df_group['label']
x_train_group = train_df_group.drop(columns=['count', 'label'])

y_val_group = val_df_group['label']
x_val_group = val_df_group.drop(columns=['count', 'label'])

y_test_group = test_df_group['label']
x_test_group = test_df_group.drop(columns=['count', 'label'])

y_join_group = join_df_group['label']
x_join_group = join_df_group.drop(columns=['count', 'label'])


In [None]:
def train_gb(x,y,n_estimators,learning_rate,max_depth):
  """Fit a GradientBoosting classifier on (x, y) with the given hyper-parameters.

  Returns the fitted estimator.
  """
  hyper_params=dict(n_estimators=n_estimators,learning_rate=learning_rate,max_depth=max_depth)
  # fit() returns the estimator itself, so the fitted model can be returned directly.
  return GradientBoosting(**hyper_params).fit(x,y)
def test_gb(algo,x_test,train,group=False):
  """Predict with a fitted model and attach per-customer prediction lists to ``train``.

  Args:
    algo: Fitted estimator exposing ``predict``.
    x_test: Feature frame to predict on; it is left unmodified.
    train: User-splits frame with a 'customer_id' column; all of its other
      columns are kept (first value per customer).
    group: When True the prediction column is named 'group_prediction',
      otherwise 'id_prediction'.

  Returns:
    Tuple ``(predictions, algo, df_merged)``.
  """
  # Keep the predictions in a local variable and do not write them into x_test:
  # an extra 'count_predicted' column would change the feature set seen by the
  # next predict() call on the same frame (e.g. in the tuning loop).
  predictions=algo.predict(x_test)

  # NOTE(review): `results` is not defined in any visible cell of this notebook; it
  # has to provide 'customer.id' and 'iid' columns for this merge — confirm where
  # it comes from.
  df_merged = train.merge(results, left_on='customer_id',right_on='customer.id', how='left').drop(columns='customer.id').groupby('customer_id').agg({
      **{col: 'first' for col in train.columns if col != 'customer_id'},
      'iid': lambda x: list(x)
  }).reset_index()

  # Name the aggregated list column according to the prediction level.
  prediction_col='group_prediction' if group else 'id_prediction'
  df_merged=df_merged.rename(columns={'iid': prediction_col})
  return predictions,algo,df_merged

def train_and_test_gb(x_train,y_train,x_test,user_splits_df,n_estimators,learning_rate,max_depth,group=False):
  """Fit a gradient-boosting model and evaluate it with ``test_gb``.

  Args:
    x_train, y_train: Training features and target.
    x_test: Features to predict on.
    user_splits_df: User-splits frame handed to ``test_gb``.
    n_estimators, learning_rate, max_depth: Model hyper-parameters.
    group: Forwarded to ``test_gb`` to choose between 'group_prediction' and
      'id_prediction' as the name of the prediction column.

  Returns:
    Tuple ``(predictions, algo, df_merged)`` as returned by ``test_gb``.
  """
  algo=train_gb(x_train,y_train,n_estimators,learning_rate,max_depth)
  # Forward `group`; otherwise group-level runs are labelled as outfit predictions.
  return test_gb(algo,x_test,user_splits_df,group=group)

In [None]:
# Candidate values per hyper-parameter; the tuning cells try every combination.
n_estimators_list = [100, 150, 200]
learning_rate_list = [0.1, 0.05, 0.2]
max_depth_list = [3, 4, 5]

In [None]:
##Hyper-Parameters tuning for outfit

# Every (n_estimators, learning_rate, max_depth) combination, in grid order.
test_permutations = [
    (n_est, rate, depth)
    for n_est in n_estimators_list
    for rate in learning_rate_list
    for depth in max_depth_list
]

for n_estimators, learning_rate, max_depth in tqdm(test_permutations):
    # Start each run from a fresh copy of the split table.
    user_splits_df = user_splits.copy()
    predictions, algo, user_splits_df = train_and_test_gb(
        x_train, y_train, x_val, user_splits_df, n_estimators, learning_rate, max_depth
    )
    param_dict = {'n_estimators': n_estimators, 'learning_rate': learning_rate, 'max_depth': max_depth}
    user_splits_df, all_dict = evaluate_val_metrics_at_n_outfit(
        user_splits_df, 'Gradient_Boosting_repeat', n=10, model_params=param_dict
    )

In [None]:
##Hyper-Parameters tuning for group

# Every (n_estimators, learning_rate, max_depth) combination, in grid order.
test_permutations = [
    (n_est, rate, depth)
    for n_est in n_estimators_list
    for rate in learning_rate_list
    for depth in max_depth_list
]

for n_estimators, learning_rate, max_depth in tqdm(test_permutations):
    # Start each run from a fresh copy of the split table.
    user_splits_df = user_splits.copy()
    predictions, algo, user_splits_df = train_and_test_gb(
        x_train_group, y_train_group, x_val_group, user_splits_df,
        n_estimators, learning_rate, max_depth, group=True
    )
    param_dict = {'n_estimators': n_estimators, 'learning_rate': learning_rate, 'max_depth': max_depth}
    user_splits_df, all_dict = evaluate_val_metrics_at_n_group(
        user_splits_df, 'Gradient_Boosting_repeat', n=10, model_params=param_dict
    )

In [None]:
# Retrieve the best model parameters for the 'Gradient_Boosting_repeat' method.
# NOTE(review): the retrieve_* helpers are not defined in the visible cells —
# presumably they come from the %run of evaluate_models.ipynb and read the JSON
# result files; confirm.
best_model_params_outfit = retrieve_best_model_params_from_file_outfit('Gradient_Boosting_repeat', n=10)
best_model_params_group = retrieve_best_model_params_from_file_group('Gradient_Boosting_repeat', n=10)

# Work on a fresh copy of the split table and unpack the tuned hyper-parameters
# for the outfit-level and the group-level model.
user_splits_df=user_splits.copy()
n_estimators_outfit=best_model_params_outfit['n_estimators']
learning_rate_outfit=best_model_params_outfit['learning_rate']
max_depth_outfit=best_model_params_outfit['max_depth']
n_estimators_group=best_model_params_group['n_estimators']
learning_rate_group=best_model_params_group['learning_rate']
max_depth_group=best_model_params_group['max_depth']

# Train on the pooled train+val data with the best parameters and predict on test.
# The frame returned by the outfit-level run is passed into the group-level run.
# `predictions` and `algo` are overwritten by the second call.
predictions,algo,user_splits_df = train_and_test_gb(x_join,y_join,x_test,user_splits_df,
                                                    n_estimators_outfit, learning_rate_outfit, max_depth_outfit)
predictions,algo,user_splits_df = train_and_test_gb(x_join_group,y_join_group,x_test_group,user_splits_df,
                                                    n_estimators_group, learning_rate_group, max_depth_group,group=True)
# Evaluate the model with n=10.
user_splits_df, all_dict = evaluate_df_metrics_at_n(user_splits_df,'Gradient_Boosting_repeat', n=10)