# Purpose of this file:
Author: Phuong Huong Nguyen | 15.5.2022
1. Creating a fearture named **average_cycle(weeks)**, which tell us the average time (according to week) that one user bought a certain item in the preriod of time
- The data will be taken from **new_orders_aggregation.csv**
- Explaining the method to calculate as below:
    * For example, we know that one user A bought an item B in weeks_list: [23, 41, 50]:
    * --> **Average cycle = ((50 - 41) + (41 - 23)) / (len(weeks_list) - 1)**
    * --> To be clearer, we calculate the average distance between elements in weeks_list
    * weeks_list will be aggregated from the **new_orders_aggregation.csv**

2. Creating features **first_week_bought** and **last_week_bought**. These feature let us know when is the first time and last time one user bought a certain item

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import math

In [2]:
orders_df = pd.read_csv('new_orders_aggregation.csv', sep='|', index_col = 0)
orders_df

  mask |= (ar1 == a)


Unnamed: 0,userID,itemID,date,order
0,0,1505,2020-09-01,1
1,0,6446,2020-12-11,1
2,0,6446,2021-01-15,1
3,0,9325,2020-11-20,1
4,0,12468,2020-08-03,1
...,...,...,...,...
1071015,46137,22403,2021-01-18,1
1071016,46137,22583,2021-01-31,1
1071017,46137,28343,2020-08-08,1
1071018,46137,28900,2020-08-08,2


In [3]:
# convert column date in df from str type to datetime type
orders_df['date'] = pd.to_datetime(orders_df['date'], infer_datetime_format = True, cache = True)
orders_df['month'] = orders_df['date'].dt.month
orders_df

Unnamed: 0,userID,itemID,date,order,month
0,0,1505,2020-09-01,1,9
1,0,6446,2020-12-11,1,12
2,0,6446,2021-01-15,1,1
3,0,9325,2020-11-20,1,11
4,0,12468,2020-08-03,1,8
...,...,...,...,...,...
1071015,46137,22403,2021-01-18,1,1
1071016,46137,22583,2021-01-31,1,1
1071017,46137,28343,2020-08-08,1,8
1071018,46137,28900,2020-08-08,2,8


# Calculating weeks from 1.June 2020 to 31.1 2021 in orders_df
- timeDeltaDays include the days which are counted from 1 to... , in which 1 equals to the day 01.06.2020 in column date
- timeDeltaWeeks is calculated according to timeDeltaDays. 1 week in timeDeltaWeeks includes 7 days (from 1 to 7 in timeDeltaDays)

In [4]:
date = datetime.strptime("01.06.2020", '%d.%m.%Y')
orders_df['timeDeltaDays'] = orders_df['date'].apply(lambda x: (x-date).days + 1)
orders_df['timeDeltaWeeks'] = orders_df['timeDeltaDays'].apply(lambda x: int(math.ceil(x / 7)))
orders_df

Unnamed: 0,userID,itemID,date,order,month,timeDeltaDays,timeDeltaWeeks
0,0,1505,2020-09-01,1,9,93,14
1,0,6446,2020-12-11,1,12,194,28
2,0,6446,2021-01-15,1,1,229,33
3,0,9325,2020-11-20,1,11,173,25
4,0,12468,2020-08-03,1,8,64,10
...,...,...,...,...,...,...,...
1071015,46137,22403,2021-01-18,1,1,232,34
1071016,46137,22583,2021-01-31,1,1,245,35
1071017,46137,28343,2020-08-08,1,8,69,10
1071018,46137,28900,2020-08-08,2,8,69,10


## Removing all duplicated rows which have the same (userID, itemID, timeDeltaWeeks) except for the first occurrence

In [5]:
df = orders_df.copy().drop(columns=['order'])
df.drop_duplicates(subset=['userID', 'itemID', 'timeDeltaWeeks'], keep='first', inplace = True)
df

Unnamed: 0,userID,itemID,date,month,timeDeltaDays,timeDeltaWeeks
0,0,1505,2020-09-01,9,93,14
1,0,6446,2020-12-11,12,194,28
2,0,6446,2021-01-15,1,229,33
3,0,9325,2020-11-20,11,173,25
4,0,12468,2020-08-03,8,64,10
...,...,...,...,...,...,...
1071015,46137,22403,2021-01-18,1,232,34
1071016,46137,22583,2021-01-31,1,245,35
1071017,46137,28343,2020-08-08,8,69,10
1071018,46137,28900,2020-08-08,8,69,10


## Copying all data before 4.1.2021 to a new dataframe and calculating average cycle based on the new dataframe

In [6]:
df_copy = df[df['timeDeltaDays'] < 218]
df_copy

Unnamed: 0,userID,itemID,date,month,timeDeltaDays,timeDeltaWeeks
0,0,1505,2020-09-01,9,93,14
1,0,6446,2020-12-11,12,194,28
3,0,9325,2020-11-20,11,173,25
4,0,12468,2020-08-03,8,64,10
5,0,12505,2020-08-18,8,79,12
...,...,...,...,...,...,...
1071011,46137,2667,2020-09-17,9,109,16
1071014,46137,20209,2020-08-08,8,69,10
1071017,46137,28343,2020-08-08,8,69,10
1071018,46137,28900,2020-08-08,8,69,10


## Creating an dataframe to manage purchasing history per usersID
- **user_history** is the dataframe which manage all purchase history of each userID in the period of time from June 2020 to before 4.1.2021. Therefore, from this dataframe we can search for one userID to know all the itemID they bought and the time of transaction

In [11]:
# getting all unique userIDs in orders_df
old_userIDs = orders_df['userID'].unique()

# Create an empty DataFrame of user_history 
user_history = pd.DataFrame({'userID':[],
                   'itemID': [],
                    'date': [],
                    'month': [],
                   'timeDeltaDays': [],
                   'timeDeltaWeeks': []}, dtype = 'i4')
# loop through each userID to get all of their historical purchases and add to the user_history
for user in old_userIDs:
    order = df_copy[df_copy['userID'] == user]
    user_history = pd.concat([user_history, order], ignore_index = True)
    
user_history

Unnamed: 0,userID,itemID,date,month,timeDeltaDays,timeDeltaWeeks
0,0,1505,2020-09-01 00:00:00,9,93,14
1,0,6446,2020-12-11 00:00:00,12,194,28
2,0,9325,2020-11-20 00:00:00,11,173,25
3,0,12468,2020-08-03 00:00:00,8,64,10
4,0,12505,2020-08-18 00:00:00,8,79,12
...,...,...,...,...,...,...
918285,46137,2667,2020-09-17 00:00:00,9,109,16
918286,46137,20209,2020-08-08 00:00:00,8,69,10
918287,46137,28343,2020-08-08 00:00:00,8,69,10
918288,46137,28900,2020-08-08 00:00:00,8,69,10


In [12]:
# extract dataframe purchase_history_per_user to .csv file
#user_history.to_csv('user_history.csv')

In [13]:
# import user_history.csv
#user_history = pd.read_csv('user_history.csv', index_col = 0)

## Using dataframe df_copy  to update all weeks of year, in which one user bought a certain item

In [14]:
# list_weeks is used to update all list of weeks, in which one user bought a certain item. 
list_weeks = []

for i in range(len(df_copy['userID'])):
    #user = df_copy['userID'][i]
    #item = df_copy['itemID'][i]
    user = df_copy.iloc[i, 0]
    item = df_copy.iloc[i, 1]
    # get a sub-dataframe which match with user and item
    data = user_history[(user_history['userID'] == user) & (user_history['itemID'] == item)]
    weeks = data['timeDeltaWeeks'].to_list()
    list_weeks.append(weeks)
    

# adding list_weeks to the dataframe df_copy
df_copy['weeks_bought_item'] = list_weeks
df_copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy['weeks_bought_item'] = list_weeks


Unnamed: 0,userID,itemID,date,month,timeDeltaDays,timeDeltaWeeks,weeks_bought_item
0,0,1505,2020-09-01,9,93,14,[14]
1,0,6446,2020-12-11,12,194,28,[28]
3,0,9325,2020-11-20,11,173,25,[25]
4,0,12468,2020-08-03,8,64,10,[10]
5,0,12505,2020-08-18,8,79,12,[12]
...,...,...,...,...,...,...,...
1071011,46137,2667,2020-09-17,9,109,16,[16]
1071014,46137,20209,2020-08-08,8,69,10,[10]
1071017,46137,28343,2020-08-08,8,69,10,[10]
1071018,46137,28900,2020-08-08,8,69,10,[10]


In [15]:
# extract dataframe df into a .csv file
#df_copy.to_csv('2_weeks_bought_item.csv')

## Calculate the average cycle of purchasing of each user for each certain item ( That means how long did one user buy a certain item again). The calculation is according to week

### 1. writing a function which calculate the average distance between elements in a list 
- For example: give list = [23, 41, 50]: -->average = ((50 - 41) + (41 - 23)) / (len(list) - 1)

In [16]:
def calculate_distance(list_distance):
    avg_cicle = 0
    # if list_distance has only one element (that means user bought that item only one time), so avg_cicle should be 0
    if len(list_distance) <= 1:
        return 0
    else:
        for i in range(len(list_distance) - 1, 0, -1): # i goes down to 1
            j = i - 1
            avg_cicle = avg_cicle + list_distance[i] - list_distance[j]
        return round(avg_cicle / (len(list_distance) - 1), 2)  # result is rounded to 2 decimals


#### 2. Applying function calculate_distance to all elements of list_weeks and save it into a list of avg_cicle_list

In [17]:
avg_cicle_list = []
for weeks in list_weeks:
    avg_cicle = calculate_distance(weeks)
    avg_cicle_list.append(avg_cicle)


#### 3. Adding avg_cicle_list to the dataframe df as a new column

In [18]:
df_copy['average_cycle(weeks)'] = avg_cicle_list
df_copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy['average_cycle(weeks)'] = avg_cicle_list


Unnamed: 0,userID,itemID,date,month,timeDeltaDays,timeDeltaWeeks,weeks_bought_item,average_cycle(weeks)
0,0,1505,2020-09-01,9,93,14,[14],0.0
1,0,6446,2020-12-11,12,194,28,[28],0.0
3,0,9325,2020-11-20,11,173,25,[25],0.0
4,0,12468,2020-08-03,8,64,10,[10],0.0
5,0,12505,2020-08-18,8,79,12,[12],0.0
...,...,...,...,...,...,...,...,...
1071011,46137,2667,2020-09-17,9,109,16,[16],0.0
1071014,46137,20209,2020-08-08,8,69,10,[10],0.0
1071017,46137,28343,2020-08-08,8,69,10,[10],0.0
1071018,46137,28900,2020-08-08,8,69,10,[10],0.0


In [20]:
# extract data df into average_cycle_of_purchasing.csv
#df_copy.to_csv('2_average_cycle_of_purchasing.csv')

## Getting the first week and the last week that the user bought a certain item based on 'timeDeltaWeeks'

In [21]:
first_week_bought = []
last_week_bought = []
for li in list_weeks:
    first_week_bought.append(li[0])
    last_week_bought.append(li[-1])
    
# adding columns to df_copy
data_copy = df_copy.copy()
data_copy['first_week_bought'] = first_week_bought
data_copy['last_week_bought'] = last_week_bought

In [22]:
data_copy

Unnamed: 0,userID,itemID,date,month,timeDeltaDays,timeDeltaWeeks,weeks_bought_item,average_cycle(weeks),first_week_bought,last_week_bought
0,0,1505,2020-09-01,9,93,14,[14],0.0,14,14
1,0,6446,2020-12-11,12,194,28,[28],0.0,28,28
3,0,9325,2020-11-20,11,173,25,[25],0.0,25,25
4,0,12468,2020-08-03,8,64,10,[10],0.0,10,10
5,0,12505,2020-08-18,8,79,12,[12],0.0,12,12
...,...,...,...,...,...,...,...,...,...,...
1071011,46137,2667,2020-09-17,9,109,16,[16],0.0,16,16
1071014,46137,20209,2020-08-08,8,69,10,[10],0.0,10,10
1071017,46137,28343,2020-08-08,8,69,10,[10],0.0,10,10
1071018,46137,28900,2020-08-08,8,69,10,[10],0.0,10,10


In [23]:
# extract data df_copy into average_cycle_first_last_weeks.csv
data_copy.to_csv('2_average_cycle_first_last_weeks.csv')

In [24]:
data = data_copy.drop(columns=['date', 'month', 'timeDeltaDays', 'timeDeltaWeeks', 'weeks_bought_item'])
data

Unnamed: 0,userID,itemID,average_cycle(weeks),first_week_bought,last_week_bought
0,0,1505,0.0,14,14
1,0,6446,0.0,28,28
3,0,9325,0.0,25,25
4,0,12468,0.0,10,10
5,0,12505,0.0,12,12
...,...,...,...,...,...
1071011,46137,2667,0.0,16,16
1071014,46137,20209,0.0,10,10
1071017,46137,28343,0.0,10,10
1071018,46137,28900,0.0,10,10


## Removing all duplicated data (userID, itemID) except for the first one, so that we can merge correctly with train_test_set

In [26]:
features_without_duplicate = data.drop_duplicates(['userID', 'itemID'], keep='first')
features_without_duplicate

Unnamed: 0,userID,itemID,average_cycle(weeks),first_week_bought,last_week_bought
0,0,1505,0.0,14,14
1,0,6446,0.0,28,28
3,0,9325,0.0,25,25
4,0,12468,0.0,10,10
5,0,12505,0.0,12,12
...,...,...,...,...,...
1071011,46137,2667,0.0,16,16
1071014,46137,20209,0.0,10,10
1071017,46137,28343,0.0,10,10
1071018,46137,28900,0.0,10,10


In [27]:
features_without_duplicate.to_csv('IU_FEAT_AverageCycle.csv')