In [1]:
import pandas as pd
import random
import glob

In [12]:
def get_one_customer(data, weekday_tag, customer_no):
    """
    input:
        data: dataframe - containing data of one weekday
        weekday_tag: str - e.g. 'mo','tu','we','th','fr'
        ustomer_no: int - as in the data dataframe
    output dataframe for one customer: 
        timestamp : 1min frequency
        customer_id : str weekday_tag + customer_no
        location : location at correspoinding time 
    """
    # get data of one customer
    one_customer = data.loc[data['customer_no']==customer_no]
    # generate complete time index
    one_timeind = pd.date_range(start=one_customer.index[0], end=one_customer.index[-1], freq='min')
    # initiate dataframe with complete time index
    df_one = pd.DataFrame({'timestamp' : one_timeind})
    # modify customer id by adding weekday information
    df_one['customer_id'] = weekday_tag + str(customer_no)
    # fill in the dataframe
    df_one = df_one.merge(one_customer['location'].reset_index(), on=['timestamp'], how='left').fillna(method='ffill')
    # generate a series with location after the current location
    after = df_one['location'].iloc[1:].reset_index()
    # remove the last row
    df_one_ba = df_one[:-1]
    # add column after containing information of the next step location
    df_one_ba['after'] = after['location']

    return df_one_ba 

In [13]:
df = pd.read_csv('../data/monday.csv', sep=';', parse_dates=True, index_col=[0])
one_customer = get_one_customer(df, 'mo', 42)
one_customer

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_one_ba['after'] = after['location']


Unnamed: 0,timestamp,customer_id,location,after
0,2019-09-02 07:26:00,mo42,dairy,dairy
1,2019-09-02 07:27:00,mo42,dairy,dairy
2,2019-09-02 07:28:00,mo42,dairy,dairy
3,2019-09-02 07:29:00,mo42,dairy,spices
4,2019-09-02 07:30:00,mo42,spices,spices
5,2019-09-02 07:31:00,mo42,spices,fruit
6,2019-09-02 07:32:00,mo42,fruit,dairy
7,2019-09-02 07:33:00,mo42,dairy,dairy
8,2019-09-02 07:34:00,mo42,dairy,dairy
9,2019-09-02 07:35:00,mo42,dairy,dairy


In [None]:
data_all = pd.DataFrame(columns=['timestamp', 'customer_id', 'location', 'after'])
for file in glob.glob('../data/*.csv'):
    weekday_tag = file[8:10]
    print(weekday_tag)
    df = pd.read_csv(file, sep=';', parse_dates=True, index_col=[0])
    for i_customers in df['customer_no'].unique():
        one_customer = get_one_customer(df, weekday_tag, i_customers)
        data_all = data_all.append(one_customer)

In [15]:
data_all.shape

(46458, 4)

In [16]:
data_all.head(3)

Unnamed: 0,timestamp,customer_id,location,after
0,2019-09-03 07:02:00,tu1,fruit,fruit
1,2019-09-03 07:03:00,tu1,fruit,fruit
2,2019-09-03 07:04:00,tu1,fruit,drinks
