Here we'll put the bulk of the work.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def check_data(df):
    df.info()
    print(df.head())

# Data Preprocessing
First let's tidy up the data we were given (See `datawasher.ipynb` for more details)

In [2]:
# The Visits Table
file_path = 'datasets_dirty/visits_log_us.csv'

try:
    open(file_path, 'r')
except FileNotFoundError:
    file_path = '/datasets/visits_log_us.csv'

v_df = pd.read_csv(
    file_path,
    parse_dates=['Start Ts', 'End Ts'],
    dtype=
        {
            'Device': 'category',
            'Source Id': 'category'
        }
)

v_df = v_df.rename(
    columns={
        'Uid': 'uid',
        'Device': 'device',
        'Start Ts': 'start_time',
        'End Ts': 'end_time',
        'Source Id': 'source_id'
    }
)
check_data(v_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359400 entries, 0 to 359399
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   device      359400 non-null  category      
 1   end_time    359400 non-null  datetime64[ns]
 2   source_id   359400 non-null  category      
 3   start_time  359400 non-null  datetime64[ns]
 4   uid         359400 non-null  uint64        
dtypes: category(2), datetime64[ns](2), uint64(1)
memory usage: 8.9 MB
    device            end_time source_id          start_time  \
0    touch 2017-12-20 17:38:00         4 2017-12-20 17:20:00   
1  desktop 2018-02-19 17:21:00         2 2018-02-19 16:53:00   
2    touch 2017-07-01 01:54:00         5 2017-07-01 01:54:00   
3  desktop 2018-05-20 11:23:00         9 2018-05-20 10:59:00   
4  desktop 2017-12-27 14:06:00         3 2017-12-27 14:06:00   

                    uid  
0  16879256277535980062  
1    104060357244891740  
2   745903560

Going from memory usage: 79.3 MB to memory usage: 8.9 MB without any loss of data? Nice.
## The visits table (server logs with data on website visits):
- uid — user's unique identifier
    - Change from 'Uid' to 'uid'
- device — user's device
    - Change from 'Device' to 'device'
    - There's only two different values, so I'll change the type to category
- start_time — session start date and time
    - Change name from 'Start Ts' to 'start_time'
    - Looks like the seconds aren't included in this, I'll convert to datetime
- end_time — session end date and time
    - Change name from 'End Ts' to 'end_time'
    - Change to datetime type also
- source_id — identifier of the ad source the user came from
    - Change name from 'Source Id' to 'source_id'
    - There's only 10 unique values, so I changed this to category type. I'll come back and undo if I need to.

In [3]:
# The Orders Table
file_path = 'datasets_dirty/orders_log_us.csv'

try:
    open(file_path, 'r')
except FileNotFoundError:
    file_path = '/datasets/orders_log_us.csv'

o_df = pd.read_csv(
    file_path,
    parse_dates=['Buy Ts']
)

o_df = o_df.rename(
    columns={
        'Uid': 'uid',
        'Buy Ts': 'purchase_time',
        'Revenue': 'profit'
    }
)
check_data(o_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50415 entries, 0 to 50414
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   purchase_time  50415 non-null  datetime64[ns]
 1   profit         50415 non-null  float64       
 2   uid            50415 non-null  uint64        
dtypes: datetime64[ns](1), float64(1), uint64(1)
memory usage: 1.2 MB
        purchase_time  profit                   uid
0 2017-06-01 00:10:00   17.00  10329302124590727494
1 2017-06-01 00:25:00    0.55  11627257723692907447
2 2017-06-01 00:27:00    0.37  17903680561304213844
3 2017-06-01 00:29:00    0.55  16109239769442553005
4 2017-06-01 07:58:00    0.37  14200605875248379450


Going from memory usage: 4.4 MB to memory usage: 1.2 MB without any loss of data? Nice.
## The orders table (data on orders):
- uid — unique identifier of the user making an order
    - Change from 'Uid' to 'uid'
- purchase_time — order date and time
    - Change from 'Buy Ts' to 'purchase_time'
    - Convert to datetime type
- profit — Yandex.Afisha's revenue from the order
    - Change from 'Revenue' to 'profit'

In [4]:
# The Costs Table
file_path = 'datasets_dirty/costs_us.csv'

try:
    open(file_path, 'r')
except FileNotFoundError:
    file_path = '/datasets/costs_us.csv'

c_df = pd.read_csv(
    file_path,
    parse_dates=['dt'],
    dtype=
        {
            'Device': 'category',
            'source_id': 'category'
        }
)

c_df = c_df.rename(
    columns={
        'dt': 'date'
    }
)
check_data(c_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2542 entries, 0 to 2541
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   source_id  2542 non-null   category      
 1   date       2542 non-null   datetime64[ns]
 2   costs      2542 non-null   float64       
dtypes: category(1), datetime64[ns](1), float64(1)
memory usage: 42.7 KB
  source_id       date  costs
0         1 2017-06-01  75.20
1         1 2017-06-02  62.25
2         1 2017-06-03  36.53
3         1 2017-06-04  55.00
4         1 2017-06-05  57.08


Going from memory usage: 206.2 KB to memory usage: 42.7 KB without any loss of data? Stellar move by me.
## The costs table (data on marketing expenses):
- source_id — ad source identifier
    - There's only 7 unique values. Convert to category type
- dt — date
    - change from 'dt' to 'date'
    - It only has dates, and no times. Convert to datetime type accordingly
- costs — expenses on this ad source on this day
    - This looks fine unchanged

# Report
```
Q: How many people use this every day, week, and month?

A: 
    Monthly average users =     23,228
    Weekly average users =      5,716
    Daily average users =       907 (about 15.88% of the weekly and 3.91% of the monthly users)
```

In [5]:
# Add columns indicating the year, month, week, and day of a users `start_time`
v_df['session_year']  = pd.to_datetime(v_df['start_time'].dt.isocalendar().year)
v_df['session_month'] = pd.to_datetime(v_df['start_time'].dt.to_period('M').dt.to_timestamp()) # Retains year and month
v_df['session_week'] = v_df['start_time'].dt.strftime('%Y-%U')  # Retains year and week
v_df['session_date'] = pd.to_datetime(v_df['start_time'].dt.date)

# v_df.sample(7)

Unnamed: 0,device,end_time,source_id,start_time,uid,session_year,session_month,session_week,session_date
213585,desktop,2017-10-26 21:15:00,4,2017-10-26 21:13:00,2751553619528980168,1970-01-01 00:00:00.000002017,2017-10-01,2017-43,2017-10-26
80067,desktop,2017-11-24 18:23:00,2,2017-11-24 18:08:00,15787964388505675161,1970-01-01 00:00:00.000002017,2017-11-01,2017-47,2017-11-24
293051,desktop,2017-11-22 12:29:00,1,2017-11-22 12:27:00,12482367913018848776,1970-01-01 00:00:00.000002017,2017-11-01,2017-47,2017-11-22
170862,desktop,2018-01-20 12:31:00,4,2018-01-20 12:30:00,553499195425847886,1970-01-01 00:00:00.000002018,2018-01-01,2018-02,2018-01-20
179733,touch,2018-03-27 13:29:00,9,2018-03-27 13:24:00,3820700620527732700,1970-01-01 00:00:00.000002018,2018-03-01,2018-12,2018-03-27
34450,desktop,2018-02-22 15:50:00,4,2018-02-22 15:45:00,505460878143962777,1970-01-01 00:00:00.000002018,2018-02-01,2018-07,2018-02-22
262326,touch,2017-08-17 00:40:00,2,2017-08-17 00:34:00,10264481331471440791,1970-01-01 00:00:00.000002017,2017-08-01,2017-33,2017-08-17


In [6]:
# Calculating average daily, weekly, and monthly users
dau_total = (
    v_df.groupby('session_date')
    .agg({'uid': 'nunique'})
    .mean()
)

wau_total = (
    v_df.groupby(['session_year', 'session_week'])
    .agg({'uid': 'nunique'})
    .mean()
)

mau_total = (
    v_df.groupby(['session_year', 'session_month'])
    .agg({'uid': 'nunique'})
    .mean()
)

# Also, sticky versions
sticky_wau = (dau_total / wau_total) * 100
sticky_mau = (dau_total / mau_total) * 100

print(f'dau = {int(dau_total)} wau = {int(wau_total)} mau = {int(mau_total)}')
print(f'sticky_wau = {float(sticky_wau):.2f}% sticky_mau = {float(sticky_mau):.2f}%')

dau = 907 wau = 5612 mau = 23228
sticky_wau = 16.18% sticky_mau = 3.91%


```
Q: How many sessions are there per day?

A: Average Daily Sessions = 987.36 sessions

Q: What is the length of each session?

A: Average Session length = 10.73mins
```

In [7]:
# Finding session duration in seconds, then dividing by 60 to get mins, then print the average
v_df['session_duration_mins'] = (
    v_df['end_time'] - v_df['start_time']
).dt.seconds / 60

print(f"Average Session length {float(v_df['session_duration_mins'].mean()):.2f}mins")

Average Session length 10.73mins


In [8]:
# Counting number of sessions per day, then printing the average
v_by_day = v_df.groupby('session_date').agg({
    'start_time': 'count'
})
v_by_day = v_by_day.rename(
    columns={
        'start_time': 'sessions'
    }
)
print(f"Average Daily Sessions {float(v_by_day['sessions'].mean()):.2f} sessions")

Average Daily Sessions 987.36 sessions


```
Q: What's the user retention rate?

A: See the Pivot Table below
```

In [9]:
# Calculating first session date to be used for other cohort calculations
first_session_date = v_df.groupby(['uid'])['session_date'].min()
first_session_date.name = 'first_session_date'

v_df = v_df.join(first_session_date, on='uid')

# For daily cohorts, if we want to later

# v_df['cohort_lifetime_days'] = ((
#         v_df['session_date']
#             -
#         v_df['first_session_date']
#     ) / np.timedelta64(1, 'D')
# ).astype(int)

In [None]:
# For weekly cohorts, if we want to later

# first_session_week = v_df.groupby(['uid'])['session_week'].min()
# first_session_week.name = 'first_session_week'

# v_df = v_df.join(first_session_week, on='uid')

# v_df['cohort_lifetime_weeks'] = ((
#         v_df['session_date']
#             -
#         v_df['first_session_date']
#     ) / np.timedelta64(1, 'W')
# ).astype(int)

In [10]:
# Calculating Monthly Cohort Lifetime in Months

first_session_month = v_df.groupby(['uid'])['session_month'].min()
first_session_month.name = 'first_session_month'

try:
    v_df = v_df.join(first_session_month, on='uid')
except:
    print('we already have first_session_month')

v_df['cohort_lifetime_months'] = (
    (
        v_df['session_date'].astype('datetime64[M]')
            -
        v_df['first_session_date'].astype('datetime64[M]')
    ) / np.timedelta64(1, 'M')
).astype(int)

# v_df.info()
# v_df[['start_time', 'session_year', 'session_month', 'session_date', 'first_session_month', 'cohort_lifetime_months']].head(20)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359400 entries, 0 to 359399
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   device                  359400 non-null  category      
 1   end_time                359400 non-null  datetime64[ns]
 2   source_id               359400 non-null  category      
 3   start_time              359400 non-null  datetime64[ns]
 4   uid                     359400 non-null  uint64        
 5   session_year            359400 non-null  datetime64[ns]
 6   session_month           359400 non-null  datetime64[ns]
 7   session_week            359400 non-null  object        
 8   session_date            359400 non-null  datetime64[ns]
 9   session_duration_mins   359400 non-null  float64       
 10  first_session_date      359400 non-null  datetime64[ns]
 11  first_session_month     359400 non-null  datetime64[ns]
 12  cohort_lifetime_months  359400

Unnamed: 0,start_time,session_year,session_month,session_date,first_session_month,cohort_lifetime_months
0,2017-12-20 17:20:00,1970-01-01 00:00:00.000002017,2017-12-01,2017-12-20,2017-12-01,0
1,2018-02-19 16:53:00,1970-01-01 00:00:00.000002018,2018-02-01,2018-02-19,2018-02-01,0
2,2017-07-01 01:54:00,1970-01-01 00:00:00.000002017,2017-07-01,2017-07-01,2017-07-01,0
3,2018-05-20 10:59:00,1970-01-01 00:00:00.000002018,2018-05-01,2018-05-20,2018-03-01,2
4,2017-12-27 14:06:00,1970-01-01 00:00:00.000002017,2017-12-01,2017-12-27,2017-12-01,0
5,2017-09-03 21:35:00,1970-01-01 00:00:00.000002017,2017-09-01,2017-09-03,2017-09-01,0
6,2018-01-30 11:13:00,1970-01-01 00:00:00.000002018,2018-01-01,2018-01-30,2017-06-01,7
7,2017-11-05 15:14:00,1970-01-01 00:00:00.000002017,2017-11-01,2017-11-05,2017-11-01,0
8,2017-07-19 10:41:00,1970-01-01 00:00:00.000002017,2017-07-01,2017-07-19,2017-07-01,0
9,2017-11-08 13:42:00,1970-01-01 00:00:00.000002017,2017-11-01,2017-11-08,2017-11-01,0


In [None]:
# v_df[['uid', 'cohort_lifetime_days', 'cohort_lifetime_weeks', 'cohort_lifetime_months']].sample(10)

In [None]:
# cohorts_weekly = (
#     v_df.groupby(['first_session_week', 'cohort_lifetime_weeks'])
#     .agg({'uid': 'nunique'})
#     .reset_index()
# )

# initial_users_count = cohorts_weekly[cohorts_weekly['cohort_lifetime_weeks'] == 0][ ['first_session_week', 'uid'] ]
# initial_users_count = initial_users_count.rename(
#     columns={'uid': 'weekly_cohort_users'}
# ) 

# cohorts_weekly = cohorts_weekly.merge(initial_users_count, on='first_session_week')

In [11]:
# Grouping by Cohort
cohorts_monthly = (
    v_df.groupby(['first_session_month', 'cohort_lifetime_months'])
    .agg({'uid': 'nunique'})
    .reset_index()
)

initial_users_count = cohorts_monthly[cohorts_monthly['cohort_lifetime_months'] == 0][ ['first_session_month', 'uid'] ]
initial_users_count = initial_users_count.rename(
    columns={'uid': 'monthly_cohort_users'}
) 

cohorts_monthly = cohorts_monthly.merge(initial_users_count, on='first_session_month')

# Add retention rate column
cohorts_monthly['retention'] = cohorts_monthly['uid'] / cohorts_monthly['monthly_cohort_users']

In [17]:
# make and print table
retention_pivot = cohorts_monthly.pivot_table(
    index='first_session_month',
    columns='cohort_lifetime_months',
    values='retention',
    aggfunc='sum'
)

# Format the percents
retention_pivot = retention_pivot.applymap(lambda x: '{:.2%}'.format(x))

# Get rid og the nan% values
retention_pivot = retention_pivot.replace('nan%', '')

retention_pivot


cohort_lifetime_months,0,1,2,3,4,5,6,7,8,9,10,11
first_session_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-06-01,100.00%,5.87%,5.58%,6.71%,6.77%,6.84%,5.99%,5.50%,5.39%,4.68%,4.03%,3.27%
2017-07-01,100.00%,5.13%,5.60%,5.35%,5.68%,4.63%,4.72%,4.38%,3.09%,2.77%,1.74%,
2017-08-01,100.00%,6.15%,6.10%,5.80%,4.42%,4.20%,4.09%,3.07%,2.62%,1.32%,,
2017-09-01,100.00%,6.77%,6.33%,4.14%,3.96%,3.74%,2.86%,2.37%,1.07%,,,
2017-10-01,100.00%,6.27%,4.37%,3.83%,3.44%,2.51%,2.04%,1.05%,,,,
2017-11-01,100.00%,5.08%,4.16%,3.63%,2.72%,2.15%,1.09%,,,,,
2017-12-01,100.00%,4.44%,3.41%,2.35%,2.02%,0.98%,,,,,,
2018-01-01,100.00%,4.33%,3.01%,2.20%,0.91%,,,,,,,
2018-02-01,100.00%,3.47%,2.27%,1.02%,,,,,,,,
2018-03-01,100.00%,3.13%,1.47%,,,,,,,,,


```
Q: When do people start buying?

A:Average Conversion Rate = 16.90 days
```

In [18]:
# Find the earliest purchase date for each uid (aka for each user)
first_purchase_date = o_df.groupby(['uid'])['purchase_time'].min()
first_purchase_date.name = 'first_purchase_date'

user_conversion = (
    first_session_date.to_frame()
    .join(first_purchase_date, on='uid')
    .dropna()
    # .fillna('never')
)
user_conversion.reset_index(inplace=True)

# Add number of days between users first session and first purchase
user_conversion['conversion_days'] = ((
        user_conversion['first_purchase_date']
            -
        pd.to_datetime(user_conversion['first_session_date'])
    ) / np.timedelta64(1, 'D')
).astype(int)

# user_conversion.sample(10)

Unnamed: 0,uid,first_session_date,first_purchase_date,conversion_days
36374,18366944480640554137,2017-12-10,2017-12-25 16:27:00,15
31692,16004931378838924750,2017-06-01,2017-06-29 18:39:00,28
13651,6840944448800081883,2017-10-09,2017-10-11 23:19:00,2
12949,6461972137199349313,2017-12-13,2017-12-13 20:23:00,0
12280,6140503418397349832,2017-08-05,2017-10-06 08:51:00,62
16930,8512581799131126070,2018-04-15,2018-04-16 07:02:00,1
6114,3091456611554355756,2018-03-05,2018-03-21 09:37:00,16
21730,10976909326931315696,2017-11-20,2017-12-07 15:21:00,17
990,489376571683475084,2017-10-10,2017-12-06 10:56:00,57
34640,17486284766854249763,2017-11-24,2017-11-24 18:06:00,0


In [21]:
# user_conversion['conversion_days'].value_counts()
print(f"Average Conversion Rate {float(user_conversion['conversion_days'].mean()):.2f} days")

Average Conversion Rate 16.90 days


```
Q: How many orders do they make during a given period of time?

A: 
```

In [None]:
v_df[[
    'uid', 
    'cohort_lifetime_months', 
    'cohort_lifetime_weeks', 
    'cohort_lifetime_days'
]].sample(10)

In [None]:
o_df = o_df.merge(user_conversion[['uid', 'first_purchase_date']], on='uid', how='left')

# o_df.head()

In [None]:
o_df['user_age'] = ((
        o_df['purchase_time']
            -
        o_df['first_purchase_date']
    ) / np.timedelta64(1, 'D')
)

# o_df.sample(10)

In [None]:
# Group o_df by uid and calculate the total number of purchases per uid
purchase_count = o_df.groupby('uid')['purchase_time'].count()

# Calculate the duration in days, weeks, and months for each uid
duration_days = (o_df.groupby('uid')['purchase_time'].max() - o_df.groupby('uid')['purchase_time'].min()).dt.days
duration_weeks = duration_days / 7
duration_months = duration_days / 30

# Calculate the average number of purchases per day, week, and month
avg_purchases_per_day = np.where(duration_days == 0, 0, purchase_count / duration_days)
avg_purchases_per_week = np.where(duration_weeks == 0, 0, purchase_count / duration_weeks)
avg_purchases_per_month = np.where(duration_months == 0, 0, purchase_count / duration_months)

# Combine the results into a new DataFrame
average_purchases_df = pd.DataFrame({
    'uid': purchase_count.index,
    'avg_purchases_per_day': avg_purchases_per_day,
    'avg_purchases_per_week': avg_purchases_per_week,
    'avg_purchases_per_month': avg_purchases_per_month
})

# Display the average purchases per day, week, and month
# average_purchases_df.sample(20)


In [None]:
# Group o_df by uid and calculate the total profit and number of purchases per uid
purchase_stats = o_df.groupby('uid').agg({'profit': 'sum', 'purchase_time': 'count'})

# Calculate the average purchase size per user
average_purchase_size_per_user = purchase_stats['profit'] / purchase_stats['purchase_time']

# Add the average purchase size per user to o_df
o_df = o_df.merge(average_purchase_size_per_user.rename('average_purchase_size_per_user'), on='uid')

# Display the average purchase size per user
average_purchase_size_per_user_df = o_df[['uid', 'average_purchase_size_per_user']].drop_duplicates()
# average_purchase_size_per_user_df.sample(10)


In [None]:
total_ltv_data = o_df.groupby('uid')['profit'].sum().reset_index()

total_ltv_data = total_ltv_data.sort_values('profit', ascending=False)

total_ltv_data.head(20)


In [None]:
purchase_stats = o_df.groupby('uid').agg({'profit': 'sum', 'purchase_time': 'count'})

purchase_stats.sample(20)

average_purchase_size = purchase_stats['profit'] / purchase_stats['purchase_time']

# print(average_purchase_size)


In [None]:
total_costs = c_df.agg({'costs': 'sum'})
total_costs

In [None]:
sourcely_costs = c_df.groupby('source_id').agg({'costs':'sum'}).reset_index()

# sourcely_costs

In [None]:
daily_costs = c_df.groupby('date').agg({'costs': 'sum'})

# Create the plot using Pandas plot()
daily_costs.plot(figsize=(10, 6))

# Customize the plot
plt.title('Costs by Date')
plt.xlabel('Date')
plt.ylabel('Costs')
plt.xticks(rotation=45)

# Display the plot
plt.show()


In [None]:
sourcely_users = (
    v_df.groupby(['source_id'])
    .agg({'uid': 'nunique'})
    .reset_index()
)

sourcely_users.columns = ['source_id', 'n_users']

# sourcely_users

In [None]:
cac_report = pd.merge(
    sourcely_costs,
    sourcely_users,
    on='source_id'
)
# cac_report

In [None]:
cac_report['cac'] = cac_report['costs'] / cac_report['n_users']
# cac_report

In [None]:
# Plotting the average CAC
plt.figure(figsize=(10, 6))
plt.bar(cac_report['source_id'], cac_report['cac'])
plt.xlabel('Source ID')
plt.ylabel('Average CAC')
plt.title('Average Customer Acquisition Cost by Source ID')
plt.xticks(rotation=45)
plt.show()

In [None]:
avg_ltv_per_user = (
    total_ltv_data.agg({'profit':'mean'})
    .reset_index()
)
avg_ltv_per_user

In [None]:
total_n_users = (
    v_df.agg({'uid': 'nunique'})
    .reset_index()
)
total_n_users

In [None]:
average_cac_per_user = (total_costs[0] / total_n_users[0]).reset_index()
average_cac_per_user

In [None]:
roi = (avg_ltv_per_user[0] / average_cac_per_user[0]).reset_index()

print('roi =', roi[0])

In [None]:
# Graph CAC, LTV, and ROI per device per week

# Calculate CAC per source_id and time period
cac_per_source = c_df.groupby(['source_id', pd.Grouper(key='date', freq='W-SUN')]).agg({'costs': 'sum'}).reset_index()

# Calculate LTV per source_id and time period
# Merge visits and orders tables on 'uid' column
v_and_o_dfs = v_df.merge(o_df, on='uid')

# Group merged dataframe by 'source_id' and week of 'purchase_time', calculate LTV
sourcely_ltv = v_and_o_dfs.groupby(['source_id', pd.Grouper(key='purchase_time', freq='W-SUN')]).agg(
    ltv=('profit', 'sum'),
    unique_users=('uid', 'nunique')
).reset_index()

# Calculate ROI per source_id and time period
roi_per_source = sourcely_ltv.copy()
roi_per_source['roi'] = roi_per_source['ltv'] / cac_per_source['costs']

# Plotting CAC per source_id
plt.figure(figsize=(10, 6))
for source_id, group in cac_per_source.groupby('source_id'):
    plt.plot(group['date'], group['costs'], label=f"Source {source_id}")
plt.xlabel('Week')
plt.ylabel('CAC')
plt.title('CAC per Source over Time (Weekly)')
plt.legend()
plt.show()

# Plotting LTV per source_id
plt.figure(figsize=(10, 6))
for source_id, group in sourcely_ltv.groupby('source_id'):
    plt.plot(group['purchase_time'], group['ltv'], label=f"Source {source_id}")
plt.xlabel('Week')
plt.ylabel('LTV')
plt.title('LTV per Source over Time (Weekly)')
plt.legend()
plt.show()

# Plotting ROI per source_id
plt.figure(figsize=(10, 6))
for source_id, group in roi_per_source.groupby('source_id'):
    plt.plot(group['purchase_time'], group['roi'], label=f"Source {source_id}")
plt.xlabel('Week')
plt.ylabel('ROI')
plt.title('ROI per Source over Time (Weekly)')
plt.legend()
plt.show()


In [None]:
# Graph CAC, LTV, and ROI per device per week

# Extract the date part from the "end_time" column in v_df
v_df['date'] = pd.to_datetime(v_df['end_time'].dt.date)

# Merge v_df and c_df based on the date part
v_and_c_dfs = v_df.merge(c_df, on='date')

# Calculate CAC per device and week
cac_per_device_week = v_and_c_dfs.groupby([v_and_c_dfs['device'], pd.Grouper(key='date', freq='W-MON')])['costs'].sum().reset_index()

# Calculate LTV per device and week

# Calculate LTV per device and week
ltv_per_device_week = v_and_o_dfs.groupby([v_and_o_dfs['device'], pd.Grouper(key='purchase_time', freq='W-SUN')])['profit'].sum().reset_index()

# Calculate ROI per device and week
roi_per_device_week = ltv_per_device_week.copy()
roi_per_device_week['roi'] = roi_per_device_week['profit'] / cac_per_device_week['costs']

# Plotting CAC per device
plt.figure(figsize=(10, 6))
for device, group in cac_per_device_week.groupby('device'):
    plt.plot(group['date'], group['costs'], label=device)
plt.xlabel('Week')
plt.ylabel('CAC')
plt.title('CAC per Device over Time')
plt.legend()
plt.show()

# Plotting LTV per device
plt.figure(figsize=(10, 6))
for device, group in ltv_per_device_week.groupby('device'):
    plt.plot(group['purchase_time'], group['profit'], label=device)
plt.xlabel('Week')
plt.ylabel('LTV')
plt.title('LTV per Device over Time')
plt.legend()
plt.show()

# Plotting ROI per device
plt.figure(figsize=(10, 6))
for device, group in roi_per_device_week.groupby('device'):
    plt.plot(group['purchase_time'], group['roi'], label=device)
plt.xlabel('Week')
plt.ylabel('ROI')
plt.title('ROI per Device over Time')
plt.legend()
plt.show()
