In [39]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [40]:
data = pd.read_csv('user_journey_raw.csv')

In [41]:
data

Unnamed: 0,user_id,session_id,subscription_type,user_journey
0,1516,2980231,Annual,Homepage-Log in-Log in-Log in-Log in-Log in-Lo...
1,1516,2980248,Annual,Other-Sign up-Sign up-Sign up-Sign up-Sign up-...
2,1516,2992252,Annual,Log in-Log in-Log in-Log in-Log in-Log in
3,1516,3070491,Annual,Homepage-Log in-Log in-Log in-Log in-Log in-Lo...
4,1516,3709807,Annual,Log in-Log in-Log in-Log in-Log in-Log in-Log ...
...,...,...,...,...
9930,509095,4487613,Annual,Other-Other-Other-Other-Other-Other-Other-Othe...
9931,509095,4842565,Annual,Other-Other-Other-Other-Other-Other-Other-Othe...
9932,509095,4843103,Annual,Other-Other
9933,509095,4845316,Annual,Other-Other-Other-Other-Other-Other-Other-Othe...


In [42]:
data.nunique()

user_id              1350
session_id           9935
subscription_type       3
user_journey         1841
dtype: int64

In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9935 entries, 0 to 9934
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   user_id            9935 non-null   int64 
 1   session_id         9935 non-null   int64 
 2   subscription_type  9935 non-null   object
 3   user_journey       9935 non-null   object
dtypes: int64(2), object(2)
memory usage: 310.6+ KB


In [44]:
data['subscription_type'].unique()

array(['Annual', 'Monthly', 'Quarterly'], dtype=object)

In [45]:
data.describe()

Unnamed: 0,user_id,session_id
count,9935.0,9935.0
mean,401709.580775,3464916.0
std,106934.401794,1095088.0
min,1516.0,1817.0
25%,335237.0,3087452.0
50%,461458.0,3640994.0
75%,470347.0,4247818.0
max,509096.0,4845427.0


In [46]:
def remove_page_duplicates(user_journey: str) -> str:
  result = []
  journey = user_journey.split('-')
  result.append(journey[0])
  for word in journey[1:]:
    if word != result[-1]:
      result.append(word)

  return '-'.join(result)
    

In [47]:
def group_by(data, count_from = 'last', group_column = 'user_id', target_column='user_journey',sessions = 'All'):
  if count_from == 'last':
    if sessions == 'All':
      df = data
    else:
      df = data.groupby(group_column).tail(sessions)
  elif count_from == 'first':
    if sessions == 'All':
      df = data
    else:
      df = data.groupby(group_column).head(sessions)
  result = df.groupby('user_id')['user_journey'].apply(list).to_dict()

  data['grouped_home_pages'] = data['user_id'].map(result)
  data['grouped_home_pages'] = data['grouped_home_pages'].map(lambda x: '-'.join(x))

  return data




In [48]:
def remove_pages(data, pages, target_column) -> pd.DataFrame:
  def remove_from_line(list_pages, pages_to_remove):
    return '-'.join([page for page in list_pages if page not in pages_to_remove])
  
  data[target_column] =   data[target_column].map(lambda x : remove_from_line(x.split('-'), pages))

  return data

In [49]:
result_data = group_by(data, count_from='first', group_column='user_id', target_column='user_journey', sessions=3)


In [50]:
def page_counts(data, plan=None):
  if plan:
    data = data[data['subscription_type'] == plan]

  return data['user_journey'].str.split('-').explode().value_counts()

In [51]:
page_counts(data, 'Monthly')

Checkout                    7554
Log in                      5811
Sign up                     2683
Courses                     2343
Career tracks               1518
Other                       1124
Homepage                    1088
Career track certificate     881
Pricing                      658
Coupon                       602
Course certificate           389
Resources center             382
Success stories              213
Upcoming courses              50
Blog                          10
Instructors                    8
About us                       4
Name: user_journey, dtype: int64

In [52]:
def page_presence(data, plan=None):
  if plan:
    data = data[data['subscription_type'] == plan]
  unique_pages = data['user_journey'].str.split('-').apply(lambda x: set(x))
  return unique_pages.explode().value_counts()

In [53]:
result_data = data.copy()
result_data['grouped_home_pages'] = result_data['grouped_home_pages'].map(remove_page_duplicates)

In [54]:
def page_destination(data, plan=None):
  if plan:
    data = data[data['subscription_type'] == plan]
  destinations = []
  for journey in data['user_journey']:
    journey = journey.split('-')
    for i in range(len(journey) -1):
      destinations.append((journey[i], journey[i+1]))
  return pd.Series(destinations).value_counts()

In [55]:
def page_sequences(data, plan=None, n=3):
  if plan:
      data = data[data['subscription_type'] == plan]
  
  destinations = []
  
  for journey in data['user_journey']:
      journey = journey.split('-')
      cup = []
      
      for i in range(len(journey) - n + 1):
          s = '-'.join(journey[i:i+n])
          if s not in cup:  # Avoid duplicates within the same journey
              cup.append(s)
      
      destinations.extend(cup)  # Add unique sequences from this journey to the overall list
  
  return pd.Series(destinations).value_counts()  # Count occurrences of each sequence


In [56]:
d = page_destination(data)
d_pricing = d[d.index.map(lambda x: x[0] == 'Pricing')]
d_pricing

(Pricing, Pricing)                     1168
(Pricing, Checkout)                     286
(Pricing, Sign up)                      128
(Pricing, Courses)                       99
(Pricing, Log in)                        80
(Pricing, Career track certificate)      58
(Pricing, Career tracks)                 53
(Pricing, Homepage)                      49
(Pricing, Resources center)              32
(Pricing, Course certificate)            22
(Pricing, Other)                         13
(Pricing, Upcoming courses)              13
(Pricing, Success stories)                4
(Pricing, Blog)                           2
(Pricing, Instructors)                    1
dtype: int64

In [57]:
pricing_followups = data[data['user_journey'] == 'Pricing'].shift(-1)  # Get the next page after "Pricing"
followup_counts = pricing_followups['user_journey'].value_counts()
print(followup_counts)

Series([], Name: user_journey, dtype: int64)


In [58]:
def journey_length(data, plan=None):
  if plan:
      data = data[data['subscription_type'] == plan]
  data['length'] = data['user_journey'].str.split('-').apply(len)

  cal_data = data.groupby('user_id')['length'].mean()
  return cal_data
  

In [59]:
data_s = data.groupby('user_id').tail(3)
journey_length(data_s).mean()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['length'] = data['user_journey'].str.split('-').apply(len)


10.077777777777778

In [60]:
page_presence(data.groupby('user_id').tail(3))

Checkout                    1159
Log in                       829
Coupon                       755
Homepage                     662
Other                        496
Sign up                      387
Pricing                      266
Courses                      228
Career tracks                168
Career track certificate      80
Resources center              66
Course certificate            45
Upcoming courses              32
Success stories               13
Instructors                    9
Blog                           5
About us                       5
Name: user_journey, dtype: int64

In [72]:
page_sequences(data.groupby('user_id').tail(3)).head(25)

Checkout-Checkout-Checkout                                                    822
Coupon-Coupon-Coupon                                                          600
Log in-Log in-Log in                                                          553
Sign up-Sign up-Sign up                                                       287
Other-Other-Other                                                             238
Homepage-Log in-Log in                                                        216
Courses-Courses-Courses                                                       148
Career tracks-Career tracks-Career tracks                                     113
Homepage-Pricing-Pricing                                                      101
Homepage-Sign up-Sign up                                                       97
Sign up-Log in-Log in                                                          73
Sign up-Sign up-Log in                                                         71
Career track cer

In [62]:
# Page count for all users
print("Page Count (All Users):")
print(page_counts(data))

# Page count for monthly users
print("\nPage Count (Monthly Users):")
print(page_counts(data, plan='Monthly'))




Page Count (All Users):
Checkout                    17896
Log in                      17265
Coupon                      11855
Courses                      7149
Sign up                      6824
Other                        6820
Career tracks                4910
Homepage                     3808
Career track certificate     3044
Resources center             2266
Pricing                      2262
Course certificate           1114
Success stories               604
Upcoming courses              188
Instructors                    76
Blog                           36
About us                       33
Name: user_journey, dtype: int64

Page Count (Monthly Users):
Checkout                    7554
Log in                      5811
Sign up                     2683
Courses                     2343
Career tracks               1518
Other                       1124
Homepage                    1088
Career track certificate     881
Pricing                      658
Coupon                       602
Course

In [63]:
# Page presence for all users
print("\nPage Presence (All Users):")
print(page_presence(data))




Page Presence (All Users):
Log in                      3798
Homepage                    2396
Checkout                    2021
Other                       1535
Sign up                     1210
Coupon                      1041
Pricing                      929
Courses                      908
Career tracks                747
Career track certificate     355
Resources center             339
Course certificate           191
Upcoming courses             101
Success stories               49
Instructors                   26
About us                      22
Blog                          15
Name: user_journey, dtype: int64


In [64]:
# Page destination for all users
print("\nPage Destination (All Users):")
print(page_destination(data))




Page Destination (All Users):
(Checkout, Checkout)            15832
(Log in, Log in)                13389
(Coupon, Coupon)                10814
(Courses, Courses)               5962
(Sign up, Sign up)               5521
                                ...  
(Sign up, Checkout)                 1
(Instructors, Log in)               1
(Checkout, Sign up)                 1
(Homepage, Checkout)                1
(Career tracks, Instructors)        1
Length: 192, dtype: int64


In [65]:
# Page sequences for all users with N=2
print("\nPage Sequences (All Users, N=2):")
print(page_sequences(data, n=2))




Page Sequences (All Users, N=2):
Log in-Log in                3590
Checkout-Checkout            1891
Other-Other                  1171
Coupon-Coupon                1040
Sign up-Sign up               934
                             ... 
Instructors-Log in              1
Success stories-Courses         1
Sign up-Upcoming courses        1
Checkout-Success stories        1
Career tracks-Instructors       1
Length: 192, dtype: int64


In [66]:
# Journey length for all users
print("\nJourney Length (All Users):")
print(journey_length(data))


Journey Length (All Users):
user_id
1516       9.076923
3395       3.600000
10107      7.375000
11145      4.909091
12400      8.000000
            ...    
509060    11.000000
509061    20.000000
509085    18.000000
509095    15.500000
509096    16.000000
Name: length, Length: 1350, dtype: float64


In [67]:
first_three_sessions = data.groupby('user_id').head(3)
record_count = len(first_three_sessions)
print(record_count)

3575


In [68]:
page_presence(data,'Quarterly')

Log in                      135
Checkout                    103
Homepage                     91
Sign up                      64
Other                        42
Pricing                      28
Courses                      25
Career tracks                17
Career track certificate     13
Coupon                       13
Course certificate            8
Resources center              7
Success stories               3
Upcoming courses              3
About us                      1
Name: user_journey, dtype: int64