In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import yaml

%matplotlib inline

In [2]:
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except:
    print("Configuration file not found!")

In [3]:
# Load Datasets
df_clients = pd.read_csv('../data/raw_data_txt/df_final_demo.txt', sep=',')
df_web_1 = pd.read_csv('../data/raw_data_txt/df_final_web_data_pt_1.txt', sep=',')
df_web_2 = pd.read_csv('../data/raw_data_txt/df_final_web_data_pt_2.txt', sep=',')
df_groups = pd.read_csv('../data/raw_data_txt/df_final_experiment_clients.txt', sep=',')
# Merge Logs
df_logs = pd.concat([df_web_1, df_web_2], ignore_index=True)

In [4]:
# Load the DataFrame from a pickle file
df_full = pd.read_pickle(config['data']['clean']['df_full'])
df_full

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07,Test,5.0,60.0,79.0,U,2.0,189023.86,1.0,4.0
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51,Test,5.0,60.0,79.0,U,2.0,189023.86,1.0,4.0
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22,Test,5.0,60.0,79.0,U,2.0,189023.86,1.0,4.0
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13,Test,5.0,60.0,79.0,U,2.0,189023.86,1.0,4.0
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04,Test,5.0,60.0,79.0,U,2.0,189023.86,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
747568,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:46:10,,,,,,,,,
747569,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:45:29,,,,,,,,,
747570,9668240,388766751_9038881013,922267647_3096648104_968866,step_1,2017-05-24 18:44:51,,,,,,,,,
747571,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:44:34,,,,,,,,,


# Frequency of age range

In [5]:
#frequency of age that uses web the most
frequency_age = df_full['clnt_age'].value_counts()
frequency_age 

clnt_age
58.5    6432
57.5    6115
52.5    6061
59.5    6037
55.5    5913
        ... 
96.0      10
14.0      10
13.5       9
95.5       6
94.5       5
Name: count, Length: 165, dtype: int64

# Frequency of age and variation

In [6]:
variation_table = df_full['Variation'].value_counts()
variation_table

Variation
Test       177114
Control    142407
Name: count, dtype: int64

# Frequency of age and Non-Experimeent

In [7]:
# Frequency of age including NaN values
frequency_age = df_full['clnt_age'].value_counts(dropna=False)

print(frequency_age)

clnt_age
NaN     300416
58.5      6432
57.5      6115
52.5      6061
59.5      6037
         ...  
96.0        10
14.0        10
13.5         9
95.5         6
94.5         5
Name: count, Length: 166, dtype: int64


In [8]:
# A frequency table for 'Variation' and 'clnt_age'
Vari_age_table = pd.crosstab(index=df_full['Variation'], columns=df_full['clnt_age'])

# Display the frequency table
print(Vari_age_table)


clnt_age   17.0  17.5  18.0  18.5  19.0  19.5  20.0  20.5  21.0  21.5  ...  \
Variation                                                              ...   
Control      11   180   169   253   200   456   313   611   346   681  ...   
Test          8   232   140   456   258   510   320   674   474  1003  ...   

clnt_age   91.0  91.5  92.0  92.5  93.0  93.5  94.0  94.5  95.5  96.0  
Variation                                                              
Control      17     6    27    18    12    32    12     0     0     4  
Test         22    19    15    10    15     0    13     5     6     6  

[2 rows x 158 columns]


In [9]:
# Fill NaN values in 'Variation' column with 'Non-Experiment'
df_full['Variation'] = df_full['Variation'].fillna('Non-Experiment')

# Group by 'clnt_age' and 'Variation', then count occurrences
most_frequent_variation = df_full.groupby(['clnt_age', 'Variation']).size().reset_index(name='count')

# Get the most frequent variation for each age
most_frequent_variation = most_frequent_variation.loc[most_frequent_variation.groupby('clnt_age')['count'].idxmax()]

print(most_frequent_variation)

     clnt_age       Variation  count
0        13.5  Non-Experiment      9
1        14.0  Non-Experiment     10
2        14.5  Non-Experiment    161
3        15.0  Non-Experiment    106
4        15.5  Non-Experiment    268
..        ...             ...    ...
461      93.5         Control     32
463      94.0            Test     13
464      94.5            Test      5
465      95.5            Test      6
467      96.0            Test      6

[165 rows x 3 columns]


In [10]:
# Fill NaN values in 'Variation' column with 'Non-Experiment'
df_full['Variation'] = df_full['Variation'].fillna('Non-Experiment')

# Group by 'clnt_age' and 'Variation', then count occurrences
most_frequent_variation = df_full.groupby(['clnt_age', 'Variation']).size().reset_index(name='count')

# Get the most frequent variation for each age
most_frequent_variation = most_frequent_variation.loc[most_frequent_variation.groupby('clnt_age')['count'].idxmax()]

# Create separate tables for each group
control_group = most_frequent_variation[most_frequent_variation['Variation'] == 'Control']
test_group = most_frequent_variation[most_frequent_variation['Variation'] == 'Test']
non_experiment_group = most_frequent_variation[most_frequent_variation['Variation'] == 'Non-Experiment']

# Display the results
print("Control Group:")
print(control_group)

print("\nTest Group:")
print(test_group)

print("\nNon-Experiment Group:")
print(non_experiment_group)

# Get the top 3 ages most frequent users of Vanguard in each group
top_3_control = control_group.nlargest(3, 'count')[['clnt_age', 'count']]
top_3_test = test_group.nlargest(3, 'count')[['clnt_age', 'count']]
top_3_non_experiment = non_experiment_group.nlargest(3, 'count')[['clnt_age', 'count']]

# Display the top 3 ages for each group
print(f"\nThe top 3 ages of most frequent users of Vanguard in the control group are: {top_3_control['clnt_age'].tolist()}")
print(f"The top 3 ages of most frequent users of Vanguard in the test group are: {top_3_test['clnt_age'].tolist()}")
print(f"The top 3 ages of most frequent users of Vanguard in the non-experiment group are: {top_3_non_experiment['clnt_age'].tolist()}")

Control Group:
     clnt_age Variation  count
55       25.0   Control   1266
61       26.0   Control   1064
67       27.0   Control   1293
73       28.0   Control   1323
79       29.0   Control   1254
91       31.0   Control   1382
109      34.0   Control   1137
121      36.0   Control   1158
127      37.0   Control   1017
133      38.0   Control   1048
139      39.0   Control    994
151      41.0   Control    973
169      44.0   Control   1212
187      47.0   Control   1164
205      50.0   Control   1320
211      51.0   Control   1350
319      69.0   Control    834
355      75.0   Control    543
388      80.5   Control    199
397      82.0   Control    185
415      85.0   Control     93
421      86.0   Control     80
427      87.0   Control     55
433      88.0   Control    144
454      92.0   Control     27
456      92.5   Control     18
461      93.5   Control     32

Test Group:
     clnt_age Variation  count
42       22.5      Test   1235
48       23.5      Test   1797
54       24

# Control Group and Age

In [11]:
# Filter for only the Control group
control_group_variation = most_frequent_variation[most_frequent_variation['Variation'] == 'Control']

# Count occurrences of 'clnt_age' for the Control group
control_age_frequency = control_group_variation.groupby('clnt_age')['count'].sum().reset_index()

# Print the age frequency
print(control_age_frequency)

    clnt_age  count
0       25.0   1266
1       26.0   1064
2       27.0   1293
3       28.0   1323
4       29.0   1254
5       31.0   1382
6       34.0   1137
7       36.0   1158
8       37.0   1017
9       38.0   1048
10      39.0    994
11      41.0    973
12      44.0   1212
13      47.0   1164
14      50.0   1320
15      51.0   1350
16      69.0    834
17      75.0    543
18      80.5    199
19      82.0    185
20      85.0     93
21      86.0     80
22      87.0     55
23      88.0    144
24      92.0     27
25      92.5     18
26      93.5     32


In [12]:
# Filter for only the Control group
control_group_variation = most_frequent_variation[most_frequent_variation['Variation'] == 'Control']

# Print the result
print("Control Group Variation:")
print(control_group_variation)

# Count occurrences of 'clnt_age' for the Control group
control_age_frequency = control_group_variation.groupby('clnt_age')['count'].sum().reset_index()

# Print the age frequency
print("\nControl Age Frequency:")
print(control_age_frequency)

# Sort the control group DataFrame by 'count' in descending order
sorted_control_group = control_group_variation.sort_values(by='count', ascending=False)

# Print the sorted control group DataFrame
print("\nSorted Control Group by Count:")
print(sorted_control_group)

# Get the top 3 ages in the control group
top_3_ages = sorted_control_group.head(3)

# Print the result
print("\nThe top 3 ages in the control group that used Vanguard the most frequently are:")
for index, row in top_3_ages.iterrows():
    print(f"Age: {row['clnt_age']}, Count: {row['count']}")

Control Group Variation:
     clnt_age Variation  count
55       25.0   Control   1266
61       26.0   Control   1064
67       27.0   Control   1293
73       28.0   Control   1323
79       29.0   Control   1254
91       31.0   Control   1382
109      34.0   Control   1137
121      36.0   Control   1158
127      37.0   Control   1017
133      38.0   Control   1048
139      39.0   Control    994
151      41.0   Control    973
169      44.0   Control   1212
187      47.0   Control   1164
205      50.0   Control   1320
211      51.0   Control   1350
319      69.0   Control    834
355      75.0   Control    543
388      80.5   Control    199
397      82.0   Control    185
415      85.0   Control     93
421      86.0   Control     80
427      87.0   Control     55
433      88.0   Control    144
454      92.0   Control     27
456      92.5   Control     18
461      93.5   Control     32

Control Age Frequency:
    clnt_age  count
0       25.0   1266
1       26.0   1064
2       27.0   1293
3  

# Test Group and Age 

In [13]:
# Filter for only the Test group
test_group_variation = most_frequent_variation[most_frequent_variation['Variation'] == 'Test']

# Count occurrences of 'clnt_age' for the Test group
test_age_frequency = test_group_variation.groupby('clnt_age')['count'].sum().reset_index()

# Print the age frequency
print(test_age_frequency)

     clnt_age  count
0        22.5   1235
1        23.5   1797
2        24.5   1675
3        25.5   2046
4        26.5   1957
..        ...    ...
111      93.0     15
112      94.0     13
113      94.5      5
114      95.5      6
115      96.0      6

[116 rows x 2 columns]


In [14]:
# Filter for only the Test group
test_group_variation = most_frequent_variation[most_frequent_variation['Variation'] == 'Test']

# Print the result
print("Test Group Variation:")
print(test_group_variation)

# Test occurrences of 'clnt_age' for the Test group
test_age_frequency = test_group_variation.groupby('clnt_age')['count'].sum().reset_index()

# Print the age frequency
print("\nTest Age Frequency:")
print(test_age_frequency)

# Sort the entire DataFrame by 'count' in descending order
sorted_variation = most_frequent_variation.sort_values(by='count', ascending=False)

# Print the sorted DataFrame
print("\nSorted Variation by Count:")
print(sorted_variation)

# Filter for only the Test group and sort by 'count'
test_group_sorted = most_frequent_variation[most_frequent_variation['Variation'] == 'Test'].sort_values(by='count', ascending=False)

# Print the sorted test group DataFrame
print("\nSorted Test Group by Count:")
print(test_group_sorted)

# Get the top 3 ages in the test group
top_3_ages = test_group_sorted.head(3)

# Print the result
print("\nThe top 3 ages in the test group that used Vanguard the most frequently are:")
for index, row in top_3_ages.iterrows():
    print(f"Age: {row['clnt_age']}, Count: {row['count']}")

Test Group Variation:
     clnt_age Variation  count
42       22.5      Test   1235
48       23.5      Test   1797
54       24.5      Test   1675
60       25.5      Test   2046
66       26.5      Test   1957
..        ...       ...    ...
460      93.0      Test     15
463      94.0      Test     13
464      94.5      Test      5
465      95.5      Test      6
467      96.0      Test      6

[116 rows x 3 columns]

Test Age Frequency:
     clnt_age  count
0        22.5   1235
1        23.5   1797
2        24.5   1675
3        25.5   2046
4        26.5   1957
..        ...    ...
111      93.0     15
112      94.0     13
113      94.5      5
114      95.5      6
115      96.0      6

[116 rows x 2 columns]

Sorted Variation by Count:
     clnt_age       Variation  count
282      62.5            Test   2659
264      59.5            Test   2598
270      60.5            Test   2591
228      53.5            Test   2590
252      57.5            Test   2585
..        ...             ...    ..

# Non-Experiment Group and Age

In [18]:
## Filter for only the Non-Experiment group
non_experiment_group_variation = most_frequent_variation[most_frequent_variation['Variation'] == 'Non-Experiment']

# Print the result
print("Non-Experiment Group Variation:")
print(non_experiment_group_variation)

# Occurrences of 'clnt_age' for the Non-Experiment group
non_experiment_age_frequency = non_experiment_group_variation.groupby('clnt_age')['count'].sum().reset_index()

# Print the age frequency
print("\nNon-Experiment Age Frequency:")
print(non_experiment_age_frequency)

# Sort the entire DataFrame by 'count' in descending order
sorted_variation = most_frequent_variation.sort_values(by='count', ascending=False)

# Print the sorted DataFrame
print("\nSorted Variation by Count:")
print(sorted_variation)

# Filter for only the Non-Experiment group and sort by 'count'
non_experiment_group_sorted = most_frequent_variation[most_frequent_variation['Variation'] == 'Non-Experiment'].sort_values(by='count', ascending=False)

# Print the sorted non-experiment group DataFrame
print("\nSorted Non-Experiment Group by Count:")
print(non_experiment_group_sorted)

# Get the top 3 ages in the non-experiment group
top_3_ages = non_experiment_group_sorted.head(3)

# Print the result
print("\nThe top 3 ages in the non-experiment group that used Vanguard the most frequently are:")
for index, row in top_3_ages.iterrows():
    print(f"Age: {row['clnt_age']}, Count: {row['count']}")

Non-Experiment Group Variation:
     clnt_age       Variation  count
0        13.5  Non-Experiment      9
1        14.0  Non-Experiment     10
2        14.5  Non-Experiment    161
3        15.0  Non-Experiment    106
4        15.5  Non-Experiment    268
5        16.0  Non-Experiment    252
6        16.5  Non-Experiment    443
8        17.0  Non-Experiment    332
11       17.5  Non-Experiment    462
14       18.0  Non-Experiment    432
17       18.5  Non-Experiment    732
20       19.0  Non-Experiment    607
23       19.5  Non-Experiment    812
26       20.0  Non-Experiment    655
29       20.5  Non-Experiment   1124
32       21.0  Non-Experiment    703
35       21.5  Non-Experiment   1259
38       22.0  Non-Experiment    719
44       23.0  Non-Experiment    941
50       24.0  Non-Experiment   1085
194      48.0  Non-Experiment   1042
413      84.5  Non-Experiment     96

Non-Experiment Age Frequency:
    clnt_age  count
0       13.5      9
1       14.0     10
2       14.5    161
3     

# Statistics of Age

In [21]:
# Mean age for the test group
test_group = df_full[df_full['Variation'] == 'Test']
mean_age_test_group = test_group['clnt_age'].mean()
print(f"The mean age of the test group is: {mean_age_test_group}")

The mean age of the test group is: 48.76155579653665


In [22]:
# Mean age for the control group

control_group = df_full[df_full['Variation'] == 'Control']
mean_age_control_group = control_group['clnt_age'].mean()
print(f"The mean age of the control group is: {mean_age_control_group}")

The mean age of the control group is: 48.302375081663186


In [23]:
# Mean age for the Non-Experiment group

nonexperiment_group = df_full[df_full['Variation'] == 'Non-Experiment']
mean_age_nonexperiment_group = nonexperiment_group['clnt_age'].mean()
print(f"The mean age of the Non-Experiment group group is: {mean_age_nonexperiment_group}")

The mean age of the Non-Experiment group group is: 45.401753424657535


In [24]:
# mean age of range of people using vanguard 
mean_age = df_full['clnt_age'].mean()
print(mean_age)

47.65550019344436


In [25]:
#mode age of range of people using vanguard
mode_age = df_full['clnt_age'].mode()
print(mode_age)

0    58.5
Name: clnt_age, dtype: float64


In [26]:
# Measure of min/maxage range for the users in vanguard group
min_age = df_full['clnt_age'].min()
max_age = df_full['clnt_age'].max()
print(min_age)
print(max_age) 

13.5
96.0


# Measures of Dispersion for each group

In [27]:
# the control group
control_group = most_frequent_variation[most_frequent_variation['Variation'] == 'Control']

# Calculate measures of dispersion for 'clnt_age' in the control group
variance_age = control_group['clnt_age'].var()
std_dev_age = control_group['clnt_age'].std()
min_age = control_group['clnt_age'].min()
max_age = control_group['clnt_age'].max()
range_age = max_age - min_age
quantiles_age = control_group['clnt_age'].quantile([0.25, 0.5, 0.75])

# Output the results
print("Variance of Ages:", variance_age)
print("Standard Deviation of Ages:", std_dev_age)
print("Minimum Age:", min_age)
print("Maximum Age:", max_age)
print("Range of Ages:", range_age)
print("Quantiles of Ages:\n", quantiles_age)

Variance of Ages: 645.2179487179487
Standard Deviation of Ages: 25.40114069718029
Minimum Age: 25.0
Maximum Age: 93.5
Range of Ages: 68.5
Quantiles of Ages:
 0.25    35.0
0.50    47.0
0.75    83.5
Name: clnt_age, dtype: float64


In [28]:
# the test group
test_group = most_frequent_variation[most_frequent_variation['Variation'] == 'Test']

# Calculate measures of dispersion for 'clnt_age' in the test group
variance_age = test_group ['clnt_age'].var()
std_dev_age = test_group['clnt_age'].std()
min_age = test_group['clnt_age'].min()
max_age = test_group['clnt_age'].max()
range_age = max_age - min_age
quantiles_age = test_group['clnt_age'].quantile([0.25, 0.5, 0.75])

# Output the results
print("Variance of Ages:", variance_age)
print("Standard Deviation of Ages:", std_dev_age)
print("Minimum Age:", min_age)
print("Maximum Age:", max_age)
print("Range of Ages:", range_age)
print("Quantiles of Ages:\n", quantiles_age)

Variance of Ages: 398.1191154422789
Standard Deviation of Ages: 19.952922478731754
Minimum Age: 22.5
Maximum Age: 96.0
Range of Ages: 73.5
Quantiles of Ages:
 0.25    44.250
0.50    60.750
0.75    76.125
Name: clnt_age, dtype: float64


In [29]:
# the non-experiment group
nonexperiment_group = most_frequent_variation[most_frequent_variation['Variation'] == 'Non-Experiment']

# Calculate measures of dispersion for 'Non-Experiment' in the test group
variance_age = nonexperiment_group['clnt_age'].var()
std_dev_age = nonexperiment_group['clnt_age'].std()
min_age = nonexperiment_group['clnt_age'].min()
max_age = nonexperiment_group['clnt_age'].max()
range_age = max_age - min_age
quantiles_age = nonexperiment_group ['clnt_age'].quantile([0.25, 0.5, 0.75])

# Output the results
print("Variance of Ages:", variance_age)
print("Standard Deviation of Ages:", std_dev_age)
print("Minimum Age:", min_age)
print("Maximum Age:", max_age)
print("Range of Ages:", range_age)
print("Quantiles of Ages:\n", quantiles_age)


Variance of Ages: 239.2034632034632
Standard Deviation of Ages: 15.466203904108571
Minimum Age: 13.5
Maximum Age: 84.5
Range of Ages: 71.0
Quantiles of Ages:
 0.25    16.125
0.50    18.750
0.75    21.375
Name: clnt_age, dtype: float64


# Stats of Tenure by Year

In [40]:
# Mean age and tenure for the test group
test_group = df_full[df_full['Variation'] == 'Test']

# Calculate mean age and mean tenure in months
mean_age_test_group = test_group['clnt_age'].mean()
mean_tenure_test_group = test_group['clnt_tenure_yr'].mean()

# Print the results
print(f"The mean age of the test group is: {mean_age_test_group}")
print(f"The mean tenure in years of the test group is: {mean_tenure_test_group}")

The mean age of the test group is: 48.76155579653665
The mean tenure in years of the test group is: 12.182023563432626


In [31]:
# Mean age and tenure for the control group
control_group = df_full[df_full['Variation'] == 'Control']

# Calculate mean age and mean tenure in years
mean_age_control_group = control_group['clnt_age'].mean()
mean_tenure_control_group = control_group['clnt_tenure_yr'].mean()

# Print the results
print(f"The mean age of the control group is: {mean_age_control_group}")
print(f"The mean tenure in years of the control group is: {mean_tenure_control_group}")

The mean age of the control group is: 48.302375081663186
The mean tenure in months of the control group is: 146.00687024509494


In [41]:
# Mean age and tenure for the non-experiment group
nonexperiment_group = df_full[df_full['Variation'] == 'Non-Experiment']

# Calculate mean age and mean tenure in years
mean_age_nonexperiment_group = nonexperiment_group['clnt_age'].mean()
mean_tenure_nonexperiment_group = nonexperiment_group['clnt_tenure_yr'].mean()

# Print the results
print(f"The mean age of the Non-Experiment group is: {mean_age_nonexperiment_group}")
print(f"The mean tenure in years of the Non-Experiment group is: {mean_tenure_nonexperiment_group}")

The mean age of the Non-Experiment group is: 45.401753424657535
The mean tenure in years of the Non-Experiment group is: 12.377416829745597


# Proportion of Control Group and Age

In [33]:
# Filter for the Control group
control_group = df_full[df_full['Variation'] == 'Control']

# Calculate variation groups and their counts for the Control group
variation_groups = control_group.groupby('Variation')['clnt_age'].size().reset_index(name='Count')

# Print the variation groups
print("Variation Groups for Control Group:")
print(variation_groups)

# Calculate age frequency for the Control group
age_frequency = control_group.groupby('clnt_age').size().reset_index(name='Count')

# Print the age frequency
print("\nAge Frequency for Control Group:")
print(age_frequency)

# Calculate the proportion of variations for the Control group
variation_proportion = control_group['Variation'].value_counts(normalize=True).reset_index(name='Proportion')
variation_proportion.columns = ['Variation', 'Proportion']

# Print the variation proportions for Control group
print("\nProportion of Variations for Control Group:")
print(variation_proportion)

# Calculate the proportion of ages for the Control group
age_proportion = control_group['clnt_age'].value_counts(normalize=True).reset_index(name='Proportion')
age_proportion.columns = ['clnt_age', 'Proportion']

# Print the age proportions for Control group
print("\nProportion of Ages for Control Group:")
print(age_proportion)

# Calculate the total count for each variation and age group for the Control group
variation_age_proportion = control_group.groupby(['Variation', 'clnt_age']).size().reset_index(name='Count')

# Calculate the proportion within each variation group
variation_age_proportion['Proportion'] = variation_age_proportion.groupby('Variation')['Count'].transform(lambda x: x / x.sum())

# Print the combined proportion of variation and age for Control group
print("\nProportion of Variation and Age for Control Group:")
print(variation_age_proportion)

# Get the top 3 proportions for the Control group according to age
top_3_age_proportions = age_proportion.nlargest(3, 'Proportion')

# Print the final statement
top_3_ages = top_3_age_proportions['clnt_age'].tolist()
top_3_proportions = top_3_age_proportions['Proportion'].tolist()
print(f"\nThe top 3 proportions for the Control group according to age are: {list(zip(top_3_ages, top_3_proportions))}")

Variation Groups for Control Group:
  Variation   Count
0   Control  142407

Age Frequency for Control Group:
     clnt_age  Count
0        17.0     11
1        17.5    180
2        18.0    169
3        18.5    253
4        19.0    200
..        ...    ...
150      92.5     18
151      93.0     12
152      93.5     32
153      94.0     12
154      96.0      4

[155 rows x 2 columns]

Proportion of Variations for Control Group:
  Variation  Proportion
0   Control         1.0

Proportion of Ages for Control Group:
     clnt_age  Proportion
0        58.5    0.014794
1        59.5    0.013684
2        61.5    0.013382
3        55.5    0.013375
4        62.5    0.013347
..        ...         ...
150      94.0    0.000084
151      17.0    0.000077
152      91.5    0.000042
153      96.0    0.000028
154      90.0    0.000021

[155 rows x 2 columns]

Proportion of Variation and Age for Control Group:
    Variation  clnt_age  Count  Proportion
0     Control      17.0     11    0.000077
1     Co

# Proportion of Test Group and Age

In [34]:
# Filter for the Test group
test_group = df_full[df_full['Variation'] == 'Test']

# Calculate variation groups and their counts for the Test group
variation_groups = test_group.groupby('Variation')['clnt_age'].size().reset_index(name='Count')

# Print the variation groups
print("Variation Groups for Test Group:")
print(variation_groups)

# Calculate age frequency for the Test group
age_frequency = test_group.groupby('clnt_age').size().reset_index(name='Count')

# Print the age frequency
print("\nAge Frequency for Test Group:")
print(age_frequency)

# Calculate the proportion of variations for the Test group
variation_proportion = test_group['Variation'].value_counts(normalize=True).reset_index(name='Proportion')
variation_proportion.columns = ['Variation', 'Proportion']

# Print the variation proportions for Test group
print("\nProportion of Variations for Test Group:")
print(variation_proportion)

# Calculate the proportion of ages for the Test group
age_proportion = test_group['clnt_age'].value_counts(normalize=True).reset_index(name='Proportion')
age_proportion.columns = ['clnt_age', 'Proportion']

# Print the age proportions for Test group
print("\nProportion of Ages for Test Group:")
print(age_proportion)

# Calculate the total count for each variation and age group for the Test group
variation_age_proportion = test_group.groupby(['Variation', 'clnt_age']).size().reset_index(name='Count')

# Calculate the proportion within each variation group
variation_age_proportion['Proportion'] = variation_age_proportion.groupby('Variation')['Count'].transform(lambda x: x / x.sum())

# Print the combined proportion of variation and age for Test group
print("\nProportion of Variation and Age for Test Group:")
print(variation_age_proportion)

# Get the top 3 proportions for the Test group according to age
top_3_age_proportions = age_proportion.nlargest(3, 'Proportion')

# Print the final statement
top_3_ages = top_3_age_proportions['clnt_age'].tolist()
top_3_proportions = top_3_age_proportions['Proportion'].tolist()
print(f"\nThe top 3 proportions for the Test group according to age are: {list(zip(top_3_ages, top_3_proportions))}")

Variation Groups for Test Group:
  Variation   Count
0      Test  177114

Age Frequency for Test Group:
     clnt_age  Count
0        17.0      8
1        17.5    232
2        18.0    140
3        18.5    456
4        19.0    258
..        ...    ...
152      93.0     15
153      94.0     13
154      94.5      5
155      95.5      6
156      96.0      6

[157 rows x 2 columns]

Proportion of Variations for Test Group:
  Variation  Proportion
0      Test         1.0

Proportion of Ages for Test Group:
     clnt_age  Proportion
0        62.5    0.015018
1        59.5    0.014673
2        60.5    0.014634
3        53.5    0.014628
4        57.5    0.014600
..        ...         ...
152      86.0    0.000051
153      17.0    0.000045
154      95.5    0.000034
155      96.0    0.000034
156      94.5    0.000028

[157 rows x 2 columns]

Proportion of Variation and Age for Test Group:
    Variation  clnt_age  Count  Proportion
0        Test      17.0      8    0.000045
1        Test      17.5

# Proportion of Non-Experiment group

In [35]:
## Filter for the Non-Experiment group
non_experiment_group = df_full[df_full['Variation'] == 'Non-Experiment']

# Calculate variation groups and their counts for the Non-Experiment group
variation_groups = non_experiment_group.groupby('Variation')['clnt_age'].size().reset_index(name='Count')

# Print the variation groups
print("Variation Groups for Non-Experiment Group:")
print(variation_groups)

# Calculate age frequency for the Non-Experiment group
age_frequency = non_experiment_group.groupby('clnt_age').size().reset_index(name='Count')

# Print the age frequency
print("\nAge Frequency for Non-Experiment Group:")
print(age_frequency)

# Calculate the proportion of variations for the Non-Experiment group
variation_proportion = non_experiment_group['Variation'].value_counts(normalize=True).reset_index(name='Proportion')
variation_proportion.columns = ['Variation', 'Proportion']

# Print the variation proportions for Non-Experiment group
print("\nProportion of Variations for Non-Experiment Group:")
print(variation_proportion)

# Calculate the proportion of ages for the Non-Experiment group
age_proportion = non_experiment_group['clnt_age'].value_counts(normalize=True).reset_index(name='Proportion')
age_proportion.columns = ['clnt_age', 'Proportion']

# Print the age proportions for Non-Experiment group
print("\nProportion of Ages for Non-Experiment Group:")
print(age_proportion)

# Calculate the total count for each variation and age group for the Non-Experiment group
variation_age_proportion = non_experiment_group.groupby(['Variation', 'clnt_age']).size().reset_index(name='Count')

# Calculate the proportion within each variation group
variation_age_proportion['Proportion'] = variation_age_proportion.groupby('Variation')['Count'].transform(lambda x: x / x.sum())

# Print the combined proportion of variation and age for Non-Experiment group
print("\nProportion of Variation and Age for Non-Experiment Group:")
print(variation_age_proportion)

# Get the top 3 proportions for the Non-Experiment group according to age
top_3_age_proportions = age_proportion.nlargest(3, 'Proportion')

# Print the final statement
top_3_ages = top_3_age_proportions['clnt_age'].tolist()
top_3_proportions = top_3_age_proportions['Proportion'].tolist()
print(f"\nThe top 3 proportions for the Non-Experiment group according to age are: {list(zip(top_3_ages, top_3_proportions))}")

Variation Groups for Non-Experiment Group:
        Variation   Count
0  Non-Experiment  428052

Age Frequency for Non-Experiment Group:
     clnt_age  Count
0        13.5      9
1        14.0     10
2        14.5    161
3        15.0    106
4        15.5    268
..        ...    ...
151      89.5      6
152      90.5      7
153      91.0      4
154      91.5     14
155      92.5      7

[156 rows x 2 columns]

Proportion of Variations for Non-Experiment Group:
        Variation  Proportion
0  Non-Experiment         1.0

Proportion of Ages for Non-Experiment Group:
     clnt_age  Proportion
0        51.5    0.014779
1        58.5    0.014106
2        50.5    0.013926
3        54.5    0.013777
4        55.5    0.013746
..        ...         ...
151      87.0    0.000070
152      90.5    0.000055
153      92.5    0.000055
154      89.5    0.000047
155      91.0    0.000031

[156 rows x 2 columns]

Proportion of Variation and Age for Non-Experiment Group:
          Variation  clnt_age  Coun

# Tenure and Age of Control Group

In [42]:
# Filter for control group
filtered_df = df_full[df_full['Variation'] == 'Control']

# Get top 3 ages with the longest tenure
longest_tenure = filtered_df.nlargest(3, 'clnt_tenure_yr')[['clnt_age', 'clnt_tenure_yr']]

# Get top 3 ages with the least tenure
least_tenure = filtered_df.nsmallest(3, 'clnt_tenure_yr')[['clnt_age', 'clnt_tenure_yr']]

# Display the results
print("Top 3 Ages with Longest Tenure:")
print(longest_tenure)

print("\nTop 3 Ages with Least Tenure:")
print(least_tenure)

Top 3 Ages with Longest Tenure:
       clnt_age  clnt_tenure_yr
26179      48.5            55.0
26180      48.5            55.0
26181      48.5            55.0

Top 3 Ages with Least Tenure:
       clnt_age  clnt_tenure_yr
12236      19.5             2.0
12237      19.5             2.0
12238      19.5             2.0


# Tenure and Age of Test Group

In [58]:
# Filter for test group
filtered_df = df_full[df_full['Variation'] == 'Test']

# Get top 3 ages with the longest tenure
longest_tenure = filtered_df.nlargest(3, 'clnt_tenure_yr')[['clnt_age', 'clnt_tenure_yr']]
longest_tenure['clnt_tenure_yr'] = longest_tenure['clnt_tenure_yr'].astype(int)  # Convert to integer

# Get top 3 ages with the least tenure
least_tenure = filtered_df.nsmallest(3, 'clnt_tenure_yr')[['clnt_age', 'clnt_tenure_yr']]
least_tenure['clnt_tenure_yr'] = least_tenure['clnt_tenure_yr'].astype(int)  # Convert to integer

# Display the results
print("Top 3 Ages with Longest Tenure:")
print(longest_tenure)

print("\nTop 3 Ages with Least Tenure:")
print(least_tenure)

Top 3 Ages with Longest Tenure:
        clnt_age  clnt_tenure_yr
121630      42.5              55
121631      42.5              55
121632      42.5              55

Top 3 Ages with Least Tenure:
       clnt_age  clnt_tenure_yr
19886      58.5               2
19887      58.5               2
19888      58.5               2


# Tenure and Age of Non-Experiment Group

In [50]:
# Filter for Non-Experiment group
filtered_df = df_full[df_full['Variation'] == 'Non-Experiment']

# Get top 3 ages with the longest tenure
longest_tenure = filtered_df.nlargest(3, 'clnt_tenure_yr')[['clnt_age', 'clnt_tenure_yr']]

# Get top 3 ages with the least tenure
least_tenure = filtered_df.nsmallest(3, 'clnt_tenure_yr')[['clnt_age', 'clnt_tenure_yr']]

# Display the results
print("Top 3 Ages with Longest Tenure:")
print(longest_tenure.astype({'clnt_age': 'int', 'clnt_tenure_yr': 'int'}))

print("\nTop 3 Ages with Least Tenure:")
print(least_tenure.astype({'clnt_age': 'int', 'clnt_tenure_yr': 'int'}))

Top 3 Ages with Longest Tenure:
        clnt_age  clnt_tenure_yr
322189        51              62
322190        51              62
245750        52              55

Top 3 Ages with Least Tenure:
        clnt_age  clnt_tenure_yr
257298        18               2
257332        18               2
257333        18               2


# The most and least amount of Log-ins according to Age and Variant Group 

In [57]:


# Function to get longest tenure and logon counts for a given group

def get_longest_tenure_and_logons(df, group_name):
    # Get the top 3 ages with the longest tenure
    longest_tenure = df.nlargest(3, 'clnt_tenure_yr')[['clnt_age', 'clnt_tenure_yr', 'logons_6_mnth']]
    longest_tenure['Group'] = group_name
    
    # Get the most frequent logon
    most_logons = longest_tenure.nlargest(1, 'logons_6_mnth')
    
    # Get the least frequent logon
    least_logons = longest_tenure.nsmallest(1, 'logons_6_mnth')
    
    return most_logons, least_logons

# Filter for each group
test_group = df_full[df_full['Variation'] == 'Test']
control_group = df_full[df_full['Variation'] == 'Control']
non_experiment_group = df_full[df_full['Variation'] == 'Non-Experiment']

# Get longest tenure and logons for each group
most_tenure_test, least_tenure_test = get_longest_tenure_and_logons(test_group, 'Test')
most_tenure_control, least_tenure_control = get_longest_tenure_and_logons(control_group, 'Control')
most_tenure_non_experiment, least_tenure_non_experiment = get_longest_tenure_and_logons(non_experiment_group, 'Non-Experiment')

# Display results and summary statements
def print_summary(most_tenure, least_tenure, group_name):
    most_age = most_tenure['clnt_age'].values[0]
    most_tenure_value = int(most_tenure['clnt_tenure_yr'].values[0])  # Convert to integer
    most_logons = most_tenure['logons_6_mnth'].values[0]  # Get the number of logons

    least_age = least_tenure['clnt_age'].values[0]
    least_tenure_value = int(least_tenure['clnt_tenure_yr'].values[0])  # Convert to integer
    least_logons = least_tenure['logons_6_mnth'].values[0]

    # Updated print statements
    print(f"This {most_age} year old with {most_tenure_value} years tenure in the {group_name} group logged on the most in 6 months. They logged on {most_logons} times.")
    print(f"This {least_age} year old with {least_tenure_value} years tenure in the {group_name} group logged on the least in 6 months. They logged on {least_logons} times.")

# Example usage
print_summary(most_tenure_test, least_tenure_test, 'Test')
print_summary(most_tenure_control, least_tenure_control, 'Control')
print_summary(most_tenure_non_experiment, least_tenure_non_experiment, 'Non-Experiment')

This 42.5 year old with 55 years tenure in the Test group logged on the most in 6 months. They logged on 4.0 times.
This 42.5 year old with 55 years tenure in the Test group logged on the least in 6 months. They logged on 4.0 times.
This 48.5 year old with 55 years tenure in the Control group logged on the most in 6 months. They logged on 3.0 times.
This 48.5 year old with 55 years tenure in the Control group logged on the least in 6 months. They logged on 3.0 times.
This 51.0 year old with 62 years tenure in the Non-Experiment group logged on the most in 6 months. They logged on 4.0 times.
This 52.5 year old with 55 years tenure in the Non-Experiment group logged on the least in 6 months. They logged on 2.0 times.


# The most and least amount of Calls according to Age and Variant Group 

In [55]:
# Function to get longest tenure and call counts for a given group

def get_longest_tenure_and_calls(df, group_name):
    # Get the top 3 ages with the longest tenure
    longest_tenure = df.nlargest(3, 'clnt_tenure_yr')[['clnt_age', 'clnt_tenure_yr', 'calls_6_mnth']]
    longest_tenure['Group'] = group_name
    
    # Get the most frequent calls
    most_calls = longest_tenure.nlargest(1, 'calls_6_mnth')
    
    # Get the least frequent calls
    least_calls = longest_tenure.nsmallest(1, 'calls_6_mnth')
    
    return most_calls, least_calls

# Filter for each group
test_group = df_full[df_full['Variation'] == 'Test']
control_group = df_full[df_full['Variation'] == 'Control']
non_experiment_group = df_full[df_full['Variation'] == 'Non-Experiment']

# Get longest tenure and calls for each group
most_tenure_test, least_tenure_test = get_longest_tenure_and_calls(test_group, 'Test')
most_tenure_control, least_tenure_control = get_longest_tenure_and_calls(control_group, 'Control')
most_tenure_non_experiment, least_tenure_non_experiment = get_longest_tenure_and_calls(non_experiment_group, 'Non-Experiment')

# Display results and summary statements
def print_summary(most_tenure, least_tenure, group_name):
    most_age = most_tenure['clnt_age'].values[0]
    most_tenure_value = int(most_tenure['clnt_tenure_yr'].values[0])  # Convert to integer
    most_calls = most_tenure['calls_6_mnth'].values[0]  # Get the number of calls

    least_age = least_tenure['clnt_age'].values[0]
    least_tenure_value = int(least_tenure['clnt_tenure_yr'].values[0])  # Convert to integer
    least_calls = least_tenure['calls_6_mnth'].values[0]

    # Updated print statement
    print(f"This {most_age} year old with {most_tenure_value} years tenure in the {group_name} group called the most in 6 months. They called {most_calls} times.")
    print(f"This {least_age} year old with {least_tenure_value} years tenure in the {group_name} group called the least in 6 months. They called {least_calls} times.")

# Example usage
print_summary(most_tenure_test, least_tenure_test, 'Test')
print_summary(most_tenure_control, least_tenure_control, 'Control')
print_summary(most_tenure_non_experiment, least_tenure_non_experiment, 'Non-Experiment')

This 42.5 year old with 55 years tenure in the Test group called the most in 6 months. They called 1.0 times.
This 42.5 year old with 55 years tenure in the Test group called the least in 6 months. They called 1.0 times.
This 48.5 year old with 55 years tenure in the Control group called the most in 6 months. They called 0.0 times.
This 48.5 year old with 55 years tenure in the Control group called the least in 6 months. They called 0.0 times.
This 51.0 year old with 62 years tenure in the Non-Experiment group called the most in 6 months. They called 4.0 times.
This 52.5 year old with 55 years tenure in the Non-Experiment group called the least in 6 months. They called 2.0 times.
