## Setup

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
import os

In [8]:
# Locations of the three cleaned CSV exports, given as relative paths.
path1, path2, path3 = (
    "df_demo_clean.csv",
    "df_experiment_clean.csv",
    "df_web_clean.csv",
)

# Read every file the same way: the first column of each CSV becomes the row index.
df_demo, df_exp, df_web = (
    pd.read_csv(csv_path, index_col=0) for csv_path in (path1, path2, path3)
)


In [54]:
# Report the size and the column dtypes of the experiment table before reordering it.
print(
    f'The data set has {df_exp.shape[0]} rows and {df_exp.shape[1]} columns with the following types:',
    df_exp.dtypes,
    sep='\n',
)

# Order the rows by client_id in place; ignore_index renumbers the index 0..n-1
# in the same call, so no separate reset_index step is needed.
df_exp.sort_values(by='client_id', ignore_index=True, inplace=True)

# Preview five randomly chosen rows (no seed is passed, so the selection varies between runs).
df_exp.sample(5)

The data set has 70609 rows and 2 columns with the following types:
client_id     int64
Variation    object
dtype: object


Unnamed: 0,client_id,Variation
9221,1324640,Test
27893,3960280,Test
7887,1131564,Test
22972,3277823,Test
5343,753820,


In [58]:
# Cleaning pipeline applied to `df`; the innermost call runs first.
# The helpers are defined outside this cell, so the descriptions below are
# assumptions taken from their names - TODO confirm against their definitions:
#   1. cleaning_gender    - presumably handles NaN values in the gender column
#   2. drop_dub           - presumably drops duplicated rows
#   3. cleaning_Variation - presumably handles NaN values in the Variation column
df_clean = cleaning_Variation(
    drop_dub(
        cleaning_gender(df)
    )
)

In [59]:
# Missing-value count for every column of df_clean (summed down the rows).
df_clean.isna().sum(axis=0)

client_id            0
clnt_tenure_yr      14
clnt_tenure_mnth    14
clnt_age            15
gendr                0
num_accts           14
bal                 14
calls_6_mnth        14
logons_6_mnth       14
variation            0
dtype: int64

In [60]:
# Summary statistics for every column type (include='all'), leaving out the
# client_id column and rounding the figures to two decimals.
(
    df_clean
    .describe(include='all')
    .drop('client_id', axis=1)
    .round(2)
)

Unnamed: 0,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,variation
count,70595.0,70595.0,70594.0,70609,70595.0,70595.0,70595.0,70595.0,70609
unique,,,,3,,,,,3
top,,,,U,,,,,Test
freq,,,,24139,,,,,26968
mean,12.05,150.66,46.44,,2.26,147445.24,3.38,5.57,
std,6.87,82.09,15.59,,0.53,301508.71,2.24,2.35,
min,2.0,33.0,13.5,,1.0,13789.42,0.0,1.0,
25%,6.0,82.0,32.5,,2.0,37346.84,1.0,4.0,
50%,11.0,136.0,47.0,,2.0,63332.9,3.0,5.0,
75%,16.0,192.0,59.0,,2.0,137544.9,6.0,7.0,


In [61]:
# Print the first mode of each listed column. The leading '\n' in every message
# after the first keeps a blank line between consecutive results.
print(
    f'Mode for years : {df_clean.clnt_tenure_yr.mode()[0]}',
    f'\nMode for months : {df_clean.clnt_tenure_mnth.mode()[0]}',
    f'\nMode for age : {df_clean.clnt_age.mode()[0]}',
    f'\nMode for calls_6_mnth : {df_clean.calls_6_mnth.mode()[0]}',
    f'\nMode for logons_6_mnth : {df_clean.logons_6_mnth.mode()[0]}',
    sep='\n',
)

Mode for years : 6.0

Mode for months : 58.0

Mode for age : 58.5

Mode for calls_6_mnth : 6.0

Mode for logons_6_mnth : 9.0


In [62]:
# Based on the initial statistical analysis, missing values in these numeric
# columns are passed through the Null_Median helper (defined outside this cell;
# presumably it fills NaNs with the column median - TODO confirm).
for median_column in (
    'clnt_tenure_mnth',
    'clnt_tenure_yr',
    'clnt_age',
    'calls_6_mnth',
    'logons_6_mnth',
):
    df_clean = Null_Median(df_clean, median_column)

# num_accts and bal (14 NaN values each) are not imputed: every row that still
# holds a NaN in any column is dropped in place and the index is renumbered.
# NOTE(review): most columns report the same 14 NaNs and exactly 14 rows
# disappear here, so the imputation above may only matter for clnt_age - confirm.
df_clean.dropna(axis=0, how='any', inplace=True, ignore_index=True)
df_clean.tail(10)

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,variation
70585,9998964,5.0,71.0,45.5,U,3.0,91766.49,6.0,9.0,Test
70586,9998980,7.0,87.0,22.5,U,2.0,59061.36,7.0,7.0,Unknown
70587,9999009,18.0,216.0,68.5,M,3.0,2055381.29,6.0,9.0,Test
70588,9999150,5.0,66.0,30.0,U,3.0,97141.71,6.0,9.0,Test
70589,9999333,6.0,81.0,65.5,U,2.0,109932.1,7.0,7.0,Unknown
70590,9999400,7.0,86.0,28.5,U,2.0,51787.04,0.0,3.0,Test
70591,9999626,9.0,113.0,35.0,M,2.0,36642.88,6.0,9.0,Test
70592,9999729,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0,Test
70593,9999832,23.0,281.0,49.0,F,2.0,431887.61,1.0,4.0,Test
70594,9999839,13.0,160.0,28.5,F,2.0,67425.35,3.0,3.0,Unknown


In [63]:
# Report, one item per line: the shape of df_clean, its column dtypes and
# the count of missing values per column.
print(
    f'The data set has {df_clean.shape[0]} rows and {df_clean.shape[1]} columns with the following types:',
    df_clean.dtypes,
    df_clean.isna().sum(),
    sep='\n',
)

The data set has 70595 rows and 10 columns with the following types:
client_id             int64
clnt_tenure_yr      float64
clnt_tenure_mnth    float64
clnt_age            float64
gendr                object
num_accts           float64
bal                 float64
calls_6_mnth        float64
logons_6_mnth       float64
variation            object
dtype: object
client_id           0
clnt_tenure_yr      0
clnt_tenure_mnth    0
clnt_age            0
gendr               0
num_accts           0
bal                 0
calls_6_mnth        0
logons_6_mnth       0
variation           0
dtype: int64


In [64]:
# Summary statistics for every column type (include='all') after the missing
# values were handled, leaving out client_id and rounding to two decimals.
(
    df_clean
    .describe(include='all')
    .drop('client_id', axis=1)
    .round(2)
)

Unnamed: 0,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,variation
count,70595.0,70595.0,70595.0,70595,70595.0,70595.0,70595.0,70595.0,70595
unique,,,,3,,,,,3
top,,,,U,,,,,Test
freq,,,,24125,,,,,26961
mean,12.05,150.66,46.44,,2.26,147445.24,3.38,5.57,
std,6.87,82.09,15.59,,0.53,301508.71,2.24,2.35,
min,2.0,33.0,13.5,,1.0,13789.42,0.0,1.0,
25%,6.0,82.0,32.5,,2.0,37346.84,1.0,4.0,
50%,11.0,136.0,47.0,,2.0,63332.9,3.0,5.0,
75%,16.0,192.0,59.0,,2.0,137544.9,6.0,7.0,


In [9]:

# Generate the CREATE TABLE statement for the `demo` table from df_demo's column dtypes.
# DEMO

# pandas dtype name -> SQL column type used in the generated statement.
sql_type_by_dtype = {'object': 'VARCHAR(255)', 'int64': 'INT', 'float64': 'FLOAT'}

# Fail loudly on a dtype without a mapping instead of silently leaving that
# column out of the table definition.
unmapped_columns = [
    column_name
    for column_name, dtype in zip(df_demo.columns, df_demo.dtypes)
    if str(dtype) not in sql_type_by_dtype
]
if unmapped_columns:
    raise ValueError(
        f"No SQL type mapping for column(s) {unmapped_columns} of df_demo; "
        "extend sql_type_by_dtype to cover their dtypes."
    )

# Joining the definitions avoids trimming a trailing ", " afterwards, which would
# cut into the statement header whenever no column definition had been appended.
column_definitions = ", ".join(
    f"{column_name} {sql_type_by_dtype[str(dtype)]}"
    for column_name, dtype in zip(df_demo.columns, df_demo.dtypes)
)
create_table_query = f"CREATE TABLE demo ({column_definitions});"
print(create_table_query)

CREATE TABLE demo (client_id INT, clnt_tenure_yr FLOAT, clnt_tenure_mnth FLOAT, clnt_age FLOAT, gendr VARCHAR(255), num_accts FLOAT, bal FLOAT, calls_6_mnth FLOAT, logons_6_mnth FLOAT);


In [10]:
# Generate the CREATE TABLE statement for the `experiment` table from df_exp's column dtypes.
# EXPERIMENT

# pandas dtype name -> SQL column type used in the generated statement.
sql_type_by_dtype = {'object': 'VARCHAR(255)', 'int64': 'INT', 'float64': 'FLOAT'}

# Fail loudly on a dtype without a mapping instead of silently leaving that
# column out of the table definition.
unmapped_columns = [
    column_name
    for column_name, dtype in zip(df_exp.columns, df_exp.dtypes)
    if str(dtype) not in sql_type_by_dtype
]
if unmapped_columns:
    raise ValueError(
        f"No SQL type mapping for column(s) {unmapped_columns} of df_exp; "
        "extend sql_type_by_dtype to cover their dtypes."
    )

# Joining the definitions avoids trimming a trailing ", " afterwards, which would
# cut into the statement header whenever no column definition had been appended.
column_definitions = ", ".join(
    f"{column_name} {sql_type_by_dtype[str(dtype)]}"
    for column_name, dtype in zip(df_exp.columns, df_exp.dtypes)
)
create_table_query = f"CREATE TABLE experiment ({column_definitions});"
print(create_table_query)

CREATE TABLE experiment (client_id INT, variation VARCHAR(255));


In [11]:
# Generate the CREATE TABLE statement for the `web` table from df_web's column dtypes.
# WEB

# pandas dtype name -> SQL column type used in the generated statement.
sql_type_by_dtype = {'object': 'VARCHAR(255)', 'int64': 'INT', 'float64': 'FLOAT'}

# Fail loudly on a dtype without a mapping instead of silently leaving that
# column out of the table definition.
unmapped_columns = [
    column_name
    for column_name, dtype in zip(df_web.columns, df_web.dtypes)
    if str(dtype) not in sql_type_by_dtype
]
if unmapped_columns:
    raise ValueError(
        f"No SQL type mapping for column(s) {unmapped_columns} of df_web; "
        "extend sql_type_by_dtype to cover their dtypes."
    )

# Joining the definitions avoids trimming a trailing ", " afterwards, which would
# cut into the statement header whenever no column definition had been appended.
column_definitions = ", ".join(
    f"{column_name} {sql_type_by_dtype[str(dtype)]}"
    for column_name, dtype in zip(df_web.columns, df_web.dtypes)
)
create_table_query = f"CREATE TABLE web ({column_definitions});"
print(create_table_query)

CREATE TABLE web (client_id INT, process_step VARCHAR(255), date_time VARCHAR(255));
