In [1]:
import pandas as pd

In [2]:
# Read the raw survey export (wide layout: one rating column per platform/question)
# and preview the first rows.
df = pd.read_csv(filepath_or_buffer='./data/input/DSB Customer Survery.csv')
df.head(n=5)

Unnamed: 0,Customer ID,Mobile App - Ease of Use,Mobile App - Ease of Access,Mobile App - Navigation,Mobile App - Likelihood to Recommend,Mobile App - Overall Rating,Online Interface - Ease of Use,Online Interface - Ease of Access,Online Interface - Navigation,Online Interface - Likelihood to Recommend,Online Interface - Overall Rating
0,535084,2,1,5,4,1,4,4,5,2,3
1,250892,3,5,4,4,2,5,5,2,4,3
2,544191,5,3,4,4,1,3,3,2,3,1
3,949343,2,5,4,3,1,1,4,3,5,1
4,915305,3,1,2,1,1,4,2,4,3,2


# Preprocessing

In [3]:
# Turn the wide survey table into one row per (customer, question), with the
# Mobile App and Online Interface ratings side by side in their own columns.
df_prep = (
    df
    # Unpivot every rating column into (Customer ID, Category, Rating) rows
    .melt(id_vars='Customer ID', var_name='Category', value_name='Rating')
    # Discard the Overall Rating rows; only the individual questions are kept
    .loc[lambda long: ~long['Category'].str.contains('Overall Rating')]
    # Category reads "<Platform> - <Question>": break it into its two parts
    .assign(
        Platform=lambda long: long['Category'].str.split(' - ').str.get(0),
        Question=lambda long: long['Category'].str.split(' - ').str.get(1),
    )
    .drop(columns='Category')
    # Spread the platforms into columns; 'first' takes the first rating found
    # for each (customer, question, platform) cell
    .pivot_table(
        values='Rating',
        index=['Customer ID', 'Question'],
        columns='Platform',
        aggfunc='first',
    )
    # Remove the leftover 'Platform' label from the column axis
    .rename_axis(None, axis='columns')
    .reset_index()
    .sort_values(by=['Customer ID', 'Question'], ignore_index=True)
)

df_prep.head()

Unnamed: 0,Customer ID,Question,Mobile App,Online Interface
0,101646,Ease of Access,5,4
1,101646,Ease of Use,3,2
2,101646,Likelihood to Recommend,4,4
3,101646,Navigation,2,3
4,101650,Ease of Access,4,5


# Calculations

In [4]:
# Category labels, ordered from the strongest Online Interface preference to the
# strongest Mobile App preference.
PREFERENCE_LABELS = [
    'Online Interface Superfan',
    'Online Interface Fan',
    'Neutral',
    'Mobile App Fan',
    'Mobile App Superfan',
]
# Minimum absolute gap between the two average ratings for each preference level.
FAN_THRESHOLD = 1
SUPERFAN_THRESHOLD = 2


def categorize_preference(difference):
    """Label each "Mobile App minus Online Interface" rating gap.

    The thresholds are inclusive on both sides of zero: a gap of exactly
    +/-1 counts as a Fan and exactly +/-2 as a Superfan. (A single
    ``pd.cut`` over the signed difference cannot express this, because its
    bins are closed on the same side everywhere, which would make -1 a
    "Fan" but +1 "Neutral".)

    Parameters
    ----------
    difference : pandas.Series
        Mobile App average rating minus Online Interface average rating.

    Returns
    -------
    pandas.Series
        Ordered categorical using ``PREFERENCE_LABELS``; missing differences
        stay missing.
    """
    # Bucket the size of the gap with left-closed bins:
    # [0, 1) -> 0 (neutral), [1, 2) -> 1 (fan), [2, inf) -> 2 (superfan).
    strength = pd.cut(
        difference.abs(),
        bins=[0, FAN_THRESHOLD, SUPERFAN_THRESHOLD, float('inf')],
        right=False,
        labels=False,
    )
    # Re-attach the direction: negative gaps favour the Online Interface.
    signed_strength = strength.where(difference >= 0, -strength)
    # -2..2 line up with PREFERENCE_LABELS in order.
    label_by_strength = dict(zip(range(-2, 3), PREFERENCE_LABELS))
    return (
        signed_strength
        .map(label_by_strength)
        .astype(pd.CategoricalDtype(PREFERENCE_LABELS, ordered=True))
    )


customer_avg_ratings = (
    df_prep
    # Average rating per customer for each platform
    .groupby('Customer ID')
    .agg(
        Mobile_App=('Mobile App', 'mean'),
        Online_Interface=('Online Interface', 'mean'),
    )
    # Positive difference = the customer rates the Mobile App higher
    .assign(
        Difference=lambda x: x['Mobile_App'] - x['Online_Interface']
    )
    .reset_index()
    # Categorize the customers into groups based on the difference
    .assign(
        Category=lambda x: categorize_preference(x['Difference'])
    )
)

customer_avg_ratings.head()

Unnamed: 0,Customer ID,Mobile_App,Online_Interface,Difference,Category
0,101646,3.5,3.25,0.25,Neutral
1,101650,2.25,3.0,-0.75,Neutral
2,105088,3.5,4.25,-0.75,Neutral
3,109306,2.0,2.0,0.0,Neutral
4,110719,3.0,3.5,-0.5,Neutral


In [5]:
# Share of customers in each preference category, as a percentage rounded to
# 1 decimal place, written to a semicolon-separated CSV.
(
    customer_avg_ratings['Category']
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
    # Name the label index and the values explicitly instead of renaming the
    # columns produced by reset_index(): those default names differ between
    # pandas versions ('Category'/'proportion' vs. 'index'/'Category'), and a
    # rename keyed on them can silently mislabel the output header.
    .rename_axis('Preference')
    .rename('% of Total')
    .reset_index()
    .to_csv(
        path_or_buf='./data/output/output_2023_06.csv',
        index=False,
        sep=';',
        quoting=1,  # 1 == csv.QUOTE_ALL: wrap every field in quotes
        quotechar='"',
    )
)