- Created date: 2023. 02. 12
- Last updated: 2023. 02. 12
- Author: Chongho Pyo
- Version: 0.0

**Market segmentation**
- (original)
- Consumer income and gender are the target variables; all other features are predictors.
- Understand how to attract different types of customers. Useful for expanding their reach and consumer base

**Customer Segmentation**
- Customer segmentation: grouping customers together based on similar features or properties.

In [1]:
!pip install category_encoders

In [2]:
import pandas as pd
import numpy as np
import itertools

import random

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing 
import category_encoders as ce

# Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.express as px

from mpl_toolkits.mplot3d import Axes3D

%matplotlib inline

In [3]:
# Load the raw dataset from the CSV file (path is relative to the working directory).
df_raw = pd.read_csv("media prediction and its cost.csv")
# Preview the first rows to check the load.
df_raw.head()

In [4]:
# Customer related column names
# NOTE(review): the columns are picked by position (index 8 up to, but not
# including, 19), so a change in the CSV column order silently selects
# different columns — confirm against the file.
cus_cols = df_raw.columns[8: 19]
cus_cols

# Visualization functions

## Pie chart
- A function to draw a pie chart
- Parameter
    - df (DataFrame): e.g., df_mid
    - group_name (string): a name of segment, in lower case (e.g., male, female)
    - columns = customer info attributes

In [5]:
def draw_pie (df, columns, group_name):
    """Draw one donut chart per customer attribute for a single segment.

    Parameters
    ----------
    df : DataFrame
        Rows belonging to the segment (e.g., df_mid).
    columns : iterable of str
        Customer attribute column names to plot.
    group_name : str
        Lower-case segment name. 'male' / 'female' skip the 'gender' column;
        'low' / 'mid' / 'high' skip the 'avg. yearly_income' column.

    Raises
    ------
    ValueError
        If group_name is not one of the supported segment names.
    """
    # Attribute that defines each segment; it is left out of that segment's charts.
    segment_attribute = {
        'male': 'gender',
        'female': 'gender',
        'low': 'avg. yearly_income',
        'mid': 'avg. yearly_income',
        'high': 'avg. yearly_income',
    }
    if group_name not in segment_attribute:
        # Fail with a clear message instead of an UnboundLocalError on `cols`.
        raise ValueError(
            "Unsupported group_name " + repr(group_name)
            + "; expected one of " + str(sorted(segment_attribute))
        )

    skipped = segment_attribute[group_name]
    cols = [c for c in columns if c != skipped]
    figsize = (6, 6)

    for col in cols:
        # Use the value_counts Series directly: the name of the counts column
        # in a wrapping DataFrame differs between pandas versions.
        counts = df[col].value_counts()
        labels = counts.index

        fig1, ax1 = plt.subplots(figsize=figsize)

        colors = sns.color_palette("husl", len(labels))

        wedges, texts, autotexts = ax1.pie(counts.values, labels=labels,
                                           colors=colors, autopct='%1.1f%%', startangle=90,
                                           textprops={'fontsize': 10},
                                           wedgeprops={'linewidth': 2, 'edgecolor': 'white'},
                                           pctdistance=0.8)

        for autotext in autotexts:
            autotext.set_color('black')
            autotext.set_size(10)

        # Draw a white circle at the center of the pie to make it look like a donut
        centre_circle = plt.Circle((0, 0), 0.6, color='white', fc='white', linewidth=0)
        ax1.add_artist(centre_circle)

        # Equal aspect ratio ensures that pie is drawn as a circle
        ax1.axis('equal')
        ax1.set_title(f"{col} Distribution for {group_name}", fontsize=16,
                      fontweight='bold')

        # Legend is anchored outside the axes so it does not cover the chart
        ax1.legend(wedges, labels, fontsize=8, bbox_to_anchor=(1.05, 1),
                   loc='upper left', borderaxespad=0)
        plt.tight_layout()
        plt.show()


## Histogram
- A function to draw histograms
- Parameter
    - df (DataFrame): e.g., df_mid
    - group_name (string): a name of segment, in lower case (e.g., male, female)
    - columns = customer info attributes

In [6]:
def draw_histo (df, columns, group_name):
    """Draw one histogram per customer attribute for a single segment.

    Parameters
    ----------
    df : DataFrame
        Rows belonging to the segment (e.g., df_mid).
    columns : iterable of str
        Customer attribute column names to plot.
    group_name : str
        Lower-case segment name. 'male' / 'female' skip the 'gender' column;
        'low' / 'mid' / 'high' skip the 'avg. yearly_income' column.

    Raises
    ------
    ValueError
        If group_name is not one of the supported segment names.
    """
    # Attribute that defines each segment; it is left out of that segment's plots.
    segment_attribute = {
        'male': 'gender',
        'female': 'gender',
        'low': 'avg. yearly_income',
        'mid': 'avg. yearly_income',
        'high': 'avg. yearly_income',
    }
    if group_name not in segment_attribute:
        # Fail with a clear message instead of an UnboundLocalError on `cols`.
        raise ValueError(
            "Unsupported group_name " + repr(group_name)
            + "; expected one of " + str(sorted(segment_attribute))
        )

    skipped = segment_attribute[group_name]
    cols = [c for c in columns if c != skipped]

    for col in cols:
        sns.histplot(data=df, x=col, kde=False, color="lightblue")

        plt.title(f"Distribution of {col} {group_name}",
                  fontsize=14, fontweight='bold')
        plt.xlabel(col, fontsize=12)
        plt.ylabel("Frequency", fontsize=12)

        sns.despine()

        # Small tick labels keep long category names readable
        plt.xticks(fontsize=7)
        plt.show()

## Bivariate Analysis
- A function to draw scatter plots
- Parameter
    - df (DataFrame): e.g., df_mid
    - group_name (string): a name of segment, in lower case (e.g., male, female)
    - columns = customer info attributes (currently unused; the plotted columns are fixed inside the function)

In [7]:
def draw_bivar (df, columns, group_name):
    """Show a Plotly scatter plot for each pair of four fixed customer columns.

    Parameters
    ----------
    df : DataFrame
        Rows belonging to the segment (e.g., df_mid).
    columns :
        Accepted for symmetry with the other drawing helpers; not used here.
    group_name : str
        Segment name appended to each plot title.
    """
    plotted_columns = ['total_children',
                       'avg_cars_at home(approx)',
                       'num_children_at_home',
                       'rand_income']

    # Every unordered pair of the columns above gets its own figure.
    for x_col, y_col in itertools.combinations(plotted_columns, 2):
        scatter = px.scatter(data_frame=df, x=x_col, y=y_col,
                             height=500,
                             color_discrete_sequence=['skyblue'])

        # Centered title of the form "<x> and <y> <group_name>"
        centered_title = {
            'text': " ".join([x_col, "and", y_col, group_name]),
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
        }
        scatter.update_layout(title=centered_title)

        scatter.show()

# Segmentation: Gender
- Characteristics of each group, based on gender
- Either Male or Female

In [8]:
# One dataframe per gender code
df_male = df_raw.loc[df_raw['gender'].eq('M')]
df_female = df_raw.loc[df_raw['gender'].eq('F')]

# Sanity check: True only when every row is coded either 'M' or 'F'
len(df_male) + len(df_female) == len(df_raw)

## Pie chart

### Male

In [9]:
# Donut charts of the customer attributes for male customers ('gender' is skipped for this group).
draw_pie(df_male, cus_cols, "male")

### Female

In [10]:
# Donut charts of the customer attributes for female customers ('gender' is skipped for this group).
draw_pie(df_female, cus_cols, "female")

## Univariate analysis (Histograms)
- Univariate analysis entails evaluating a single feature in order to get insights about it.

### Male

In [11]:
# Histograms of the customer attributes for male customers ('gender' is skipped for this group).
draw_histo(df_male, cus_cols, "male")

### Female

In [12]:
# Histograms of the customer attributes for female customers ('gender' is skipped for this group).
draw_histo(df_female, cus_cols, "female")

## Multivariate Analysis

### Male

In [13]:
# Display the customer column names again for reference.
cus_cols

In [14]:
# Customer attributes treated as categorical.
cate_cols = ['marital_status', 'gender', 'education', 'member_card',
             'occupation', 'houseowner']

In [15]:
# Customer columns that are not listed in cate_cols (displayed for reference).
[c for c in cus_cols if c not in cate_cols]

In [16]:
# Male customers: plot every listed attribute against cost, once per
# colour grouping (attributes vary slowest, groupings fastest).
for x, g in itertools.product(
    ['total_children',
     'avg_cars_at home(approx)',
     'avg. yearly_income',
     'num_children_at_home',
     'avg_cars_at home(approx).1'],
    ['education', 'member_card', 'occupation'],
):
    fig = px.scatter(
        data_frame=df_male,
        x=x,
        y="cost",
        color=g,
        height=500,
        title=f"Relationship between {x} VS Cost based on {g}",
    )
    fig.show()

### Female

In [17]:
# Female customers: plot every listed attribute against cost, once per
# colour grouping (attributes vary slowest, groupings fastest).
for x, g in itertools.product(
    ['total_children',
     'avg_cars_at home(approx)',
     'avg. yearly_income',
     'num_children_at_home',
     'avg_cars_at home(approx).1'],
    ['education', 'member_card', 'occupation'],
):
    fig = px.scatter(
        data_frame=df_female,
        x=x,
        y="cost",
        color=g,
        height=500,
        title=f"Relationship between {x} VS Cost based on {g}",
    )
    fig.show()

# Segmentation: Income

## Splitting groups by income
- (dollars)
- df_low: 10,000 ~ 50,000
- df_mid: 50,000 ~ 110,000
- df_high: 110,000 ~

### Ordinal Encoding (income)

In [18]:
# Ordinal encoding, because the income brackets have a natural order
# (1 = lowest bracket, 8 = highest bracket).
ore = ce.OrdinalEncoder(
    mapping=[
        {
            
            "col": "avg. yearly_income",
            # Explicit bracket label -> rank, so the codes follow income order.
            "mapping": {
                '$10K - $30K' : 1,
                '$30K - $50K' : 2,
                '$50K - $70K' : 3,
                '$70K - $90K' : 4,
                '$90K - $110K' : 5,
                '$110K - $130K' : 6,
                '$130K - $150K' : 7,
                '$150K +' : 8
            }, 
        }
    ]
)

# Store the encoded bracket rank next to the original label column.
# NOTE(review): labels that are absent from the mapping are presumably encoded
# with the encoder's unknown/missing sentinel — confirm against the
# category_encoders documentation.
df_raw['ordinal_encoded'] = ore.fit_transform(df_raw["avg. yearly_income"])

In [19]:
# Randomly assign each row a dollar amount within its income bracket, turning
# the bracket label into a continuous-looking value.
#
# Bracket code -> (low, high) bounds in dollars. random.randint includes both
# bounds, so a boundary value such as 30000 can come from either neighbour.
# The open-ended '$150K +' bracket (code 8) is capped at 200000.
income_ranges = {
    1: (10000, 30000),    # '$10K - $30K'
    2: (30000, 50000),    # '$30K - $50K'
    3: (50000, 70000),    # '$50K - $70K'
    4: (70000, 90000),    # '$70K - $90K'
    5: (90000, 110000),   # '$90K - $110K'
    6: (110000, 130000),  # '$110K - $130K'
    7: (130000, 150000),  # '$130K - $150K'
    8: (150000, 200000),  # '$150K +'
}

# Any code outside 1-7 falls back to the top bracket, as before.
# NOTE(review): no seed is set, so the sampled incomes differ on every run.
rand_income = [
    random.randint(*income_ranges.get(ord_num, income_ranges[8]))
    for ord_num in df_raw['ordinal_encoded']
]

In [20]:
# Attach the sampled incomes to the raw dataframe as a new column.
df_raw['rand_income'] = rand_income

In [21]:
# Income segments by ordinal bracket code:
#   low -> codes 1-2, mid -> codes 3-5, high -> codes 6-8
df_low = df_raw[df_raw['ordinal_encoded'].isin([1, 2])]
df_mid = df_raw[df_raw['ordinal_encoded'].isin([3, 4, 5])]
df_high = df_raw[df_raw['ordinal_encoded'].isin([6, 7, 8])]

In [22]:
# Row counts per income bracket code within the low-income group.
df_low['ordinal_encoded'].value_counts()

In [23]:
# Row counts per income bracket code within the mid-income group.
df_mid['ordinal_encoded'].value_counts()

In [24]:
# Row counts per income bracket code within the high-income group.
df_high['ordinal_encoded'].value_counts()

## Pie chart

### Low income

In [25]:
# Donut charts for the low-income group ('avg. yearly_income' is skipped for income groups).
draw_pie(df_low, cus_cols, "low")

### Mid Income

In [26]:
# Donut charts for the mid-income group ('avg. yearly_income' is skipped for income groups).
draw_pie(df_mid, cus_cols, "mid")

### High Income

In [27]:
# Donut charts for the high-income group ('avg. yearly_income' is skipped for income groups).
draw_pie(df_high, cus_cols, "high")

## Univariate analysis (Histograms)


### Low Income

In [28]:
# Histograms for the low-income group. Uses the shared helper so the titles
# carry the "low" label (the inline loop mislabelled them "(Male)") and the
# cell matches the mid- and high-income cells.
draw_histo(df_low, cus_cols, "low")

### Mid Income

In [29]:
# Histograms for the mid-income group ('avg. yearly_income' is skipped for income groups).
draw_histo(df_mid, cus_cols, "mid")

In [30]:
# NOTE(review): this plots the male histograms inside the income section and
# repeats the call already made in the gender section — likely a leftover
# cell; confirm whether it should be removed.
draw_histo(df_male, cus_cols, "male")

### High Income

In [31]:
# Histograms for the high-income group ('avg. yearly_income' is skipped for income groups).
draw_histo(df_high, cus_cols, "high")

## Multivariate Analysis

In [32]:
# Output folder for exported multivariate figures.
# NOTE(review): absolute, machine-specific path; in this notebook it is only
# referenced by commented-out fig.write_html(...) lines.
path = "/Users/chpyo/iCloud Drive (Archive)/Python/InformationSystem/Group Project/Multivariate_graphs/"

### Low Income

In [33]:
!pip install htmlmerge

In [34]:
# Low-income customers: scatter each listed attribute against cost, coloured
# by each grouping in turn. The figures are also collected in figs_low.
figs_low = []

for x in ['total_children', 'avg_cars_at home(approx)', 'num_children_at_home', 'avg_cars_at home(approx).1']:
    for g in ['gender', 'education', 'member_card', 'occupation']:
        title = "Relationship between " + x + " VS Cost based on " + g

        fig = px.scatter(
            data_frame=df_low,
            x=x,
            y="cost",
            title=title,
            color=g,
            height=500
        )
        # Plotly figures are rendered with fig.show(); plt.show() only renders
        # Matplotlib figures, so these plots were never displayed.
        fig.show()
        figs_low.append(fig)
        # Optional export: fig.write_html(path + title + " (low)" + '.html')

### Mid Income

In [35]:
# Mid-income customers: scatter each listed attribute against cost, once per
# colour grouping (attributes vary slowest, groupings fastest).
for x, g in itertools.product(
    ['total_children',
     'avg_cars_at home(approx)',
     'num_children_at_home',
     'avg_cars_at home(approx).1'],
    ['gender', 'education', 'member_card', 'occupation'],
):
    title = f"Relationship between {x} VS Cost based on {g}"

    fig = px.scatter(data_frame=df_mid, x=x, y="cost",
                     title=title, color=g, height=500)
    fig.show()
    # Optional export: fig.write_html(path + title + " (mid)" + '.html')

### High Income

In [36]:
# High-income customers: scatter each listed attribute against cost, once per
# colour grouping (attributes vary slowest, groupings fastest).
for x, g in itertools.product(
    ['total_children',
     'avg_cars_at home(approx)',
     'num_children_at_home',
     'avg_cars_at home(approx).1'],
    ['gender', 'education', 'member_card', 'occupation'],
):
    title = f"Relationship between {x} VS Cost based on {g}"

    fig = px.scatter(data_frame=df_high, x=x, y="cost",
                     title=title, color=g, height=500)
    fig.show()
    # Optional export: fig.write_html(path + title + " (high)" + '.html')