# Set Up

## Imports

In [None]:
# IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.lines as mlines
from matplotlib import gridspec
import plotly
import plotly.graph_objects as go
plotly.offline.init_notebook_mode(connected=True)
import holoviews as hv
hv.extension('bokeh')
from holoviews.plotting.util import process_cmap
import selenium
import tableone
from tableone import TableOne
import stargazer
import statsmodels.api as sm
import statsmodels.formula.api as smf

## Loading Data

In [None]:
# LOAD DATA
# Read the dataset from a CSV file resolved relative to the current working directory.
data = pd.read_csv('chengdu_dataset.csv')
# Print (n_rows, n_columns), then display the first rows as the cell output.
print(data.shape)
data.head()

# Data Cleaning

In [None]:
# Check data types: display the dtype pandas inferred for every column of `data`.
data.dtypes

In [None]:
def clean_i(old_str, new_str, i):
    '''Return `new_str` when `i` equals `old_str`; otherwise hand `i` back unchanged.'''
    return new_str if i == old_str else i
    
def replace_nan(i):
    '''Return 'Unspecified' for a missing scalar value, otherwise return the value unchanged.

    Parameters
    ----------
    i : scalar
        A single cell value (number, string, NaN, None, ...).

    Returns
    -------
    'Unspecified' if `i` is missing according to `pd.isna`, else `i`.
    '''
    # NaN never compares equal to anything (not even itself), so the previous
    # `i == np.nan` test was always False; pd.isna is the reliable check.
    if pd.isna(i):
        return 'Unspecified'
    return i

## Market/Parent Splits

In [None]:
# Classify every row into a market segment from the two indicator columns.
# The three conditions are mutually exclusive but not exhaustive.
conditions = [
    (data['online_dummy'] == 1) & (data['parent_dummy'] == 0),
    (data['online_dummy'] == 0) & (data['parent_dummy'] == 1),
    (data['online_dummy'] == 0) & (data['parent_dummy'] == 0),
    ]

# Label / numeric code assigned for each condition, in the same order as `conditions`
values_labels = ['online', 'offline_parents', 'offline_non_parents']
values_numeric = [0,1,2]

# Rows that match none of the conditions (e.g. online_dummy == 1 with parent_dummy == 1,
# or a missing dummy) used to fall through to np.select's implicit default of 0. That made
# them indistinguishable from 'online' in market_dummy and mixed an integer default into
# the string labels. Give them explicit sentinel values instead.
data['market_class'] = np.select(conditions, values_labels, default='unclassified')
print(data['market_class'].value_counts())

data['market_dummy'] = np.select(conditions, values_numeric, default=-1)
print(data['market_dummy'].value_counts())

## Status

In [None]:
def clean_status(x):
    '''Merge 'separated' and 'divorced' into 'Separated/Divorced'; capitalise any other status string.'''
    if x in ('separated', 'divorced'):
        return 'Separated/Divorced'
    return x.capitalize()

In [None]:
# Normalise own marital status in one pass:
# 'no answer' -> NaN -> 'Unspecified', then merge/capitalise the labels.
data['status_own'] = (
    data['status_own']
    .map(lambda raw_status: clean_i('no answer', np.nan, raw_status))
    .replace(np.nan, 'Unspecified')
    .map(lambda status: clean_status(status))
)

In [None]:
# Side-by-side counts of own marital status, split by the `female` indicator,
# for the online market (left) and the offline parents market (right).
fig, axes = plt.subplots(1, 2, figsize = (12,6))

status_panels = [('online', 'Online'), ('offline_parents', 'Offline (Parents)')]
for panel_idx, (panel_market, panel_title) in enumerate(status_panels):
    panel_rows = data[data['market_class']==panel_market]
    axes[panel_idx] = sns.countplot(x="status_own", data=panel_rows, hue = 'female', ax = axes[panel_idx])
    axes[panel_idx].set_title(panel_title)
    axes[panel_idx].set_ylabel('Count')

plt.tight_layout()
plt.show()

## Age

In [None]:
def return_diff(input_data, var):
    '''Return a copy of `input_data` with partner-minus-own differences for `var`.

    Two columns are added: `diff_min_{var}` (= `min{var}_partner` - `{var}_own`) and
    `diff_max_{var}` (= `max{var}_partner` - `{var}_own`). Where the partner bound is the
    string 'Unspecified', the difference is 'Unspecified' as well.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Must contain the columns `{var}_own`, `min{var}_partner` and `max{var}_partner`.
    var : str
        Variable stem used to build the column names (e.g. 'age', 'height').

    Returns
    -------
    pandas.DataFrame
        A new frame; `input_data` is not modified.
    '''
    data = input_data.copy()

    def _diff(partner_value, own_value):
        # Keep the 'Unspecified' marker; otherwise compute the numeric gap
        if partner_value != 'Unspecified':
            return float(partner_value) - float(own_value)
        return 'Unspecified'

    own_values = data[f'{var}_own']
    # Each column is built in one go and iterated by position rather than with
    # `.loc[i]` over range(len(data)). That works for any index (filtered, shuffled,
    # non-integer) and avoids writing strings cell-by-cell into a column that was
    # first created as float.
    data[f'diff_min_{var}'] = [
        _diff(partner, own) for partner, own in zip(data[f'min{var}_partner'], own_values)
    ]
    data[f'diff_max_{var}'] = [
        _diff(partner, own) for partner, own in zip(data[f'max{var}_partner'], own_values)
    ]
    return data

In [None]:
# Clean the partner age bounds. Each column has its own "no preference" placeholder
# ('any' for the minimum, 'no answer' for the maximum); both end up as 'Unspecified'.
for age_col, age_placeholder in (('minage_partner', 'any'), ('maxage_partner', 'no answer')):
    data[age_col] = (
        data[age_col]
        .map(lambda raw, placeholder=age_placeholder: clean_i(placeholder, np.nan, raw))
        .replace(np.nan, 'Unspecified')
    )

In [None]:
# Add diff_min_age / diff_max_age (partner bound minus own age) to the dataframe.
# return_diff works on a copy, so `data` is rebound to the returned frame.
data = return_diff(data, 'age')

In [None]:
# Box plots of own age by the `female` indicator: online (left) vs offline parents (right),
# both drawn on the same 18-60 year scale so the panels are comparable.
fig, axes = plt.subplots(1, 2)

age_panels = [('online', 'Online'), ('offline_parents', 'Offline (Parents)')]
for panel_idx, (panel_market, panel_title) in enumerate(age_panels):
    panel_rows = data[data['market_class']==panel_market]
    axes[panel_idx] = sns.boxplot(x="female", y="age_own", data=panel_rows, orient='v', ax = axes[panel_idx])
    axes[panel_idx].set_title(panel_title)
    axes[panel_idx].set_ylabel('Age (yrs)')
    axes[panel_idx].set_ylim(18, 60)

plt.tight_layout()
plt.show()

## Height

In [None]:
# Clean min height: missing minimum partner height becomes 'Unspecified'
data['minheight_partner'] = data['minheight_partner'].replace(np.nan, 'Unspecified')

# Clean max height: missing maximum partner height becomes 'Unspecified'
data['maxheight_partner'] = data['maxheight_partner'].replace(np.nan, 'Unspecified')

In [None]:
# Add diff_min_height / diff_max_height (partner bound minus own height) to the dataframe.
# return_diff works on a copy, so `data` is rebound to the returned frame.
data = return_diff(data, 'height')

In [None]:
# Box plots of own height by the `female` indicator: online (left) vs offline parents (right),
# both drawn on the same 140-210 cm scale so the panels are comparable.
fig, axes = plt.subplots(1, 2)

height_panels = [('online', 'Online'), ('offline_parents', 'Offline (Parents)')]
for panel_idx, (panel_market, panel_title) in enumerate(height_panels):
    panel_rows = data[data['market_class']==panel_market]
    axes[panel_idx] = sns.boxplot(x="female", y="height_own", data=panel_rows, orient='v', ax = axes[panel_idx])
    axes[panel_idx].set_title(panel_title)
    axes[panel_idx].set_ylabel('Height (cm)')
    axes[panel_idx].set_ylim(140, 210)

plt.tight_layout()
plt.show()

## Education

In [None]:
# Clean own education: 'no answer' -> NaN -> 'Unspecified'
data['educ_own_levels'] = (
    data['educ_own_levels']
    .map(lambda raw_level: clean_i('no answer', np.nan, raw_level))
    .replace(np.nan, 'Unspecified')
)

# Clean partner education: a level of 0 -> NaN -> 'Unspecified'
data['min_educ_partner_levels'] = (
    data['min_educ_partner_levels']
    .map(lambda raw_level: clean_i(0, np.nan, raw_level))
    .replace(np.nan, 'Unspecified')
)

In [None]:
# Check counts: frequency of each own-education level after cleaning (including 'Unspecified')
data['educ_own_levels'].value_counts()

In [None]:
def _classify_educ_diff(own_educ, partner_educ):
    '''Label how the minimum partner education compares with own education for one row.'''
    # A missing value (NaN/None) is treated like the explicit 'Unspecified' marker
    if own_educ == 'Unspecified' or pd.isna(own_educ):
        return 'Unspecified Own Educ'
    if partner_educ == 'Unspecified' or pd.isna(partner_educ):
        return 'Unspecified Partner Educ'

    own_level = float(own_educ)
    partner_level = float(partner_educ)
    if partner_level > own_level:
        return 'Specify Higher Partner Educ'
    if partner_level == own_level:
        return 'Specify Same Partner Educ'
    return 'Specify Lower Partner Educ'


def educ_diffs(input_data):
    '''Return a copy of `input_data` with a `diff_educ` label column.

    `diff_educ` compares `min_educ_partner_levels` with `educ_own_levels` and takes one of:
    'Specify Higher Partner Educ', 'Specify Same Partner Educ', 'Specify Lower Partner Educ',
    'Unspecified Partner Educ' or 'Unspecified Own Educ'.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Must contain `educ_own_levels` and `min_educ_partner_levels`; values are numeric
        levels (or numeric strings) or the string 'Unspecified'.

    Returns
    -------
    pandas.DataFrame
        A new frame; `input_data` is not modified.
    '''
    data = input_data.copy()
    # Every row is classified independently. The previous loop reused the label of the
    # preceding row whenever no branch matched (e.g. a NaN level), and it addressed rows
    # with `.loc[i]` over range(len(data)), which only works for a default 0..n-1 index.
    data['diff_educ'] = [
        _classify_educ_diff(own, partner)
        for own, partner in zip(data['educ_own_levels'], data['min_educ_partner_levels'])
    ]
    return data

In [None]:
# Calculate educ diffs in dataframe: adds the `diff_educ` label column.
# educ_diffs works on a copy, so `data` is rebound to the returned frame.
data = educ_diffs(data)

In [None]:
# Check counts: frequency of each `diff_educ` label
data['diff_educ'].value_counts()

In [None]:
def get_educ_cat(i, person):
    '''Return the string label for an education level, suffixed with `person`.

    Parameters
    ----------
    i : int, float, str
        Education level 1-5 (anything `int()` accepts) or the string 'Unspecified'.
    person : str
        Suffix appended to the label, e.g. 'Own' or 'Partner'.

    Returns
    -------
    str or None
        e.g. 'BA Own' / 'Unspecified Partner'; None when `i` is not a recognised level.
    '''
    level_names = {1: 'School', 2: 'Vocational', 3: 'BA', 4: 'MA', 5: 'PhD'}
    try:
        level = int(i)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are expected here ('Unspecified', NaN, None, inf).
        # A bare `except:` would also swallow KeyboardInterrupt and genuine bugs.
        level = None

    if level is not None:
        name = level_names.get(level)
        return f"{name} {person}" if name is not None else None
    if i == 'Unspecified':
        return f"Unspecified {person}"
    return None

In [None]:
# Translate the education levels into readable category labels for own and partner columns.
for cat_col, level_col, who in (
    ('educ_own_cats', 'educ_own_levels', 'Own'),
    ('educ_partner_cats', 'min_educ_partner_levels', 'Partner'),
):
    data[cat_col] = data[level_col].map(lambda level, who=who: get_educ_cat(level, who))

In [None]:
# Side-by-side counts of own education category, split by the `female` indicator,
# for the online market (left) and the offline parents market (right).
fig, axes = plt.subplots(1, 2, figsize = (16,10))

educ_panels = [('online', 'Online'), ('offline_parents', 'Offline (Parents)')]
for panel_idx, (panel_market, panel_title) in enumerate(educ_panels):
    panel_rows = data[data['market_class']==panel_market]
    axes[panel_idx] = sns.countplot(x="educ_own_cats", data=panel_rows, hue = 'female', ax = axes[panel_idx])
    axes[panel_idx].set_title(panel_title)
    axes[panel_idx].set_ylabel('Count')

plt.tight_layout()
plt.show()

# Output Graphs

## Height and Age Differences

In [None]:
def get_unspecified_counts(i):
    '''Classify a single cell value as 'Unspecified' or 'Specified'.

    A value is 'Unspecified' when it is the string 'Unspecified' or is missing
    (NaN/None); everything else is 'Specified'.
    '''
    # `i == np.nan` is always False because NaN is not equal to itself, so missing
    # values used to be counted as 'Specified'. pd.isna detects them reliably.
    if pd.isna(i) or i == 'Unspecified':
        return 'Unspecified'
    return 'Specified'

# For each value column, add a companion column flagging every row as
# 'Specified' or 'Unspecified' (source column -> flag column, paired by position).
original_columns = ['age_own', 'diff_min_age', 'diff_max_age', 'height_own','diff_min_height', 'diff_max_height']
count_columns = ['own_age_specified', 'min_age_specified', 'max_age_specified', 'own_height_specified','min_height_specified', 'max_height_specified']
for orig, count in zip(original_columns, count_columns):
    data[count] = data[orig].map(get_unspecified_counts)

# Quick look at the flag counts for own age, shown as a plain dict
s = data['own_age_specified'].value_counts()
s.to_dict()

In [None]:
def newline(p1, p2, color='black'):
    '''Add a grey line segment from `p1` to `p2` to the current axes and return it.

    `p1` and `p2` are (x, y) pairs; both are printed before drawing.
    NOTE(review): the `color` argument is accepted but not used; the segment is always grey.
    '''
    print(p1, p2)
    current_axes = plt.gca()
    xs = [p1[0], p2[0]]
    ys = [p1[1], p2[1]]
    segment = mlines.Line2D(xs, ys, color='grey')
    current_axes.add_line(segment)
    return segment


def plot_dumbell_and_bar(data, var, by_group, color_by_gender = False):
    '''Plot own vs. partner `var` per subgroup next to a bar chart of % unspecified, then save and show.

    Four subgroups are compared (male/female crossed with two market/parent combinations
    selected by `by_group`). The left panel shows, per subgroup, the median of `{var}_own`
    with its 25th/75th percentiles, and the partner bounds placed at
    median(own) + median(diff_min) and median(own) + median(diff_max). The right panel
    shows the percentage of rows flagged 'Unspecified' for own / min partner / max partner.

    Parameters
    ----------
    data : pandas.DataFrame
        Read columns: gender_own, online_dummy, parent_dummy, `{var}_own`,
        `diff_min_{var}`, `diff_max_{var}`, `own_{var}_specified`,
        `min_{var}_specified`, `max_{var}_specified`.
    var : str
        'age' or 'height' (the only keys of the x-axis label lookup).
    by_group : str
        'parents_vs_online' (online non-parents vs offline parents) or
        'parents_vs_nonparents' (offline non-parents vs offline parents).
        Any other value leaves `markets`/`parents` unassigned and the loop below fails.
    color_by_gender : bool
        If True, markers use the red/blue lists; otherwise black (own) and grey (partner).

    Side effects
    ------------
    Prints the labels, medians and quartiles; writes
    `Figs/diff_plot_{var}_{by_group}.png`; calls `plt.show()`. Returns None.
    '''
    fig = plt.figure(figsize=(12,5), dpi= 80)
    # Two columns: wide dumbbell panel (3) and narrow bar panel (1)
    gs = gridspec.GridSpec(1, 2, width_ratios=[3, 1]) 
    gs.update(wspace=0.025, hspace=0.05)
    ax0 = fig.add_subplot(gs[0])
    #ax = axes.ravel()
    
    # The three lists are zipped together: one (gender, market, parent) triple per subgroup
    genders = ['male', 'male', 'female', 'female']
    if by_group == 'parents_vs_online':
        markets = [1,0, 1,0]
        parents = [0, 1, 0, 1]
    elif by_group == 'parents_vs_nonparents':
        markets = [0,0,0,0]
        parents = [0, 1, 0,1]
        
    # Dummy value -> name fragment used to build the subgroup label keys
    market_dict = {1: 'online', 0:'offline'}
    parent_dict = {0: 'non-parent', 1: 'parent'}
    

    # Per-subgroup accumulators, filled in the same order as the zipped triples
    labels = []
    median_var = []
    max_var = []
    min_var = []
    spec_var = []
    unspec_var = []
    spec_min_var = []
    unspec_min_var = []
    spec_max_var = []
    unspec_max_var = []
    LQ = []
    UQ = []

    for gender, market, parent in zip(genders, markets, parents):
        # subset data
        subset_df = data[(data['gender_own']==gender) & (data['online_dummy']==market) & (data['parent_dummy']==parent)]
        #print(f"SUBSET: {gender, market, parent}")

        # Calculate Spec vs Unspecified counts for bar plot
        # OWN VAR
        counts = subset_df[f'own_{var}_specified'].value_counts()
        count_dict = counts.to_dict()
        #print(f'Own Var: {count_dict}')
    
        # A category absent from value_counts contributes a count of 0
        if 'Unspecified' in count_dict.keys():
            unspec_var.append(count_dict['Unspecified'])
        else:
            unspec_var.append(0)
        
        if 'Specified' in count_dict.keys():
            spec_var.append(count_dict['Specified'])
        else:
            spec_var.append(0)
        
        # MIN VAR
        counts = subset_df[f'min_{var}_specified'].value_counts()
        count_dict = counts.to_dict()
        #print(f'Min Var: {count_dict}')
        
        if 'Unspecified' in count_dict.keys():
            unspec_min_var.append(count_dict['Unspecified'])
        else:
            unspec_min_var.append(0)
        
        if 'Specified' in count_dict.keys():
            spec_min_var.append(count_dict['Specified'])
        else:
            spec_min_var.append(0)
        
        # MAX VAR
        counts = subset_df[f'max_{var}_specified'].value_counts()
        count_dict = counts.to_dict()
        #print(f'Max Var: {count_dict}')
        
        if 'Unspecified' in count_dict.keys():
            unspec_max_var.append(count_dict['Unspecified'])
        else:
            unspec_max_var.append(0)
        
        if 'Specified' in count_dict.keys():
            spec_max_var.append(count_dict['Specified'])
        else:
            spec_max_var.append(0)

        
        
        # Calculate medians and IQR for dumbell plot
        # 'Unspecified' markers become NaN so they are skipped by median / nanpercentile
        subset_df = subset_df.replace('Unspecified', np.nan)
        labels.append(f'{gender}_{parent_dict[parent]}_{market_dict[market]}')
        median_var.append(subset_df[f'{var}_own'].median())
        # Partner bounds are positioned relative to the subgroup's median own value
        max_var.append(subset_df[f'{var}_own'].median() + subset_df[f'diff_max_{var}'].median())
        min_var.append(subset_df[f'{var}_own'].median() + subset_df[f'diff_min_{var}'].median())
        LQ.append(np.nanpercentile(subset_df[f'{var}_own'], 25))
        UQ.append(np.nanpercentile(subset_df[f'{var}_own'], 75)) 

    
    # Color set up
    # NOTE(review): col1, col2 and color_dict are assigned but never read in this function
    col1 = 'red'
    col2 = 'blue'
    color_dict = {'female_non-parent_online': ['red', 'blue'], 
                  'female_parent_offline':['red', 'blue'],
                  'female_non-parent_offline': ['red', 'blue'],
                  'male_non-parent_online':['blue', 'red'], 
                  'male_parent_offline': ['blue', 'red'],
                 'male_non-parent_offline': ['blue', 'red']}
    # Positional colours: entries line up with the subgroup order (male, male, female, female)
    own_color_list = ['red', 'red', 'blue', 'blue']
    partner_color_list = ['blue', 'blue', 'red', 'red']
    
    # Label set up
    #labels = sorted(labels)
    print(f'LABELS: {labels}')
    print(f'MEDIAN: {median_var}')
    print(f'LQ: {LQ}')
    print(f'UQ: {UQ}')
    y_pos = np.arange(len(labels))
    # Internal label key -> tick label shown on the y-axis
    label_dict = {'female_non-parent_online': 'Female Online', 
                  'female_parent_offline': 'Female Offline (Parents)', 
                  'female_non-parent_offline': 'Female Offline (Non-Parents)',
                  'male_non-parent_online': 'Male Online', 
                  'male_parent_offline': 'Male Offline (Parents)',
                 'male_non-parent_offline': 'Male Offline (Non-Parents)'}
    
    # Own variable
    if color_by_gender == True:
        color_own = own_color_list
        color_partner = partner_color_list
    else:
        color_own = ['black'] * len(labels)
        color_partner = ['grey'] * len(labels)
    # Own values sit slightly above the tick (y + 0.1), partner values slightly below (y - 0.1)
    ax0.scatter(y=y_pos+0.1, x=median_var, s=100, color=color_own, alpha=1, label=f'Own {var}', marker = 'o')
    ax0.scatter(y=y_pos+0.1, x=LQ, s=100, color=color_own, alpha=1, marker = "4")
    ax0.scatter(y=y_pos+0.1, x=UQ, s=100, color=color_own, alpha=1, marker = "3")
    
    # Partner variable
    ax0.scatter(y=y_pos-0.1, x=min_var, s=100, color=color_partner, alpha=1, label=f'Min partner {var}', marker = "<")
    ax0.scatter(y=y_pos-0.1, x=max_var, s=100, color=color_partner, alpha=1, label=f'Max partner {var}', marker = ">")

    # Plot line
    # Connect min and max partner markers for each subgroup
    for j, p1, p2, col in zip(y_pos, min_var, max_var, color_partner):
        p1 = [p1, j-0.1]
        p2 = [p2, j-0.1]
        l = mlines.Line2D([p1[0],p2[0]], [p1[1],p2[1]], color=col, alpha = 0.5)
        ax0.add_line(l)
        
    # Connect lower and upper quartile markers of the own variable
    for j, p1, p2, col in zip(y_pos, LQ, UQ, color_own):
        p1 = [p1, j+0.1]
        p2 = [p2, j+0.1]
        l = mlines.Line2D([p1[0],p2[0]], [p1[1],p2[1]], color=col)
        ax0.add_line(l)

    # Set axes
    ax0.set_yticks(y_pos)
    ax0.set_yticklabels([label_dict[label] for label in labels], fontsize = 14) #
    x_labels_dict = {'age': 'Age (years)', 'height': 'Height (cm)'}
    ax0.set_xlabel(x_labels_dict[var], size = 14)
    ax0.tick_params(axis="x", labelsize=14)
    # NOTE(review): title_dict is built but never applied to the axes
    title_dict = {'height': 'Height Preferences by Gender and Market', 'age': 'Age Preferences by Gender and Market'}
    ax0.grid(False)
    
    # Set legend
    # Figure-level legend for the dumbbell markers
    handles, leg_labels = ax0.get_legend_handles_labels()
    #ax0.get_legend().remove()
    fig.legend(handles, leg_labels, fontsize = 14, loc = 'upper center', ncol = 1, bbox_to_anchor= (0.5, 1.1))
    
    # Add unspecified bar plot
    ax1 = fig.add_subplot(gs[1], sharey = ax0)
    
    
    # Each bar is unspecified / (unspecified + specified) * 100 per subgroup.
    # NOTE(review): the own-variable bar is drawn with height = 0, i.e. zero thickness; confirm this is intentional.
    # NOTE(review): a subgroup with no rows gives 0/0 here; confirm subgroups are never empty.
    ax1.barh(y_pos, (np.array(unspec_var)/(np.array(unspec_var)+np.array(spec_var))*100), align = 'edge', height = 0, color='black', label = f'Own {var}')
    ax1.barh(y_pos, (np.array(unspec_min_var)/(np.array(unspec_min_var)+np.array(spec_min_var))*100), align = 'edge', height = 0.3, color='grey', edgecolor='black', label = f'Min partner {var}')
    ax1.barh(y_pos, (np.array(unspec_max_var)/(np.array(unspec_max_var)+np.array(spec_max_var))*100), align = 'edge', height = -0.3, color='lightgrey', edgecolor='black',  label = f'Max partner {var}')
    ax1.set_yticks(y_pos)
    
    # Set axes
    ax1.tick_params(axis="x", labelsize=14)
    # y tick labels are hidden on the bar panel (size 0, white); it shares the y-axis with ax0
    ax1.tick_params(axis="y", labelsize = 0, colors = 'white')
    ax1.set_xlabel("% Unspecified", size = 14)
    ax1.grid(False)
    
    # Set legend
    # Note: `labels` is rebound here from the subgroup keys to the bar legend labels.
    handles, labels = ax1.get_legend_handles_labels()
    #ax0.get_legend().remove()
    # The bar legend is attached to ax0 and positioned via bbox_to_anchor
    ax0.legend(handles, labels, fontsize = 14, loc = 'upper center', ncol = 1, bbox_to_anchor= (1.18, 1.29))
    
    # Output path is relative; assumes a Figs/ directory exists in the working directory (TODO confirm)
    fig.savefig(f'Figs/diff_plot_{var}_{by_group}.png', format='png', dpi=900, bbox_inches='tight')
    plt.show()

In [None]:
# Age: online non-parents vs offline parents, by gender (black/grey markers)
plot_dumbell_and_bar(data, 'age', 'parents_vs_online', color_by_gender = False)

In [None]:
# Height: online non-parents vs offline parents, by gender
plot_dumbell_and_bar(data, 'height', 'parents_vs_online')

In [None]:
# Age: offline non-parents vs offline parents, by gender
plot_dumbell_and_bar(data, 'age', 'parents_vs_nonparents')

In [None]:
# Height: offline non-parents vs offline parents, by gender
plot_dumbell_and_bar(data, 'height', 'parents_vs_nonparents')

## Education (Sankeys)

In [None]:
def get_sort_value(i, person):
    '''Return the integer rank (0-5) of an education label such as 'BA Own'.

    Ranks follow Unspecified < School < Vocational < BA < MA < PhD for the given
    `person` suffix; labels that match none of these yield None.
    '''
    ordered_levels = ('Unspecified', 'School', 'Vocational', 'BA', 'MA', 'PhD')
    for rank, level in enumerate(ordered_levels):
        if i == f'{level} {person}':
            return rank
    return None

In [None]:
def make_sankey_data(input_data, gender, market, parent, title):
    '''Subset the data to one target group and count own/partner education pairs for a Sankey diagram.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Must contain gender_own, online_dummy, parent_dummy, educ_own_levels and
        min_educ_partner_levels.
    gender : str
        Value matched against `gender_own`.
    market : str
        'online' or 'offline' (mapped to online_dummy 1 / 0).
    parent : str
        'non-parent' or 'parent' (mapped to parent_dummy 0 / 1).
    title : str
        Passed through unchanged so callers can unpack `(group_data, title)`.

    Returns
    -------
    (pandas.DataFrame, str)
        One row per (educ_own_cats, educ_partner_cats) pair with its `count`, plus
        `sort_value_own` (0-5) and `sort_value_partner` (6-11), sorted by
        `sort_value_own`; and the title.
    '''
    market_dict = {'online':1, 'offline':0}
    parent_dict = {'non-parent':0, 'parent':1}

    # Subset dataframe
    in_group = (
        (input_data['gender_own']==gender)
        & (input_data['online_dummy']==market_dict[market])
        & (input_data['parent_dummy']==parent_dict[parent])
    )
    # .copy() gives an independent frame: the category columns were previously assigned
    # onto a filtered slice of `input_data`, which triggers SettingWithCopyWarning.
    data = input_data[in_group].copy()
    data['educ_own_cats'] = data['educ_own_levels'].map(lambda x: get_educ_cat(x, 'Own'))
    data['educ_partner_cats'] = data['min_educ_partner_levels'].map(lambda x: get_educ_cat(x, 'Partner'))
    educ_data = data[['educ_own_levels', 'educ_own_cats', 'educ_partner_cats']]

    # Group data: one row per own/partner category pair with its frequency
    group_data = educ_data.groupby(['educ_own_cats', 'educ_partner_cats']).size().reset_index().rename(columns={0:'count'})
    group_data = group_data.sort_values(by = 'educ_own_cats')
    group_data['sort_value_own'] = group_data['educ_own_cats'].map(lambda x: get_sort_value(x, 'Own'))
    # Partner ranks are offset by 6 so own (0-5) and partner (6-11) values never overlap
    group_data['sort_value_partner'] = group_data['educ_partner_cats'].map(lambda x: get_sort_value(x, 'Partner')+6)
    group_data = group_data.sort_values(by = 'sort_value_own')

    return group_data, title


In [None]:
# Make Sankey Diagrams (holoviews version) for offline male parents
group_data, title = make_sankey_data(data, 'male', 'offline', 'parent', 'Male Parents Offline')
sankey1 = hv.Sankey(group_data, kdims=["educ_own_cats", "educ_partner_cats"], vdims=["count"])

# Own and partner nodes of the same education level share one grey shade;
# the number is the index into the "gray" colour list for that level.
cmap_list = process_cmap("gray")
shade_index = {"PhD": 0, "MA": 1, "BA": 2, "Vocational": 3, "School": 5, "Unspecified": 4}
cmap = {
    f"{level} {person}": cmap_list[shade]
    for level, shade in shade_index.items()
    for person in ("Own", "Partner")
}

sankey1.opts(
    label_position='outer',
    edge_line_width=0,
    edge_color='educ_own_cats',
    cmap = cmap,
    node_alpha=1.0,
    node_width=40,
    node_sort=True,
    width=900,
    height=600,
    bgcolor="white",
    title=f"Relation between Own and Partner Education [{title}]",
)

In [None]:
def make_sankey_plotly(group_data, title):
    '''Draw a plotly Sankey diagram from own-education nodes to partner-education nodes.

    `group_data` must provide the columns educ_own_cats, educ_partner_cats,
    sort_value_own, sort_value_partner and count. The sort values are used as the
    link source/target node indices into the fixed 12-entry node label list.
    Prints the distinct labels present in `group_data` and shows the figure; returns None.
    '''
    # Distinct category labels present in this subset (printed for inspection only)
    labels = list(set(group_data['educ_own_cats'].to_list() + group_data['educ_partner_cats'].to_list()))
    print(labels)

    # Node order: six 'Own' levels (indices 0-5) followed by six 'Partner' levels (6-11)
    level_names = ('Unspecified', 'School', 'Vocational', 'BA', 'MA', 'PhD')
    node_labels = [f'{level} {person}' for person in ('Own', 'Partner') for level in level_names]

    node = {
        "label": node_labels,
        "x": [0] * 6 + [1] * 6,
        "y": [6, 5, 4, 3, 2, 1, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
        'pad': 10,  # 10 Pixels
        'color': "grey",
    }
    link = {
        "source": group_data['sort_value_own'].to_list(),
        "target": group_data['sort_value_partner'].to_list(),
        "value": group_data['count'].to_list(),
    }
    fig = go.Figure(go.Sankey(arrangement = "snap", node = node, link = link))

    page_margin = dict(l=50, r=50, b=100, t=100, pad=4)
    fig.update_layout(
        title = title,
        autosize=False,
        width=600,
        height=600,
        margin=page_margin,
        paper_bgcolor="White",
    )

    #fig.write_image(f"Figs/{title}_Sankey.png", format="png", width=500, height=500, scale=4)

    fig.show()

In [None]:
# Males Offline: education Sankey for male parents in the offline market
group_data, title = make_sankey_data(data, 'male', 'offline', 'parent', 'Male Parents Offline')
make_sankey_plotly(group_data, title)

In [None]:
# Females Offline: education Sankey for female parents in the offline market
group_data, title = make_sankey_data(data, 'female', 'offline', 'parent', 'Female Parents Offline')
make_sankey_plotly(group_data, title)

In [None]:
# Males Online: education Sankey for male non-parents in the online market
group_data, title = make_sankey_data(data, 'male', 'online', 'non-parent', 'Male Online')
make_sankey_plotly(group_data, title)

In [None]:
# Females Online: education Sankey for female non-parents in the online market
group_data, title = make_sankey_data(data, 'female', 'online', 'non-parent', 'Female Online')
make_sankey_plotly(group_data, title)

# Summary Tables

In [None]:
# Inspect the columns accumulated so far and preview the first rows
print(data.columns)
data.head()

In [None]:
# Export data
# Turn every 'Unspecified' placeholder back into NaN before writing the CSV.
# Note: this rebinds `data`, so every later cell sees NaN where 'Unspecified' used to be.
data = data.replace('Unspecified', np.nan)
# Written relative to the current working directory
data.to_csv('data_analyzed.csv')

## Across Gender, Within Market

In [None]:
def gender_comparison_table(input_df, market, own = True):
    '''Build a TableOne summary comparing genders within a single market class.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain `market_class`, `gender_own` and the columns selected by `own`.
    market : str
        'online', 'offline_parents' or 'offline_non_parents'.
    own : bool
        True summarises own characteristics (age, height, status, education);
        False summarises the partner-preference differences.

    Returns
    -------
    TableOne
        Table grouped by gender with p-values.

    Raises
    ------
    ValueError
        If `market` is not one of the three market classes.
    '''
    valid_markets = ('online', 'offline_parents', 'offline_non_parents')
    if market not in valid_markets:
        # Previously an unknown market left `subset_df` unassigned and failed later
        raise ValueError(f"market must be one of {valid_markets}, got {market!r}")
    subset_df = input_df[input_df['market_class']==market]

    if own:
        cols = ['gender_own', 'age_own', 'height_own','status_own' , 'educ_own_cats']
        rename_dict = {'gender_own':'Gender', 'age_own': 'Age (years)', 'height_own':'Height (cm)', 'status_own': 'Status', 'educ_own_cats': 'Education'}
        categorical = ['Status', 'Education']
    else:
        cols = ['gender_own','diff_min_age','diff_max_age', 'diff_min_height', 'diff_max_height', 'diff_educ']
        rename_dict = {'gender_own':'Gender', 'diff_min_age': 'Diff Min Age (years)', 'diff_max_age': 'Diff Max Age (years)', 
                   'diff_max_height': 'Diff Max Height (cm)','diff_min_height': 'Diff Min Height (cm)', 'diff_educ': 'Diff Education'}
        categorical = ['Diff Education']
    table_df = subset_df[cols].rename(columns = rename_dict)
    # 'Unspecified' markers become NaN so they are treated as missing in the table
    table_df = table_df.replace('Unspecified', np.nan)

    gender_dict = {'female':'Female', 'male': 'Male'}
    table_df['Gender'] = table_df['Gender'].map(lambda x: gender_dict[x])
    # Pass a real list rather than a dict view
    columns = list(rename_dict.values())
    groupby = 'Gender'
    mytable = TableOne(table_df, columns=columns, categorical=categorical,
                       groupby=groupby, pval=True)

    print(f'Summary table for gender by {market}')

    return mytable

### Own Variables

In [None]:
# Table 1A: own characteristics by gender, online market; exported to Excel and displayed.
# The relative path assumes a Tables/ directory exists (TODO confirm).
table1A = gender_comparison_table(data, 'online')
table1A.to_excel('Tables/online_by_gender_own.xlsx')
table1A

In [None]:
# Table 1B: own characteristics by gender, offline parents market; exported to Excel and displayed.
table1B = gender_comparison_table(data, 'offline_parents')
table1B.to_excel('Tables/offline_parents_by_gender_own.xlsx')
table1B

In [None]:
# Table 1C: own characteristics by gender, offline non-parents market; exported to Excel and displayed.
table1C = gender_comparison_table(data, 'offline_non_parents')
table1C.to_excel('Tables/offline_non_parents_by_gender_own.xlsx')
table1C

### Partner Variables

In [None]:
# Table 2A: partner-preference differences by gender, online market; exported to Excel and displayed.
table2A = gender_comparison_table(data, 'online', own = False)
table2A.to_excel('Tables/online_by_gender_partner.xlsx')
table2A

In [None]:
# Table 2B: partner-preference differences by gender, offline parents market; exported to Excel and displayed.
table2B = gender_comparison_table(data, 'offline_parents', own = False)
table2B.to_excel('Tables/offline_parents_by_gender_partner.xlsx')
table2B

In [None]:
# Table 2C: partner-preference differences by gender, offline non-parents market; exported to Excel and displayed.
table2C = gender_comparison_table(data, 'offline_non_parents', own = False)
table2C.to_excel('Tables/offline_non_parents_by_gender_partner.xlsx')
table2C

## Across Market, Within Gender

In [None]:
def make_summary_table_diffs(input_df, subset_col, subset_cat):
    '''Build a TableOne summary of the partner-preference differences by gender.

    Only rows where `input_df[subset_col] == subset_cat` are used. 'Unspecified'
    markers are converted to NaN before the table is built. Prints a short caption
    and returns the TableOne object.
    '''
    rename_dict = {'gender_own':'Gender', 'diff_min_age': 'Diff Min Age (years)', 'diff_max_age': 'Diff Max Age (years)', 
                   'diff_max_height': 'Diff Max Height (cm)','diff_min_height': 'Diff Min Height (cm)', 'diff_educ': 'Diff Education'}
    cols = ['gender_own', 'diff_min_age', 'diff_max_age', 'diff_min_height', 'diff_max_height', 'diff_educ']

    # Filter rows, keep the difference columns, apply display names, blank out placeholders
    matching_rows = input_df[input_df[subset_col]==subset_cat]
    table_df = (
        matching_rows[cols]
        .rename(columns = rename_dict)
        .replace('Unspecified', np.nan)
    )

    gender_dict = {'female':'Female', 'male': 'Male'}
    table_df['Gender'] = table_df['Gender'].map(lambda x: gender_dict[x])

    mytable = TableOne(
        table_df,
        columns=rename_dict.values(),
        categorical=['Diff Education'],
        groupby='Gender',
        pval=True,
    )

    print(f'Summary table for gender by {subset_cat}')

    return mytable

In [None]:
def market_comparison_table(input_df, gender, comparison, own = True):
    '''Build a TableOne summary comparing two market classes within one gender.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain `market_class`, `female` and the columns selected by `own`.
    gender : str
        'female' (rows with female == 1) or 'male' (rows with female == 0).
    comparison : str
        'online_offline_parents' or 'offline_parents_non_parents'.
    own : bool
        True summarises own characteristics; False summarises the
        partner-preference differences.

    Returns
    -------
    TableOne
        Table grouped by platform with p-values.

    Raises
    ------
    ValueError
        If `comparison` or `gender` is not one of the supported values.
    '''
    comparison_classes = {
        'online_offline_parents': ['online', 'offline_parents'],
        'offline_parents_non_parents': ['offline_parents', 'offline_non_parents'],
    }
    if comparison not in comparison_classes:
        raise ValueError(f"comparison must be one of {list(comparison_classes)}, got {comparison!r}")
    # Select the two classes explicitly. Excluding only the third class would let rows
    # with any other market_class value through, and those then fail in the
    # platform-name lookup below.
    subset_df = input_df[input_df['market_class'].isin(comparison_classes[comparison])]

    female_flag = {'female': 1, 'male': 0}
    if gender not in female_flag:
        raise ValueError(f"gender must be one of {list(female_flag)}, got {gender!r}")
    gender_df = subset_df[subset_df['female']==female_flag[gender]]

    if own:
        cols = ['market_class', 'age_own', 'height_own','status_own' , 'educ_own_cats']
        rename_dict = {'market_class':'Platform', 'age_own': 'Age (years)', 'height_own':'Height (cm)', 'status_own': 'Status', 'educ_own_cats': 'Education'}
        categorical = ['Status', 'Education']
    else:
        cols = ['market_class','diff_min_age','diff_max_age', 'diff_min_height', 'diff_max_height', 'diff_educ']
        rename_dict = {'market_class':'Platform', 'diff_min_age': 'Diff Min Age (years)', 'diff_max_age': 'Diff Max Age (years)', 
                   'diff_max_height': 'Diff Max Height (cm)','diff_min_height': 'Diff Min Height (cm)', 'diff_educ': 'Diff Education'}
        categorical = ['Diff Education']
    table_df = gender_df[cols].rename(columns = rename_dict)
    # 'Unspecified' markers become NaN so they are treated as missing in the table
    table_df = table_df.replace('Unspecified', np.nan)

    platform_dict = {'online':'Online', 'offline_parents': 'Offline Parents', 'offline_non_parents': 'Offline Non-Parents'}
    table_df['Platform'] = table_df['Platform'].map(lambda x: platform_dict[x])
    # Pass a real list rather than a dict view
    columns = list(rename_dict.values())
    groupby = 'Platform'
    mytable = TableOne(table_df, columns=columns, categorical=categorical,
                       groupby=groupby, pval=True)

    print(f'Summary table for platforms by {gender}')

    return mytable

### Own Variables

In [None]:
# Table 3A: women's own characteristics, online vs offline parents; exported to Excel and displayed.
# The relative path assumes a Tables/ directory exists (TODO confirm).
table3A = market_comparison_table(data, 'female', 'online_offline_parents')
table3A.to_excel('Tables/female_by_market_own_(online_vs_offline_parents).xlsx')
table3A

In [None]:
# Table 3B: men's own characteristics, online vs offline parents; exported to Excel and displayed.
table3B = market_comparison_table(data, 'male', 'online_offline_parents')
table3B.to_excel('Tables/male_by_market_own_(online_vs_offline_parents).xlsx')
table3B

In [None]:
# Table 3C: women's own characteristics, offline parents vs offline non-parents; exported to Excel and displayed.
table3C = market_comparison_table(data, 'female', 'offline_parents_non_parents')
table3C.to_excel('Tables/female_by_market_own_(offline_parents_vs_non-parents).xlsx')
table3C

In [None]:
# Table 3D: men's own characteristics, offline parents vs offline non-parents; exported to Excel and displayed.
table3D = market_comparison_table(data, 'male', 'offline_parents_non_parents')
table3D.to_excel('Tables/male_by_market_own_(offline_parents_vs_non-parents).xlsx')
table3D

### Partner Variables

In [None]:
# Table 4A: women's partner-preference differences, online vs offline parents; exported to Excel and displayed.
table4A = market_comparison_table(data, 'female', 'online_offline_parents', own = False)
table4A.to_excel('Tables/female_by_market_partner_(online_vs_offline_parents).xlsx')
table4A

In [None]:
# Table 4B: men's partner-preference differences, online vs offline parents; exported to Excel and displayed.
table4B = market_comparison_table(data, 'male', 'online_offline_parents', own = False)
table4B.to_excel('Tables/male_by_market_partner_(online_vs_offline_parents).xlsx')
table4B

In [None]:
# Table 4C: women's partner-preference differences, offline parents vs offline non-parents; exported to Excel and displayed.
table4C = market_comparison_table(data, 'female', 'offline_parents_non_parents', own = False)
table4C.to_excel('Tables/female_by_market_partner_(offline_parents_vs_non-parents).xlsx')
table4C

In [None]:
# Table 4D: men's partner-preference differences, offline parents vs offline non-parents; exported to Excel and displayed.
table4D = market_comparison_table(data, 'male', 'offline_parents_non_parents', own = False)
table4D.to_excel('Tables/male_by_market_partner_(offline_parents_vs_non-parents).xlsx')
table4D

# Regression

In [None]:
# Number of rows in each market class, as a check before building the regression sample
data['market_class'].value_counts()

In [None]:
# Regression sample: online and offline-parents rows only.
# .copy() makes reg_data an independent frame, so adding a column below does not
# write into a filtered slice of `data` (SettingWithCopyWarning).
reg_data = data[(data['market_class']=='online')|(data['market_class']=='offline_parents')].copy()

def get_offline_dummy(x):
    '''Return 0 for 'online' and 1 for 'offline_parents'; None for anything else.'''
    if x == 'online':
        return 0
    elif x =='offline_parents':
        return 1
    
reg_data['offline_parent_dummy'] = reg_data['market_class'].map(get_offline_dummy)


In [None]:
# Replace remaining 'Unspecified' markers with NaN
reg_data = reg_data.replace('Unspecified', np.nan)

# The diff columns were built as mixed float/'Unspecified' columns, so they can still be
# object dtype after the replace (recent pandas no longer downcasts there). Convert them
# explicitly so the subtraction and the regressions below work on numeric data.
diff_columns = ['diff_min_age', 'diff_max_age', 'diff_min_height', 'diff_max_height']
reg_data[diff_columns] = reg_data[diff_columns].apply(pd.to_numeric)

# Width of the acceptable partner range = max difference - min difference
reg_data['Range_Age'] = reg_data['diff_max_age'] - reg_data['diff_min_age']
reg_data['Range_Height'] = reg_data['diff_max_height'] - reg_data['diff_min_height']


In [None]:
def binary_educ(x):
    '''Return 1 when `x` is 'BA Partner', 'MA Partner' or 'PhD Partner'; otherwise 0.'''
    degree_labels = ('BA Partner', 'MA Partner', 'PhD Partner')
    return 1 if x in degree_labels else 0
    
# 1 if the minimum partner education is BA, MA or PhD; 0 otherwise (including unspecified)
reg_data['binary_educ'] = reg_data['educ_partner_cats'].map(lambda x: binary_educ(x))

In [None]:
# Inspect the regression sample: list its columns, then display the frame
print(reg_data.columns)
reg_data

In [None]:
# Fit one model per outcome with the same right-hand side:
# OLS for the numeric outcomes, logit for the binary education outcome.
# Fitted results are collected in `models` in this order.
numeric_dep_vars = ['diff_max_height', 'diff_min_height', 'Range_Height', 'diff_min_age', 'diff_max_age', 'Range_Age']
categorical_dep_vars = ['binary_educ']
rhs_terms = "female + offline_parent_dummy + female*offline_parent_dummy"

models = []
for dep_var in numeric_dep_vars:
    ols_model = smf.ols(formula = f"{dep_var} ~ {rhs_terms}", data = reg_data)
    res = ols_model.fit()
    models.append(res)

for dep_var in categorical_dep_vars:
    logit = smf.logit(formula = f"{dep_var} ~ {rhs_terms}", data = reg_data)
    res = logit.fit(maxiter=10000)
    models.append(res)
    print(res.summary())

In [None]:
from stargazer.stargazer import Stargazer

# Render all fitted models as one LaTeX regression table
stargazer = Stargazer(models)
stargazer.significant_digits(2)
stargazer.covariate_order(['Intercept', 'female', 'offline_parent_dummy', 'female:offline_parent_dummy'])
stargazer.rename_covariates({'female': 'Female', 'offline_parent_dummy':'Offline (parents)','female:offline_parent_dummy': 'Female*Offline(Parents)'})
# Column labels must follow the order in which the models were fitted:
# diff_max_height, diff_min_height, Range_Height, diff_min_age, diff_max_age, Range_Age, binary_educ.
# The first two labels were previously swapped ('Min Height' shown over the max-height model).
# NOTE(review): binary_educ is 1 for BA, MA and PhD, so 'Specify > BA' may be better read as "BA or above".
stargazer.custom_columns(['Max Height', 'Min Height', 'Range Height', 'Min Age', 'Max Age', 'Range Age', 'Specify > BA'], [1,1,1,1,1,1,1])
stargazer.render_latex()