In [1]:
import os
import pandas as pd
import numpy as np
import datetime

import requests
from bs4 import BeautifulSoup
import time

#import sweetviz as sv
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

from scipy import stats
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
def ab_test (raw, x, y):
    if raw[y].dtype in [object, bool]:
        if raw[y].dtype == object:
            raw[y] = raw[y].map(lambda x: 1 if (str(x).lower() == 'true') or (str(x).lower() == 'yes') else 0)
        else:
            raw[y] = raw[y].astype(int)

    df = raw[[x, y]].copy()
    tab = df.groupby(x).agg(['mean', 'sem']).round(2) # 평균, 표준오차
    tab.columns = ['mean', 'sem']
    tab['CI_lower'] = tab.apply(lambda x: x['mean'] - 2 * x['sem'], axis=1) # 95% 신뢰구간
    tab['CI_upper'] = tab.apply(lambda x: x['mean'] + 2 * x['sem'], axis=1)

    t_stat, p_value = stats.ttest_ind(df.loc[df[x] == tab.index[0], y], 
                                      df.loc[df[x] == tab.index[1], y], equal_var=True)
    
    res = {
            'pre_treat': tab.iloc[0]['mean'],
            'post_treat': tab.iloc[1]['mean'],
            'change_coef': round(tab.iloc[1]['mean'] - tab.iloc[0]['mean'], 2),
            'change_perc': round((tab.iloc[1]['mean'] - tab.iloc[0]['mean']) / tab.iloc[0]['mean'] * 100, 2),
            't_stat': t_stat.round(4),
            'p_value': p_value.round(4)
    }

    return res

In [3]:
#ab_test(raw, 'Group', 'Time Spent')

In [11]:
def did (raw, x, y, fix, treat_group, control_group, dt, start_dt, treat_dt, end_dt):
    try:
        raw[dt] = raw[dt].map(lambda x: x.date())
    except:
        pass
    df = raw.loc[(raw[dt] >= start_dt) & 
             (raw[dt] <= end_dt), [x, y, fix, dt]].copy()
    df['treated'] = df[x].map(lambda x: 1 if x == treat_group else 0)
    df['post'] = df[dt].map(lambda x: 1 if x >= treat_dt else 0)

    tab = pd.DataFrame(df.groupby(['treated', 'post'])[y].mean())
    model = smf.ols(f'{y} ~ treated:post + C({x}) + C({fix}) + C({dt})', data=df).fit()
    #model.summary()
    #model.conf_int().loc['treated:post'] # 95% 신뢰구간
    
    res = {
            'pre_treat': tab.loc[1, 0][y].round(1),
            'post_treat': tab.loc[1, 1][y].round(1),
            'change_value':tab.loc[1, 1][y].round(1) - tab.loc[1, 0][y].round(1),
            'change_coef': model.params['treated:post'].round(2),
            'change_perc': round(model.params['treated:post']/tab.loc[1, 0][y]*100, 2),
            'p_value': model.pvalues['treated:post'].round(2)
    }

    return res

In [39]:
def rd (raw, x, y, fix, treat_group, dt, start_dt, treat_dt, end_dt):
    try:
        raw[dt] = raw[dt].map(lambda x: x.date())
    except:
        pass
    df = raw.loc[(raw[dt] >= start_dt) & 
                 (raw[dt] <= end_dt) & 
                 (raw[x] == treat_group), [y, fix, dt]].copy()
    
    df['treated'] = (df[dt] >= treat_dt).astype(int)
    df['diff'] = (df[dt] - treat_dt).map(lambda x: x.days).astype(int)

    tab = pd.DataFrame(df.groupby('treated')[y].mean())
    
    model = smf.ols(f'{y} ~ treated:diff + C({fix})', data=df).fit()
    #model.summary()
    #model.conf_int().loc['treated:post'] # 95% 신뢰구간
    
    res = {
            'pre_treat': tab.loc[0][y].round(1),
            'post_treat': tab.loc[1][y].round(1),
            'change_value':tab.loc[1][y].round(1) - tab.loc[0][y].round(1),
            'change_coef': model.params['treated:diff'].round(2),
            'change_perc': round(model.params['treated:diff']/tab.loc[0][y]*100, 2),
            'p_value': model.pvalues['treated:diff'].round(2)
    }

    return res

In [7]:
# raw = pd.read_excel('./data/rawdata.xlsx')
# raw.head(3)

Unnamed: 0,연월,일자,주차,시도,시군구,목적통행량,평균통행거리(km),총 인구 수,인구당 목적통행량,택시 승차량,자동차 통행량,따릉이 이용량,방문자 수,treated,post,w
0,202201,2022-01-01,202252,광주광역시,광산구,26473,7.1,404319,65.5,,,,777793,0,0,0
1,202201,2022-01-02,202252,광주광역시,광산구,26698,7.1,404319,66.0,,,,779073,0,0,0
2,202201,2022-01-03,202201,광주광역시,광산구,56775,7.1,404319,140.4,,,,826813,0,0,0


In [13]:
# x = '시도'
# y = '목적통행량'
# fix = '시군구'
# treat_group = '서울특별시'
# control_group = '부산광역시'

# dt = '일자'
# start_dt = datetime.date(2022,11,1)
# treat_dt = datetime.date(2024,1,27)
# end_dt = datetime.date(2024,10,31)

In [14]:
# did(raw, x, y, fix, treat_group, control_group, dt, start_dt, treat_dt, end_dt)

{'pre_treat': 305141.6,
 'post_treat': 313286.3,
 'change_value': 8144.700000000012,
 'change_coef': 6451.19,
 'change_perc': 2.11,
 'p_value': 0.0}

In [40]:
# rd(raw, x, y, fix, treat_group, dt, start_dt, treat_dt, end_dt)

{'pre_treat': 305141.6,
 'post_treat': 313286.3,
 'change_value': 8144.700000000012,
 'change_coef': 48.89,
 'change_perc': 0.02,
 'p_value': 0.0}