# 환경준비

In [2]:
# 필요한 기본 패키지 준비

# 데이터 처리 필요 패키지
import numpy as np
import pandas as pd
import datetime as dt

# 시각화 필요 패키지
%matplotlib inline
from plotnine import *
import folium
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rc, font_manager
import seaborn as sns


# Machine Learning 분석 환경 준비

# 전처리, 스케일링
from sklearn.preprocessing import StandardScaler

# 선형회귀분석
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from scipy import stats

# OLS회귀분석
import statsmodels.api as sm

# GAM 일반화가법모형
# LinearGAM, LogisticGAM, PoissonGAM, GammaGAM, InvGauss
from pygam import LinearGAM, LogisticGAM, PoissonGAM, GammaGAM

# Boosting

#데이터셋 분리
from sklearn.model_selection import train_test_split



# Korean text rendering: resolve the family name of the NanumGothicCoding
# font file and make it matplotlib's default font family.
font_name = font_manager.FontProperties(
    fname='C:/Windows/Fonts/NanumGothicCoding.ttf'
).get_name()
matplotlib.rcParams['font.family'] = font_name

# Render the minus sign as a plain ASCII hyphen instead of the Unicode minus.
matplotlib.rcParams.update({'axes.unicode_minus': False})

# Silence all warnings inside the Jupyter notebook.
import warnings
warnings.filterwarnings(action="ignore")

# 자체제작 함수 준비

In [3]:
#########################################################################
# MAD 기반 예제코드
def mad_based_outlier(points, thresh=3.5):
    """Flag outliers with the modified z-score built on the median absolute deviation (MAD).

    Parameters
    ----------
    points : array-like
        Observations, either 1-D (one value per observation) or 2-D
        (one row per observation). Anything ``np.asarray`` accepts works,
        e.g. lists, NumPy arrays and pandas Series.
    thresh : float, default 3.5
        Observations whose modified z-score is greater than this value
        are flagged.

    Returns
    -------
    numpy.ndarray of bool
        One flag per observation; ``True`` marks an outlier.
    """
    # Convert up front: plain lists have no ``.shape`` and pandas Series do
    # not support ``[:, None]`` indexing in recent pandas versions.
    points = np.asarray(points)
    if points.ndim == 1:
        points = points[:, None]

    # Nothing to score: avoid taking the median of an empty array.
    if len(points) == 0:
        return np.zeros(0, dtype=bool)

    median = np.median(points, axis=0)
    # Euclidean distance of every observation from the per-column median.
    diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
    med_abs_deviation = np.median(diff)

    if med_abs_deviation == 0:
        # At least half of the observations coincide with the median, so the
        # score would be x/0. Any observation that deviates at all has an
        # infinite score (an outlier for every finite ``thresh``); the ones
        # sitting on the median are not outliers.
        return diff > 0

    modified_z_score = 0.6745 * diff / med_abs_deviation
    return modified_z_score > thresh

# 출처: https://pythonanalysis.tistory.com/7 [Python 데이터 분석]
#########################################################################

# Helper for handling the social datasets.
# 1. The first column of every social dataset is the date.
# 2. The other columns are named social_<keyword>.<blog/twitter/news/total>.
def changeColNames(d):
    """Rename the first column of ``d`` to 'date' and return simplified column names.

    ``d`` is modified in place: its first column label becomes 'date' so that
    all datasets share the same date column name and are easy to combine.
    The returned pandas Series holds those column labels with every
    'social_' substring removed; ``d`` itself keeps the 'social_' prefixes
    until the caller assigns the returned names.
    """
    # Unify the date column name across datasets.
    d.columns = ['date'] + list(d.columns[1:])

    def _drop_social_prefix(label):
        return label.replace('social_', '')

    return pd.Series(d.columns).map(_drop_social_prefix)



# May be used for OLS models later...
def formula_gen(deg, df):
    """Build an (unfitted) OLS model regressing ``qty`` on powers of ``temp``.

    The generated formula is
    ``qty ~ I(temp**1) + I(temp**2) + ... + I(temp**deg)``.

    Parameters
    ----------
    deg : int
        Highest power of ``temp`` to include; must be at least 1.
    df : object
        Data handed to ``sm.OLS.from_formula`` as ``data``; presumably a
        DataFrame with ``qty`` and ``temp`` columns (confirm against callers).

    Returns
    -------
    The model object returned by ``sm.OLS.from_formula`` (``fit`` is not called).

    Raises
    ------
    ValueError
        If ``deg`` is smaller than 1, because the formula would have an
        empty right-hand side.
    """
    if deg < 1:
        raise ValueError(
            "deg must be at least 1 to build a formula, got {!r}".format(deg)
        )
    terms = ["I(temp**{})".format(power) for power in range(1, deg + 1)]
    func = "qty ~ " + " + ".join(terms)
    return sm.OLS.from_formula(func, data=df)


# Modeling wrapped in functions for reuse
def linReg(df, item):
    """Fit a LinearRegression on the rows of ``df`` whose category equals ``item``.

    The selected rows are split 70/30 into train and test sets
    (``random_state=0``), the model is fitted on the training part, and the
    training and test scores from ``model.score`` are printed.
    Nothing is returned.
    """
    col_to_use = [
        'temp', 'humid', 'wind', 'rain', 'snow', 'cloud', 'sun_time',
        'pm.total', 'health.total', 'br.total', 'hobby.total', 'date.total',
    ]
    in_category = df['category'] == item
    X, y = df.loc[in_category, col_to_use], df.loc[in_category, 'qty']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0
    )

    model = LinearRegression()
    model.fit(X_train, y_train)

    print('LinearRegression을 이용한 %s의 회귀분석 결과 :' % item)
    for template, features, target in (
        ('훈련세트점수 : {:.2f}', X_train, y_train),
        ('검증세트점수 : {:.2f}', X_test, y_test),
    ):
        print(template.format(model.score(features, target)))

    
def ridgeReg(df, item):
    """Fit a Ridge regression on the rows of ``df`` whose category equals ``item``.

    The selected rows are split 70/30 into train and test sets
    (``random_state=0``), the model is fitted on the training part, and the
    training and test scores from ``score`` are printed.
    Nothing is returned.
    """
    # Local imports keep this block self-contained; scikit-learn is already a
    # dependency of this file.
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    col_to_use = ['temp','humid','wind','rain','snow','cloud','sun_time'
                 ,'pm.total', 'health.total','br.total', 'hobby.total','date.total']
    in_category = df['category'] == item
    X = df.loc[in_category, col_to_use]
    y = df.loc[in_category, 'qty']

    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=0)

    # ``Ridge(normalize=True)`` was deprecated in scikit-learn 1.0 and removed
    # in 1.2, where passing it raises TypeError. The replacement documented in
    # the deprecation notice is a StandardScaler(with_mean=False) step followed
    # by Ridge with alpha multiplied by the number of training samples.
    ridge = make_pipeline(
        StandardScaler(with_mean=False),
        Ridge(alpha=0.1 * len(X_train), random_state=0, tol=0.001),
    ).fit(X_train, y_train)

    print('RidgeRegression을 이용한 %s의 회귀분석 결과 :'%item)
    print('훈련세트점수 : {:.2f}'.format(ridge.score(X_train, y_train)))
    print('검증세트점수 : {:.2f}'.format(ridge.score(X_test, y_test)))


def lassoReg(df, item):
    """Fit a Lasso regression on the rows of ``df`` whose category equals ``item``.

    The selected rows are split 70/30 into train and test sets
    (``random_state=0``), the model is fitted on the training part, and the
    training score, test score and number of non-zero coefficients are
    printed. Nothing is returned.
    """
    col_to_use = [
        'temp', 'humid', 'wind', 'rain', 'snow', 'cloud', 'sun_time',
        'pm.total', 'health.total', 'br.total', 'hobby.total', 'date.total',
    ]
    in_category = df['category'] == item
    X, y = df.loc[in_category, col_to_use], df.loc[in_category, 'qty']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0
    )

    lasso = Lasso(alpha=0.1, max_iter=1000)
    lasso.fit(X=X_train, y=y_train)

    print('LassoRegression을 이용한 %s의 회귀분석 결과 :' % item)
    print('훈련세트점수 : {:.2f}'.format(lasso.score(X_train, y_train)))
    print('검증세트점수 : {:.2f}'.format(lasso.score(X_test, y_test)))

    # Number of features actually used, i.e. coefficients that are not zero
    nonzero_coefs = lasso.coef_ != 0
    print('사용한 특성수 : {}'.format(np.sum(nonzero_coefs)))

# 데이터 불러오기

In [4]:
# Load the preprocessed datasets (GS, Lalavla, weather); the 'date' column
# of each file is parsed as datetimes.
gs = pd.read_csv('d:/project/contest/data/processed/p_gs.csv', parse_dates=['date'])
lavla = pd.read_csv('d:/project/contest/data/processed/p_lavla.csv', parse_dates=['date'])
weather = pd.read_csv('d:/project/contest/data/processed/p_weather.csv', parse_dates=['date'])

In [5]:
# Unique 'bor_nm' values among the rows whose 'pvn_nm' is '서울특별시'
seoul_gu = list(gs.loc[gs['pvn_nm'] == '서울특별시', 'bor_nm'].unique())
print(seoul_gu)

['종로구', '중구', '용산구', '성동구', '광진구', '동대문구', '중랑구', '성북구', '강북구', '도봉구', '노원구', '은평구', '서대문구', '마포구', '양천구', '강서구', '구로구', '금천구', '영등포구', '동작구', '관악구', '서초구', '강남구', '송파구', '강동구']


In [6]:
# Each file carries an index in column 0 (used as the DataFrame index);
# column 1, the date column, is parsed as dates.
social_pm = pd.read_csv('D:/project/contest/data/original/social_pm.csv',index_col=0, parse_dates=[1])
social_health = pd.read_csv('d:/project/contest/data/original/social_health.csv',index_col=0, parse_dates=[1])
social_date = pd.read_csv('d:/project/contest/data/original/social_date.csv',index_col=0, parse_dates=[1])
social_br = pd.read_csv('d:/project/contest/data/original/social_br.csv',index_col=0, parse_dates=[1])
social_hobby = pd.read_csv('d:/project/contest/data/original/social_hobby.csv',index_col=0, parse_dates=[1])

# 소셜데이터 전처리

In [7]:
# Simplify the column names of every social dataset
for _social_frame in (social_pm, social_health, social_date, social_br, social_hobby):
    _social_frame.columns = changeColNames(_social_frame)

In [8]:
# Collect the remaining social datasets in a list so they can be handled in one loop
collections = [social_health, social_date, social_br, social_hobby]

# DataFrame that accumulates everything: start from a copy of the pm data and
# left-join each other dataset on the 'date' column
social_all = social_pm.copy()
for df in collections:
    # The part of the second column name before the first '.' identifies the dataset
    print('merging ', df.columns[1].partition('.')[0])
    social_all = pd.merge(social_all, df, how='left', on='date')

social_all.to_csv(
    'D:/project/contest/data/processed/social_all.csv',
    index=False,
    encoding='utf-8',
)

merging  health
merging  date
merging  br
merging  hobby
