# 음식 추천하기

In [102]:
import pandas as pd

# Maps each flavor sheet name to the `gid` value that fetch_food_data()
# substitutes into the published-CSV URL to select that sheet.
name2gid = {
    '매운맛': 0,  # "spicy taste"
    '기름진맛': 1369311762,  # "oily taste"
    '단맛': 1090863632,  # "sweet taste"
}

name2gid.keys()

dict_keys(['매운맛', '기름진맛', '단맛'])

## 구글 시트의 원본 데이터를 pd.DataFrame으로 읽어오기

In [103]:
def fetch_food_data(sheet_name):
    """Read one sheet of the published Google spreadsheet into a DataFrame.

    `sheet_name` must be a key of `name2gid`; the matching gid is placed in
    the CSV export URL. The first CSV column is used as the index of the
    returned frame.
    """
    gid = name2gid[sheet_name]
    # Adjacent literals inside the parentheses are joined into a single URL.
    csv_url = (
        'https://docs.google.com/spreadsheets/d/e'
        '/2PACX-1vSiAzsjRqvLWoFSpOuRlz2xtDef2yAN77AGsvmAgCWRtpF8NVr71sNTdNazri4o1FAmF7QA540PNveb'
        f'/pub?single=true&output=csv&gid={gid}'
    )
    return pd.read_csv(csv_url, index_col=0)

In [104]:
# One DataFrame per sheet, in the insertion order of name2gid.
dfs = [fetch_food_data(sheet_name) for sheet_name in name2gid]

## 데이터가 우리의 가정을 만족하는지 알아보기

In [87]:
from statistics import mean

def check_range(df, lbound=0.0, ubound=1.0):
    """Return True when every non-missing value of `df` is in [lbound, ubound].

    Missing values (NaN) are not treated as range violations.

    Args:
        df: DataFrame of numeric values to validate.
        lbound: Inclusive lower bound (default 0.0).
        ubound: Inclusive upper bound (default 1.0).

    Returns:
        A plain `bool`; an empty frame yields True.
    """
    within_bounds = (df >= lbound) & (df <= ubound)
    # Comparisons against NaN are False, so missing cells are whitelisted
    # explicitly instead of being filled with an in-range placeholder.
    acceptable = within_bounds | df.isna()
    # axis=None reduces over both axes; bool() avoids leaking numpy.bool_.
    return bool(acceptable.all(axis=None))

def check_no_na(df):
    """Return True when `df` contains no missing (NaN) values."""
    every_cell_present = df.notna().all(axis=None)
    return bool(every_cell_present)

def check_foodnames(dfs):
    """Return True when all frames in `dfs` have the same set of index labels.

    Labels are compared as sets, so their order and any duplicates within a
    frame are ignored.

    Args:
        dfs: Iterable of DataFrames whose indexes are compared.

    Returns:
        True if every frame has the same label set. Also True for an empty
        iterable or a single frame, where there is nothing to disagree.
    """
    label_sets = [set(df.index) for df in dfs]
    if not label_sets:
        # set.intersection() / set.union() cannot be called with no operands.
        return True
    reference = label_sets[0]
    return all(labels == reference for labels in label_sets[1:])

In [83]:
# Range-check every fetched sheet; `dfs` is the list built from name2gid.
all(check_range(sheet_df) for sheet_df in dfs)

False

In [84]:
# Missing-value check over every fetched sheet; `dfs` is built from name2gid.
all(check_no_na(sheet_df) for sheet_df in dfs)

False

In [85]:
# Compare the index labels of every fetched sheet; `dfs` is built from name2gid.
check_foodnames(dfs)

False

## 코드에 문제가 없는지 검사하기

In [10]:
def make_dummy(values, names):
    """Build a small DataFrame from `values`, using `names` as the row index."""
    dummy = pd.DataFrame(data=values, index=names)
    return dummy

테스트용 데이터를 만들어서 `check_range()`에 문제가 없는지 검사하기

In [11]:
check_range(make_dummy([[0, 0], [0, 1]], names=['a', 'b']))

True

In [12]:
check_range(make_dummy([[0, 0], [0, -1]], names=['a', 'b']))

False

In [13]:
check_range(make_dummy([[0, 0], [0, 2]], names=['a', 'b']))

False

## all()과 any() 이해하기

`all()`은 데이터프레임의 모든 값이 True인 경우에만 True를 반환합니다.

In [14]:
pd.DataFrame([True, True]).all(axis=None)

True

In [15]:
pd.DataFrame([True, False]).all(axis=None)

False

In [16]:
pd.DataFrame([False, False]).all(axis=None)

False

`any()`는 데이터프레임에 True인 값이 하나라도 있으면 True를 반환합니다. (is there **any**?)

In [17]:
pd.DataFrame([True, True]).any(axis=None)

True

In [18]:
pd.DataFrame([True, False]).any(axis=None)

True

In [19]:
pd.DataFrame([False, False]).any(axis=None)

False

## 클렌징

In [97]:
def scale_minmax(df):
    """Min-max scale each column and drop columns whose min equals max.

    Each column is mapped to (value - column min) / (column max - column min).
    Only columns whose max - min is strictly greater than zero are returned.
    """
    col_min = df.min()
    span = df.max() - col_min

    # Scale every column first; the columns without a positive span are
    # filtered out by the boolean mask below.
    rescaled = df.sub(col_min, axis='columns').div(span, axis='columns')
    return rescaled.loc[:, span.gt(0)]

In [98]:
df = make_dummy(
    [
        # u0, u1, u2, u3, ...
        [1, 0.1, 3, 0.3],  # f0
        [1, 0.2, 3, 0.5],  # f1
        [1, 0.3, 3, 1.0],  # f2
    ],
    ['f0', 'f1', 'f2'],
)
scale_minmax(df)

Unnamed: 0,1,3
f0,0.0,0.0
f1,0.5,0.285714
f2,1.0,1.0


In [106]:
# Representative value per row: scale each sheet, treat missing entries as
# 0.5, then average across the columns.
means = [
    scale_minmax(sheet_df).fillna(0.5).mean(axis=1)
    for sheet_df in dfs
]

In [107]:
# Merge into one frame: one column per sheet name; rows containing any NaN
# are dropped.
df = (
    pd.concat(means, axis=1, keys=name2gid.keys())
    .dropna()
)
df.head()

Unnamed: 0_level_0,매운맛,기름진맛,단맛
menu,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
계란밥,0.019231,0.386111,0.215608
그릭요거트,0.019231,0.257407,0.221561
김치찌개,0.48212,0.426852,0.205026
김치찜,0.548191,0.617593,0.325397
꽃게탕,0.378594,0.178704,0.126323


## 추천하기