##  단일변수 데이터
###  데이터프레임 변수 유형
### 
- 범주형 변수 : 연속형 데이터와 범주형 데이터의 가장 큰 차이는 가감승제가 가능하지만 의미가 없다는 것
- 질적 자료 : 숫자들의 크기가 차이가 계산되지 않음(명목변수, 서열변수)
### 
- 연속형 변수 : 사칙연산이 가능
- 양적자료 : 숫자들의 크기 차이 계산(등간변수, 비율변수)

### 연속형분석 
#### - 수집된 수치 데이터의 정리, 표현, 요약 등을 통해 데이터의 전반적인 특성을 이해하는 분석 진입 단계, '기술통계'라고도 한다.
#### - 자료의 특성을  찾는 분석 접근 방법

# 3day 빈도 및 교차 데이터 만들기

### tips 데이터셋 읽어들이기

In [7]:
# 라이브러리 불러오기
import pandas as pd

import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns

import numpy as np

In [9]:
# tips 데이터셋 읽어 들이기
tips = sns.load_dataset("tips")
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


### 빈도분석

In [16]:
# 성별 관련 데이터 뽑기
tips[["sex"]]

Unnamed: 0,sex
0,Female
1,Male
2,Male
3,Male
4,Female
...,...
239,Male
240,Female
241,Male
242,Male


In [18]:
# 성별 빈도분석
tips["sex"].value_counts()

Male      157
Female     87
Name: sex, dtype: int64

In [21]:
# smoker 빈도분석
tips["smoker"].value_counts()

No     151
Yes     93
Name: smoker, dtype: int64

In [22]:
# 요일 빈도분석
tips["day"].value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [23]:
# time 빈도분석
tips["time"].value_counts()

Dinner    176
Lunch      68
Name: time, dtype: int64

### 교차분석

In [24]:
# 월별/성별 교차분석
pd.crosstab(tips["sex"],tips["day"])

day,Thur,Fri,Sat,Sun
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Male,30,10,59,58
Female,32,9,28,18


In [26]:
# 성별/ 월별 교차분석 총괄표
pd.crosstab(tips["sex"],
            tips["day"], 
            margins=True)

day,Thur,Fri,Sat,Sun,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Male,30,10,59,58,157
Female,32,9,28,18,87
All,62,19,87,76,244


In [28]:
# 빈도 비율 확인
pd.crosstab(tips["sex"],
            tips["day"]).apply(
                lambda r: r/len(tips),
                axis = 1
            )

day,Thur,Fri,Sat,Sun
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Male,0.122951,0.040984,0.241803,0.237705
Female,0.131148,0.036885,0.114754,0.07377


In [30]:
tips.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [32]:
# 연속형데이터
tips.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [33]:
# 성별 컬럼명을 sex_kor 새로 만들 남,여 로 새로 넣어라
tips["sex_kor"]=0

In [34]:
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_kor
0,16.99,1.01,Female,No,Sun,Dinner,2,0
1,10.34,1.66,Male,No,Sun,Dinner,3,0
2,21.01,3.50,Male,No,Sun,Dinner,3,0
3,23.68,3.31,Male,No,Sun,Dinner,2,0
4,24.59,3.61,Female,No,Sun,Dinner,4,0
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0
242,17.82,1.75,Male,No,Sat,Dinner,2,0


In [36]:
tips["sex_kor"]=tips["sex"]

In [37]:
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_kor
0,16.99,1.01,Female,No,Sun,Dinner,2,Female
1,10.34,1.66,Male,No,Sun,Dinner,3,Male
2,21.01,3.50,Male,No,Sun,Dinner,3,Male
3,23.68,3.31,Male,No,Sun,Dinner,2,Male
4,24.59,3.61,Female,No,Sun,Dinner,4,Female
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,Male
240,27.18,2.00,Female,Yes,Sat,Dinner,2,Female
241,22.67,2.00,Male,Yes,Sat,Dinner,2,Male
242,17.82,1.75,Male,No,Sat,Dinner,2,Male


In [65]:
tips["sex_kor"].values

['Female', 'Male', 'Male', 'Male', 'Female', ..., 'Male', 'Female', 'Male', 'Male', 'Female']
Length: 244
Categories (2, object): ['Male', 'Female']

In [73]:
tips["sex_kor"].dtype

CategoricalDtype(categories=['Male', 'Female'], ordered=False)

In [74]:
tips["sex_kor"].replace("Female","여",inplace=True)

In [76]:
tips["sex_kor"].replace("Male","남",inplace=True)

In [77]:
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_kor
0,여,여,Female,No,Sun,Dinner,2,여
1,10.34,1.66,Male,No,Sun,Dinner,3,남
2,21.01,3.5,Male,No,Sun,Dinner,3,남
3,23.68,3.31,Male,No,Sun,Dinner,2,남
4,여,여,Female,No,Sun,Dinner,4,여
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,남
240,여,여,Female,Yes,Sat,Dinner,2,여
241,22.67,2.0,Male,Yes,Sat,Dinner,2,남
242,17.82,1.75,Male,No,Sat,Dinner,2,남
