# 3day 빈도 및 교차 데이터 만들기

### # 기본 라이브러리 장착

In [3]:
import pandas as pd

import matplotlib as mpl

import matplotlib.pylab as plt

import seaborn as sns # seaborn 라이브러리는 matplotlib 라이브러리랑 연동되어 있어서 함께 장착

import numpy as np # loc 함수 사용시 필요

### # tips 데이터셋 읽어들이기

In [2]:
tips = sns.load_dataset("tips")

tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


### # 성별를 데이터 프레임 형태로 출력

In [5]:
tips[["sex"]]

Unnamed: 0,sex
0,Female
1,Male
2,Male
3,Male
4,Female
...,...
239,Male
240,Female
241,Male
242,Male


### 성별 데이터를 여러 정보를 포함해서 출력

In [6]:
tips.sex

0      Female
1        Male
2        Male
3        Male
4      Female
        ...  
239      Male
240    Female
241      Male
242      Male
243    Female
Name: sex, Length: 244, dtype: category
Categories (2, object): ['Male', 'Female']

### # 하나의 컬럼 안에 있는 각 값들의 빈도 조회

In [13]:
print(tips["sex"].value_counts())

print(tips["day"].value_counts())

print(tips["smoker"].value_counts())

print(tips["time"].value_counts())

Male      157
Female     87
Name: sex, dtype: int64
Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64
No     151
Yes     93
Name: smoker, dtype: int64
Dinner    176
Lunch      68
Name: time, dtype: int64


### # 월별/성별 교차분석

In [14]:
pd.crosstab(tips["sex"], tips["day"])

day,Thur,Fri,Sat,Sun
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Male,30,10,59,58
Female,32,9,28,18


### # 성별/월별 교차분석 총괄표
  * 행렬별 합계

In [15]:
pd.crosstab(tips["sex"],
            tips["day"],
            margins = True)

day,Thur,Fri,Sat,Sun,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Male,30,10,59,58,157
Female,32,9,28,18,87
All,62,19,87,76,244


### # 빈도 비율 확인

In [22]:
pd.crosstab(tips["sex"],
            tips["day"]).apply(lambda r : r/len(tips), axis = 1)

day,Thur,Fri,Sat,Sun
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Male,0.122951,0.040984,0.241803,0.237705
Female,0.131148,0.036885,0.114754,0.07377


### # 컬럼별 데이터 타입 확인
  * category라는 타입이 나오면 범주형 데이터로 분석할 수 있다.

In [23]:
tips.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

### # 기초 통계 조회

In [25]:
tips.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


### # 행별로 결측치 확인

In [36]:
tips.isnull().sum(1)

0      0
1      0
2      0
3      0
4      0
      ..
239    0
240    0
241    0
242    0
243    0
Length: 244, dtype: int64

### # 열별로 결측치 확인

In [35]:
tips.isnull().sum(0)

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

### # 행 단위로 결측치 삭제 

In [29]:
tips.dropna(axis = 0)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


### # 열 단위로 결측치 삭제 

In [30]:
tips.dropna(axis = 1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


### # 데이터프레임 형태로 결측치 제거

In [31]:
tips.dropna()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


### # 결측치를 0으로 채우기

In [32]:
tips.fillna(0)

ValueError: Cannot setitem on a Categorical with a new category, set the categories first

### # 결측치를 문자열로 대체

In [34]:
tips.fillna(' ')

ValueError: Cannot setitem on a Categorical with a new category, set the categories first

### # 결측치를 평균값으로 대체
  * 범주형 데이터는 비율을 확인해서 비율대로 데이터값을 채운다.

In [None]:
tips.fillna(tips.mean())