### summary
- pandas
    - 데이터 분석 : 데이터 전처리 파트
    - 테이블 형태의 데이터를 처리할때 사용하는 python 라이브러리
    - Series; DataFrame
    - Series
        - 생성, 선택, 수정 방법
    - DataFrame
        - 생성 방법 1 : 딕셔너리의 리스트 : 리스트 -> 컬럼
        - 생성 방법 2 : 리스트의 딕셔너리 : 딕셔너리 -> 로우 데이터
        - row 선택 : `df.loc[idx]`
        - column 선택 : `df[column name]`
        - row, column 선택 : `df.loc[idx, column]`
        - 함수
            - apply, append, concat
            - groupby, merge

In [29]:
# 경고 무시
import warnings
warnings.simplefilter('ignore')

# basic
import time
import random

# 자주 사용하는 패키지를 임포트
import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import sklearn as sk

# web crawling
import requests
from bs4 import BeautifulSoup

# matplotlib 설정
mpl.use('Agg')

# seaborn 설정
sns.set()
sns.set_style("whitegrid")
sns.set_color_codes()

In [1]:
import makedata

In [6]:
makedata.get_name()

'Alvin'

In [7]:
makedata.get_age()

33

In [8]:
makedata.make_data()

[{'Age': 28, 'Name': 'Alan'},
 {'Age': 24, 'Name': 'Alex'},
 {'Age': 22, 'Name': 'Alvin'},
 {'Age': 36, 'Name': 'Alvin'},
 {'Age': 27, 'Name': 'Adam'},
 {'Age': 40, 'Name': 'Alan'},
 {'Age': 28, 'Name': 'Billy'},
 {'Age': 38, 'Name': 'Anthony'},
 {'Age': 35, 'Name': 'Alex'},
 {'Age': 35, 'Name': 'Alex'}]

### quiz
- makedata 모듈을 이용해서 데이터 프레임 만들기
- user_df
    - UserID : 1~8
    - Name : makedata.get_name()
    - Age : makedata.get_age()
    - 중복되는 Name 값이 없도록

In [9]:
# 딕셔너리의 리스트 : UserID, Name, Age
datas = {}
datas["UserID"] = list(range(1,9))
datas["Age"] = [makedata.get_age() for _ in range(8)]
names = []
while True:
    name = makedata.get_name()
    if name not in names:
        names.append(name)
    if len(names) >= 8:
        break
datas["Name"] = names

user_df = pd.DataFrame(datas)
user_df

Unnamed: 0,UserID,Age,Name
0,1,28,Alan
1,2,30,Billy
2,3,24,Anchal
3,4,30,Andrew
4,5,35,Anthony
5,6,29,Jin
6,7,30,Alex
7,8,38,Adam


In [19]:
# 리스트의 딕셔너리 : UserID, Name, Age
user_df = pd.DataFrame(columns=["UserID", "Name", "Age"])
datas = []

for idx in range(1,9):
    
    name = makedata.get_name()
    while name in list(user_df["Name"]):
        name = makedata.get_name()
    data = {"Name":name, "UserID":idx, "Age":makedata.get_age()}
    
    user_df.loc[len(user_df)] = data

user_df

Unnamed: 0,UserID,Name,Age
0,1,Anthony,38
1,2,Adam,24
2,3,Alex,21
3,4,Anchal,27
4,5,Alvin,38
5,6,Jin,29
6,7,Arnold,35
7,8,Billy,23


### quiz2
- money_df 만들기
    - 15개의 데이터
    - ID : 1 ~ 8 랜덤
    - Money : 1000원 단위로 1000원 ~ 20000원까지의 숫자가 저장

In [30]:
# 딕셔너리 데이터를 데이터 프레임에 하나씩 추가하기
money_df = pd.DataFrame(columns=["ID","Money"])
# np.random.randint(1,9)
for _ in range(15):
    money_df.loc[len(money_df)] = {
        "ID" : np.random.randint(1,9),
        "Money" : np.random.randint(1, 21) * 1000
    }

# 컬럼데이터에서 Unique 값 확인
ids = money_df["ID"].unique()
ids.sort()
ids

array([1, 3, 4, 5, 6, 7, 8], dtype=object)

In [31]:
money_df

Unnamed: 0,ID,Money
0,8,1000
1,8,5000
2,1,12000
3,8,4000
4,1,8000
5,7,14000
6,1,12000
7,7,7000
8,5,4000
9,3,6000


In [32]:
user_df

Unnamed: 0,UserID,Name,Age
0,1,Anthony,38
1,2,Adam,24
2,3,Alex,21
3,4,Anchal,27
4,5,Alvin,38
5,6,Jin,29
6,7,Arnold,35
7,8,Billy,23


### 1. merge

In [34]:
user_df.merge(money_df, left_on = "UserID", right_on = "ID").tail()

Unnamed: 0,UserID,Name,Age,ID,Money
10,7,Arnold,35,7,14000
11,7,Arnold,35,7,7000
12,8,Billy,23,8,1000
13,8,Billy,23,8,5000
14,8,Billy,23,8,4000


In [36]:
# 컬럼명 변경
user_df.rename(columns={"UserID":"ID"}, inplace = True)
user_df.tail(2)

Unnamed: 0,ID,Name,Age
6,7,Arnold,35
7,8,Billy,23


In [39]:
user_df.merge(money_df).tail()

Unnamed: 0,ID,Name,Age,Money
10,7,Arnold,35,14000
11,7,Arnold,35,7000
12,8,Billy,23,1000
13,8,Billy,23,5000
14,8,Billy,23,4000


In [38]:
result_df = pd.merge(money_df, user_df)
result_df.tail()

Unnamed: 0,ID,Money,Name,Age
10,3,14000,Alex,21
11,4,1000,Anchal,27
12,6,7000,Jin,29
13,6,18000,Jin,29
14,6,11000,Jin,29


In [41]:
# groupby : sum, size, min .. 함수 : Series
money_list = result_df.groupby("Name").sum()["Money"].reset_index()
money_list

Unnamed: 0,Name,Money
0,Alex,20000
1,Alvin,4000
2,Anchal,1000
3,Anthony,32000
4,Arnold,21000
5,Billy,10000
6,Jin,36000


In [48]:
# groupby : agg("sum"), agg("size"), min .. 함수 : DataFrame
money_list2 = result_df.groupby("Name").agg("sum").reset_index()[["Name", "Money"]]
money_list2

Unnamed: 0,Name,Money
0,Alex,20000
1,Alvin,4000
2,Anchal,1000
3,Anthony,32000
4,Arnold,21000
5,Billy,10000
6,Jin,36000


In [None]:
# merge : money_list, user_df : join = "outer"

In [54]:
result = pd.merge(user_df, money_list, how="outer")
result.head()

Unnamed: 0,ID,Name,Age,Money
0,1,Anthony,38,32000.0
1,2,Adam,24,
2,3,Alex,21,20000.0
3,4,Anchal,27,1000.0
4,5,Alvin,38,4000.0


In [None]:
# fillana : NaN을 특정 데이터로 채워줌

In [56]:
result.fillna(value = 0, inplace = True)
result

Unnamed: 0,ID,Name,Age,Money
0,1,Anthony,38,32000.0
1,2,Adam,24,0.0
2,3,Alex,21,20000.0
3,4,Anchal,27,1000.0
4,5,Alvin,38,4000.0
5,6,Jin,29,36000.0
6,7,Arnold,35,21000.0
7,8,Billy,23,10000.0


In [57]:
# money 컬럼을 정수로 데이터 타입을 변경
result.dtypes

ID         int64
Name      object
Age        int64
Money    float64
dtype: object

In [59]:
result["Money"]=result["Money"].astype("int")
result

Unnamed: 0,ID,Name,Age,Money
0,1,Anthony,38,32000
1,2,Adam,24,0
2,3,Alex,21,20000
3,4,Anchal,27,1000
4,5,Alvin,38,4000
5,6,Jin,29,36000
6,7,Arnold,35,21000
7,8,Billy,23,10000


In [67]:
np.average(result.sort_values("Money", ascending=False)[:3]["Age"])

34.0