# DataFrame

## 1. DataFrame 생성
- 2차원 배열 형식
- 표같은 스프레드시트 자료구조
- 여러개의 컬럼을 가지며 서로다른 종류의 값이 담긴다.

**DataFrame 생성 방법**
1. 리스트 값을 딕셔너리로 사용
2. Numpy 배열을 이용
3. read_csv(), read_excel()... 함수 사용

In [36]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
%matplotlib inline

**1. 딕셔너리로 데이터프레임 생성**
- 딕셔너리로 만들면 좋은점 : key값이 columns로 들어간다.
- 데이터프레임의 각 열은 Series이다. 즉, 각 Series마다 데이터타입(dtype)이 다를 수 있다.

In [19]:
dic = {
    'state':['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year':[2000, 2001, 2002, 2001, 2002, 2003],
    'pop':[1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
      }
df1 = pd.DataFrame(dic)
print('*'*50)
print(df1['state'])
print(type(df1['state']))
print('*'*50)
print(df1['year'])
print(type(df1['year']))
print('*'*50)
print(df1['pop'])
print(type(df1['pop']))
print('*'*50)

**************************************************
0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object
<class 'pandas.core.series.Series'>
**************************************************
0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64
<class 'pandas.core.series.Series'>
**************************************************
0    1.5
1    1.7
2    3.6
3    2.4
4    2.9
5    3.2
Name: pop, dtype: float64
<class 'pandas.core.series.Series'>
**************************************************


In [20]:
dic2 = {
    'name' :['James', 'Peter', 'Tomas', 'Robert'],
    'address' :['NY', ' TEXAS', 'LA', 'TEXAS'],
    'age':[33, 44, 55, 66]
}

df2 = DataFrame(dic2)
df2

Unnamed: 0,name,address,age
0,James,NY,33
1,Peter,TEXAS,44
2,Tomas,LA,55
3,Robert,TEXAS,66


**2. Numpy 배열을 이용**

In [23]:
np.random.seed(100)
df3 = DataFrame(np.random.randint(10,100,16).reshape(4,4),
                            index=list('abcd'),
                            columns=list('abcd'))

# 컬럼명을 수정
df3.columns = ['one', 'two', 'three', 'four']
df3.head()

Unnamed: 0,one,two,three,four
a,18,34,77,97
b,89,58,20,62
c,63,76,24,44
d,34,25,70,68


In [28]:
df3.rename(columns={'one': '1', 'two':'둘', 'four':'포'}) # 컬럼명을 부분적으로 바꿔줄 수 있다.

Unnamed: 0,1,둘,three,포
a,18,34,77,97
b,89,58,20,62
c,63,76,24,44
d,34,25,70,68


**3. read_csv(), read_excel()... 함수 사용**

In [50]:
read_df = pd.read_csv('../data/tips.csv')
read_df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2.0
1,10.34,1.66,Male,No,Sun,Dinner,3.0
2,21.01,3.5,Male,No,Sun,Dinner,3.0


## 2. DataFrame 구조
- 구조를 확인하는 속성
    - index
    - columns
    - values
    - dtype**s**
    - info()
    - describe
    - shape

In [58]:
print(read_df.index)

RangeIndex(start=0, stop=245, step=1)


In [59]:
print(read_df.columns)

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')


In [60]:
print(read_df.values)

[[16.99 1.01 'Female' ... 'Sun' 'Dinner' 2.0]
 [10.34 1.66 'Male' ... 'Sun' 'Dinner' 3.0]
 [21.01 3.5 'Male' ... 'Sun' 'Dinner' 3.0]
 ...
 [17.82 1.75 'Male' ... 'Sat' 'Dinner' 2.0]
 [18.78 3.0 'Female' ... 'Thur' 'Dinner' 2.0]
 [25.34 nan nan ... nan nan nan]]


In [62]:
print(read_df.dtypes)

total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size          float64
dtype: object


In [61]:
print(read_df.describe)

<bound method NDFrame.describe of      total_bill   tip     sex smoker   day    time  size
0         16.99  1.01  Female     No   Sun  Dinner   2.0
1         10.34  1.66    Male     No   Sun  Dinner   3.0
2         21.01  3.50    Male     No   Sun  Dinner   3.0
3         23.68  3.31    Male     No   Sun  Dinner   2.0
4         24.59  3.61  Female     No   Sun  Dinner   4.0
..          ...   ...     ...    ...   ...     ...   ...
240       27.18  2.00  Female    Yes   Sat  Dinner   2.0
241       22.67  2.00    Male    Yes   Sat  Dinner   2.0
242       17.82  1.75    Male     No   Sat  Dinner   2.0
243       18.78  3.00  Female     No  Thur  Dinner   2.0
244       25.34   NaN     NaN    NaN   NaN     NaN   NaN

[245 rows x 7 columns]>


In [54]:
read_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245 entries, 0 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  245 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    float64
dtypes: float64(3), object(4)
memory usage: 13.5+ KB


In [66]:
read_df.shape # tuple로 출력된다.

(245, 7)

## 3. DataFrame 조회
- info()
- head()
- tail()

In [67]:
read_df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2.0
1,10.34,1.66,Male,No,Sun,Dinner,3.0
2,21.01,3.5,Male,No,Sun,Dinner,3.0


In [68]:
read_df.tail(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
242,17.82,1.75,Male,No,Sat,Dinner,2.0
243,18.78,3.0,Female,No,Thur,Dinner,2.0
244,25.34,,,,,,


## 4. DataFrame 컬럼명 변경 및 추가
1. 컬럼명 전체 수정 : columns
    - df.columns = [전체 컬럼 수정명]
    - 원본 변경 된다.
2. 컬럼명 부분 수정 : rename
    - df.rename({'처음컬럼명' : '바꿀컬럼명', ...})
    - 원본 변경 X
    - 변경 희망시 inplace=True를 지정해준다.
3. 컬럼 추가

In [69]:
np.random.seed(100)
df3 = DataFrame(np.random.randint(10,100,16).reshape(4,4),
                            index=list('abcd'),
                            columns=list('abcd'))

# 컬럼명을 수정
df3.columns = ['one', 'two', 'three', 'four']
df3.head()

Unnamed: 0,one,two,three,four
a,18,34,77,97
b,89,58,20,62
c,63,76,24,44
d,34,25,70,68


In [70]:
# 전체 컬럼명 변경
df3.columns = ['A-class', 'B-class', 'C-class', 'D-class']
df3

Unnamed: 0,A-class,B-class,C-class,D-class
a,18,34,77,97
b,89,58,20,62
c,63,76,24,44
d,34,25,70,68


In [71]:
# 부분 컬럼명 변경
df2

Unnamed: 0,name,address,age
0,James,NY,33
1,Peter,TEXAS,44
2,Tomas,LA,55
3,Robert,TEXAS,66


In [76]:
df2.rename(columns={'address' : 'addr'}, inplace=True)
df2

Unnamed: 0,name,addr,age
0,James,NY,33
1,Peter,TEXAS,44
2,Tomas,LA,55
3,Robert,TEXAS,66


In [77]:
# 컬럼 추가하기
df2['phone'] = np.nan

In [78]:
df2

Unnamed: 0,name,addr,age,phone
0,James,NY,33,
1,Peter,TEXAS,44,
2,Tomas,LA,55,
3,Robert,TEXAS,66,
