### [참고] <a href="https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf">Pandas Cheat Sheet</a>

### DataFrame (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html)

<img src="https://miro.medium.com/max/1059/1*5zJ9tsVIRvxY83GsO8eyOw.png" width="500" height="350">

**pd.DataFrame(data=None,index: Union[Collection, NoneType] = None, columns: Union[Collection, NoneType] = None,  dtype: Union[str, numpy.dtype, ForwardRef('ExtensionDtype'), NoneType] = None,   copy: bool = False)**

- 데이터프레임은 테이블형(2차원) 데이터이며, 데이터 분석/머신 러닝에서 데이터 처리를 위해 주로 사용됨
- 2차원이기 때문에 엑셀/csv와 같이 데이터가 row, column으로 구성되며, 인덱스도 row/column에 각각 하나씩 총 두 개가 존재함
  - 행의 레이블은 인덱스로, 열의 레이블은 컬럼으로 부름

In [1]:
import pandas as pd
import numpy as np

### 생성
#### 1) 딕셔너리로 생성

In [2]:
# Append ? to a name and press Shift+Enter to view its description and examples
pd.DataFrame?

In [4]:
# Each dict is one record (row); its keys become the column names.
# The name friend_dict is kept because later cells rebuild friend_df from it.
friend_dict = [
    {"name": "John", "age": 25, "job": "student"},
    {"name": "Nate", "age": 34, "job": "teacher"},
    {"name": "Jenny", "age": 30, "job": "developer"},
]
friend_df = pd.DataFrame(data=friend_dict)
friend_df

Unnamed: 0,name,age,job
0,John,25,student
1,Nate,34,teacher
2,Jenny,30,developer


In [5]:
# Row-wise input (list of dicts): each dict is one row. Kept commented out for comparison.
# dict1=[
#     {
#         "국어" : 15,
#         "영어" : 45,
#         "수학" : 75
#     },
#     {
#         "국어" : 25,
#         "영어" : 55,
#         "수학" : 85
#     },
#     {
#         "국어" : 35,
#         "영어" : 65,
#         "수학" : 95
#     }
# ]

# Column-wise input (dict of lists): each key is a column name and each list holds that column's values
dict1 = {"국어":[15,25,35],"영어" : [45,55,65], "수학" : [75,85,95]}
student_df = pd.DataFrame(dict1)
student_df

Unnamed: 0,국어,영어,수학
0,15,45,75
1,25,55,85
2,35,65,95


><b>index 넣어서 생성</b>

In [6]:
# Same records as friend_df, but with explicit row labels passed through index
person_df = pd.DataFrame(friend_dict, index = ["f1","f2","f3"])
person_df

Unnamed: 0,name,age,job
f1,John,25,student
f2,Nate,34,teacher
f3,Jenny,30,developer


In [7]:
# Reuse dict1, labelling the rows with month names instead of 0, 1, 2
month_df = pd.DataFrame(dict1, index = ["1월","2월","3월"])
month_df

Unnamed: 0,국어,영어,수학
1월,15,45,75
2월,25,55,85
3월,35,65,95


#### 2) 이차원 리스트로 생성

In [8]:
# Build a DataFrame from a 2-D list: each inner list becomes one row.
list1 = [
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9, 10],
]

# Row labels and column labels are supplied explicitly.
two_df = pd.DataFrame(
    data=list1,
    index=["내용1", "내용2"],
    columns=["c1", "c2", "c3", "c4", "c5"],
)
two_df

Unnamed: 0,c1,c2,c3,c4,c5
내용1,1,2,3,4,5
내용2,6,7,8,9,10


#### 3) csv 파일로 생성

In [9]:
# header=None: the first CSV row is data, not column names, so columns are labelled 0, 1, 2
sample3_df = pd.read_csv("../data/sample3.csv", header = None)
sample3_df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9
3,10,11,12
4,13,14,15


- header = None : csv 첫 행을 컬럼명으로 사용하지 않음

In [10]:
# names supplies the column labels (c1, c2, c3) since the file has no header row
sample3_df = pd.read_csv("../data/sample3.csv", header = None,
                         names=["c1","c2","c3"])
sample3_df

Unnamed: 0,c1,c2,c3
0,1,2,3
1,4,5,6
2,7,8,9
3,10,11,12
4,13,14,15


- names : 컬럼의 이름을 부여

In [11]:
# Intentional failure: without an encoding argument this read raises
# UnicodeDecodeError from the utf-8 codec (see the output below)
sample1_df = pd.read_csv("../data/sample1.csv")
sample1_df

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb9 in position 0: invalid start byte

In [12]:
# Specify the encoding: cp949 is the Korean code page used on Windows
sample1_df = pd.read_csv("../data/sample1.csv", encoding="cp949")
sample1_df

Unnamed: 0,번호,이름,가입일시,나이
0,1,김정수,2017-01-19 11:30:00,25
1,2,박민구,2017-02-07 10:22:00,35
2,3,정순미,2017-01-22 09:10:00,33
3,4,김정현,2017-02-22 14:09:00,45
4,5,홍미진,2017-04-01 18:00:00,17
5,6,김순철,2017-05-14 22:33:07,22
6,7,이동철,2017-03-01 23:44:45,27
7,8,박지숙,2017-01-11 06:04:18,30
8,9,김은미,2017-02-08 07:44:33,51
9,10,장혁철,2017-12-01 13:01:11,16


- 인코딩 옵션
    - encoding : 기본값은 utf-8이며, 파일에 따라 utf-8-sig, cp949 등을 지정해서 사용

In [13]:
# sample2.csv separates fields with "|" instead of commas, so the delimiter is given explicitly
sample2_df = pd.read_csv("../data/sample2.csv", encoding="cp949", delimiter="|")
sample2_df

Unnamed: 0,번호,이름,가입일시,나이
0,1,김정수,2017-01-19 11:30:00,25
1,2,박민구,2017-02-07 10:22:00,35
2,3,정순미,2017-01-22 09:10:00,33
3,4,김정현,2017-02-22 14:09:00,45
4,5,홍미진,2017-04-01 18:00:00,17
5,6,김순철,2017-05-14 22:33:07,22
6,7,이동철,2017-03-01 23:44:45,27
7,8,박지숙,2017-01-11 06:04:18,30
8,9,김은미,2017-02-08 07:44:33,51
9,10,장혁철,2017-12-01 13:01:11,16


- 셀 구분하기 : delimiter="|"
    - csv는 컬럼의 구분이 콤마가 기본임, 다른 걸 사용하는 경우 지정

In [14]:
# index_col=0 uses the first column (번호) as the row index instead of a regular column
sample2_df = pd.read_csv("../data/sample2.csv", encoding="cp949",
                         delimiter="|", index_col=0)
sample2_df

Unnamed: 0_level_0,이름,가입일시,나이
번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,김정수,2017-01-19 11:30:00,25
2,박민구,2017-02-07 10:22:00,35
3,정순미,2017-01-22 09:10:00,33
4,김정현,2017-02-22 14:09:00,45
5,홍미진,2017-04-01 18:00:00,17
6,김순철,2017-05-14 22:33:07,22
7,이동철,2017-03-01 23:44:45,27
8,박지숙,2017-01-11 06:04:18,30
9,김은미,2017-02-08 07:44:33,51
10,장혁철,2017-12-01 13:01:11,16


- index_col : 인덱스로 사용하고자 하는 컬럼

#### 4) excel 파일로 생성

In [15]:
# Load an Excel workbook into a DataFrame (no sheet name is passed here)
sample_df = pd.read_excel("../data/sample.xlsx")
sample_df

Unnamed: 0,Sap Co.,대리점,영업사원,전월,금월,TEAM,총 판매수량
0,KI1316,경기수원대리점,이기정,1720000,2952000,1,123
1,KI1451,충청홍성대리점,정미진,4080000,2706000,2,220
2,KI1534,경기화성대리점,경인선,600000,2214000,1,320
3,KI1636,강원속초대리점,이동권,3720000,2870000,3,110
4,KI1735,경기안양대리점,강준석,4800000,2296000,1,134
5,KI1875,제주제주시대리점,민경수,3420000,2346000,4,210
6,KI1917,경기광주대리점,김진혜,1292000,1518000,1,110
7,KI2032,경기평택대리점,고유정,2736000,2139000,2,90
8,KI2153,경기의정부대리점,김은향,1368000,2484000,1,183
9,KI2214,경기성남대리점,이준수,1976000,1518000,3,73


In [16]:
# Load the train.xlsx dataset; the displayed output elides the middle rows with "..."
train_df = pd.read_excel("../data/dataset/train.xlsx")
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### 조회

In [17]:
# Inspect the row index
friend_df.index

RangeIndex(start=0, stop=3, step=1)

In [18]:
# Inspect the column labels
friend_df.columns

Index(['name', 'age', 'job'], dtype='object')

In [19]:
# Inspect the underlying values as a NumPy array
# NOTE(review): the pandas docs recommend DataFrame.to_numpy() over .values — consider switching
friend_df.values

array([['John', 25, 'student'],
       ['Nate', 34, 'teacher'],
       ['Jenny', 30, 'developer']], dtype=object)

In [20]:
# Inspect the data type of each column
friend_df.dtypes

name    object
age      int64
job     object
dtype: object

### 삭제
* 수정·삭제 메서드(예: drop())는 기본적으로 새로운 DataFrame을 반환하므로, 결과를 따로 반영하지 않으면 원본 dataframe은 바뀌지 않음
* 원본에 직접 반영하려면 inplace=True를 사용하고, 원본을 변경하고 싶지 않으면 결과를 새로운 변수에 담아서 사용하기

#### 1) drop()

In [31]:
# Drop rows by index label; the result is a new DataFrame without rows 0 and 2
friend_df.drop([0,2])

Unnamed: 0,name,age,job
1,Nate,34,teacher


In [32]:
# The original DataFrame is not affected by the drop call
friend_df

Unnamed: 0,name,age,job
0,John,25,student
1,Nate,34,teacher
2,Jenny,30,developer


In [33]:
# Keep the change by assigning the result back to the variable
friend_df = friend_df.drop([0,2])
friend_df

Unnamed: 0,name,age,job
1,Nate,34,teacher


In [37]:
# Rebuild the full DataFrame from friend_dict
friend_df = pd.DataFrame(friend_dict)
friend_df

Unnamed: 0,name,age,job
0,John,25,student
1,Nate,34,teacher
2,Jenny,30,developer


In [35]:
# inplace=True applies the drop to friend_df itself, so no reassignment is needed
friend_df.drop([0,2],inplace = True)

In [36]:
# Check that rows 0 and 2 are gone from friend_df itself
friend_df

Unnamed: 0,name,age,job
1,Nate,34,teacher


- 삭제 결과를 반영하려면 다음 두 가지 방법 중 하나를 사용
    - inplace=True를 줘서 데이터프레임에 직접 반영
    - drop한 결과를 변수에 담아서 사용

In [38]:
# Rebuild friend_df from friend_dict before the column-drop examples
friend_df = pd.DataFrame(friend_dict)
friend_df

Unnamed: 0,name,age,job
0,John,25,student
1,Nate,34,teacher
2,Jenny,30,developer


In [39]:
# Intentional failure: with no axis given, drop searches the row labels (axis=0 by default),
# so "age" is not found and KeyError: "['age'] not found in axis" is raised
friend_df.drop("age")

KeyError: "['age'] not found in axis"

In [40]:
# axis=1 makes drop look for the label among the columns
friend_df.drop("age", axis = 1)

Unnamed: 0,name,job
0,John,student
1,Nate,teacher
2,Jenny,developer


In [42]:
# Alternative: name the column through the columns keyword (same result as axis=1)
friend_df.drop(columns="age")

Unnamed: 0,name,job
0,John,student
1,Nate,teacher
2,Jenny,developer


### 수정

#### 1)  컬럼명 수정

In [43]:
# Current state before renaming the columns
friend_df

Unnamed: 0,name,age,job
0,John,25,student
1,Nate,34,teacher
2,Jenny,30,developer


In [44]:
# Rename every column to Korean by assigning a new list of labels to .columns
friend_df.columns = ['이름','나이','직업']
friend_df

Unnamed: 0,이름,나이,직업
0,John,25,student
1,Nate,34,teacher
2,Jenny,30,developer


#### 2) 인덱스 수정

In [45]:
# Row index before it is relabelled
friend_df.index

RangeIndex(start=0, stop=3, step=1)

In [46]:
# Replace the row index with custom labels by assigning to .index
friend_df.index = ["p1","p2","p3"]
friend_df

Unnamed: 0,이름,나이,직업
p1,John,25,student
p2,Nate,34,teacher
p3,Jenny,30,developer


#### 3) 컬럼 추가

In [47]:
# Assigning to a new column name adds the column at the end by default
friend_df['주소'] = ["서울","경기","부산"]
friend_df

Unnamed: 0,이름,나이,직업,주소
p1,John,25,student,서울
p2,Nate,34,teacher,경기
p3,Jenny,30,developer,부산


In [49]:
# insert(position, column name, values): here the new column is placed at position 1

friend_df.insert(1,'전화번호',['010-1234-5678','010-5678-4568','010-9876-5432'])
friend_df

Unnamed: 0,이름,전화번호,나이,직업,주소
p1,John,010-1234-5678,25,student,서울
p2,Nate,010-5678-4568,34,teacher,경기
p3,Jenny,010-9876-5432,30,developer,부산


#### 4) 행 추가

In [50]:
# Assigning to a label that does not exist yet via .loc appends a new row
friend_df.loc["p4"] = ['Tom','010-9876-1234',28,'tester','대전']
friend_df

Unnamed: 0,이름,전화번호,나이,직업,주소
p1,John,010-1234-5678,25,student,서울
p2,Nate,010-5678-4568,34,teacher,경기
p3,Jenny,010-9876-5432,30,developer,부산
p4,Tom,010-9876-1234,28,tester,대전
