In [1]:
import numpy as np
import pandas as pd

## 1. DataFrame

#### 1) DataFrame 정의
* Series가 1차원이라면 DataFrame은 2차원
* **Excel Sheet에 비유**하면 이해하기 쉬움
* 인덱스가 **Row, Column**으로 구성
    * **Row**는 각 **개별 데이터**를, **Column**은 **개별 속성**을 의미

In [21]:
# Load the training set from the working directory (data source: Kaggle).
train_data = pd.read_csv('./train.csv')

#### 2) DataFrame 데이터 파악하기
* head, tail 함수
* shape 속성 (row, column)
* describe 함수 (숫자형 데이터의 통계 계산)
* info 함수 (데이터 타입 및 Item 개수 등 출력)

In [8]:
# Preview only the first two rows instead of rendering the whole frame.
train_data.head(n=2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [5]:
# (row count, column count) of the loaded frame.
train_data.shape

(891, 12)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


#### Index와 Columns

In [3]:
## Inspect the row index
train_data.index

RangeIndex(start=0, stop=891, step=1)

In [5]:
## Inspect the column labels
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

#### 3) DataFrame 생성 (Dummy)

In [16]:
# -----------------------------------------------------------------------
# [1] Build a DataFrame from a dictionary
# -----------------------------------------------------------------------
# Each dictionary key becomes a column label; the lists are the column values.
data = dict(a=[1, 2, 3], b=[4, 5, 6], c=[7, 8, 9])

# NOTE: this frame is constructed but not rendered, because a cell only
# displays the value of its last expression.
pd.DataFrame(data, index=list('xyz'))

# -----------------------------------------------------------------------
# [2] Build a DataFrame from Series objects
# -----------------------------------------------------------------------
# Each Series becomes one row; the Series index labels become the columns.
a = pd.Series([100, 200, 300], index=list('abc'))
b = pd.Series([100, 222, 352], index=list('abc'))
c = pd.Series([6, 25, 3110], index=list('abc'))

pd.DataFrame([a, b, c], index=list('xyz'))

Unnamed: 0,a,b,c
x,100,200,300
y,100,222,352
z,6,25,3110


#### 4) 데이터 파일 읽어오기 (read_csv)
* sep : 구분자 설정
* header : Header가 존재하지 않는 경우, None 선언
* index_col : index로 사용할 column 설정
* usecols : 실제로 DataFrame으로 로딩할 컬럼 설정

In [20]:
# Load only the columns of interest and use PassengerId as the row index.
# The result is bound to its own name on purpose: rebinding `train_data` here
# would drop columns such as 'Ticket' that later cells still select, so the
# notebook would fail with a KeyError on Restart & Run All.
train_data_subset = pd.read_csv(
    './train.csv',
    index_col='PassengerId',
    usecols=['PassengerId', 'Name', 'Age'],
)
# Show a short preview rather than dumping the entire frame.
train_data_subset.head(10)

Unnamed: 0_level_0,Name,Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Braund, Mr. Owen Harris",22.0
2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
3,"Heikkinen, Miss. Laina",26.0
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
5,"Allen, Mr. William Henry",35.0
6,"Moran, Mr. James",
7,"McCarthy, Mr. Timothy J",54.0
8,"Palsson, Master. Gosta Leonard",2.0
9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",27.0
10,"Nasser, Mrs. Nicholas (Adele Achem)",14.0


#### 5) Column 선택

In [24]:
train_data['Name'] # single column label (not rendered: only the cell's last expression is displayed)
train_data[['Ticket', 'Name']] # list of column labels; columns come back in the order listed

Unnamed: 0,Ticket,Name
0,A/5 21171,"Braund, Mr. Owen Harris"
1,PC 17599,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,STON/O2. 3101282,"Heikkinen, Miss. Laina"
3,113803,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,373450,"Allen, Mr. William Henry"
5,330877,"Moran, Mr. James"
6,17463,"McCarthy, Mr. Timothy J"
7,349909,"Palsson, Master. Gosta Leonard"
8,347742,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)"
9,237736,"Nasser, Mrs. Nicholas (Adele Achem)"


#### 6) Row 선택
* **Series는 []**를 통해 Row를 선택 -> **DataFrame은 loc 또는 iloc 이용** 하여 Row 선택
* 아래 인덱서(loc, iloc)는 column 선택 시에도 사용 가능
    * loc : 명시된 인덱스 값(label)을 사용
    * iloc : 0부터 시작하는 위치(position) 값을 사용 (명시된 인덱스 값이 무엇이든, 내부적으로 가진 0부터의 순번으로 가져온다)

In [26]:
## Row slicing
# [] normally selects columns; a slice is the one exception and selects rows.
train_data[:3]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [29]:
## Relabel the index with arbitrary values so label-based lookups can be demonstrated
# Labels start at 100 so that label-based (.loc) and position-based (.iloc)
# selection give visibly different results in the following cells.
# The stop value is derived from the frame length instead of being hard-coded,
# so the assignment does not raise a length-mismatch error if the CSV has a
# different number of rows.
train_data.index = np.arange(100, 100 + len(train_data))
train_data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
986,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
987,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
988,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
989,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
990,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [31]:
train_data.loc[989] # single index label (not rendered: only the cell's last expression is displayed)

train_data.loc[[989,100,102]] # list of index labels; rows come back in the order requested

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
989,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
100,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
102,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [36]:
# Even though explicit index labels were assigned, every row can still be
# addressed by its 0-based position.
train_data.iloc[[0,1]]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
100,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
101,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


#### 참고) Column과 Row 를 동시에 가져올 경우

In [37]:
train_data.loc[[989,100,102], ['Name', 'Age', 'Fare']] # row labels and column labels selected together

Unnamed: 0,Name,Age,Fare
989,"Behr, Mr. Karl Howell",26.0,30.0
100,"Braund, Mr. Owen Harris",22.0,7.25
102,"Heikkinen, Miss. Laina",26.0,7.925
