In [2]:
import pandas as pd
import numpy as np

# User Guide
# 10 minutes to pandas
이것은 주로 신규 사용자를 대상으로 하는 pandas에 대한 짧은 소개입니다.
## Object creation (객체 생성)
값의 list를 전달하여 Series를 생성합니다. 인덱스를 지정하지 않으면 pandas가 기본 정수 인덱스를 생성합니다.

In [4]:
# Build a Series from a plain Python list of values. No index is passed, so
# pandas assigns the default integer index; np.nan marks a missing entry.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

`date_range()`로 만든 datetime 인덱스와 레이블이 지정된 열을 사용하여, NumPy 배열을 전달해 DataFrame을 만듭니다.

In [5]:
# Six consecutive calendar days starting 2013-01-01; used below as the row
# index of the demo DataFrame.
dates = pd.date_range(start="20130101", periods=6, freq="D")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
# 6x4 frame of standard-normal samples: rows are labelled by `dates`,
# columns by the letters A-D.
# NOTE(review): no seed is set in the visible cells, so a re-run will not
# reproduce the numbers shown in the saved output.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list("ABCD"),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.971705,0.120595,0.164994,-0.251434
2013-01-02,0.070716,-0.514656,0.516765,-1.691488
2013-01-03,-1.266724,-1.471358,-1.226137,0.790024
2013-01-04,-0.566069,-0.235745,0.287757,0.297955
2013-01-05,0.615365,-0.533185,0.894591,-0.786175
2013-01-06,2.046327,1.510082,0.618119,-0.161702


In [7]:
# Build a DataFrame from a dict whose values mix scalars and array-likes.
# Each key becomes a column; scalar entries are repeated for every row, and
# each column keeps its own dtype (inspected in the cells that follow).
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [8]:
# The columns of a DataFrame can each have a different dtype.
df2.dtypes


A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [9]:
# info() prints a summary of the frame: index type, per-column non-null
# counts and dtypes, and memory usage.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   A       4 non-null      float64       
 1   B       4 non-null      datetime64[ns]
 2   C       4 non-null      float32       
 3   D       4 non-null      int32         
 4   E       4 non-null      category      
 5   F       4 non-null      object        
dtypes: category(1), datetime64[ns](1), float32(1), float64(1), int32(1), object(1)
memory usage: 288.0+ bytes


## Viewing data
* head()
* tail()
* index
* columns
* describe()
* T
* sort_index()
* sort_values()

In [10]:
# head(): view the top rows of the frame (the first five when called without an argument).
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.971705,0.120595,0.164994,-0.251434
2013-01-02,0.070716,-0.514656,0.516765,-1.691488
2013-01-03,-1.266724,-1.471358,-1.226137,0.790024
2013-01-04,-0.566069,-0.235745,0.287757,0.297955
2013-01-05,0.615365,-0.533185,0.894591,-0.786175


In [11]:
# tail(2): view the bottom two rows of the frame.
df.tail(2)

Unnamed: 0,A,B,C,D
2013-01-05,0.615365,-0.533185,0.894591,-0.786175
2013-01-06,2.046327,1.510082,0.618119,-0.161702


In [12]:
# index: the row labels of the frame (here a DatetimeIndex of six days).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [13]:
# columns: the column labels of the frame.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

DataFrame.to_numpy()는 기초 데이터의 NumPy 표현을 제공합니다.  
데이터 프레임에 데이터 유형이 다른 열이 있는 경우 이 작업은 비용이 많이 드는 작업이 될 수 있는데,  
이는 pandas와 NumPy의 근본적인 차이에서 비롯됩니다.  
NumPy 배열에는 전체 배열에 대해 하나의 dtype이 있는 반면, pandas 데이터프레임에는 열당 하나의 dtype이 있습니다.  
DataFrame.to_numpy()를 호출하면 pandas는 데이터프레임에 있는 모든 dtype을 담을 수 있는 NumPy dtype을 찾습니다. 이 dtype은 결국 object가 될 수 있으며, 그 경우 모든 값을 파이썬 객체로 형변환해야 합니다.
* DataFrame.to_numpy()는 출력에 인덱스 또는 열 레이블을 포함하지 않습니다.

In [14]:
# All-floating-point case:
# for a DataFrame holding only floating-point values, DataFrame.to_numpy() is
# fast and does not need to copy the data.
df.to_numpy()

array([[-0.97170475,  0.12059468,  0.16499441, -0.25143421],
       [ 0.07071562, -0.51465585,  0.51676534, -1.69148836],
       [-1.2667242 , -1.47135762, -1.22613652,  0.79002358],
       [-0.5660689 , -0.23574452,  0.28775727,  0.29795469],
       [ 0.61536538, -0.53318454,  0.89459076, -0.78617474],
       [ 2.04632744,  1.51008169,  0.61811896, -0.16170177]])

In [15]:
# For a DataFrame with several dtypes, DataFrame.to_numpy() is relatively
# expensive: the result here is a single object-dtype array.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [16]:
# describe()
# Shows a quick statistical summary of the data.

df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.012015,-0.187378,0.209348,-0.30047
std,1.219795,0.985277,0.748316,0.864846
min,-1.266724,-1.471358,-1.226137,-1.691488
25%,-0.870296,-0.528552,0.195685,-0.65249
50%,-0.247677,-0.3752,0.402261,-0.206568
75%,0.479203,0.03151,0.592781,0.183041
max,2.046327,1.510082,0.894591,0.790024


In [17]:
# T
# Transpose the data: rows and columns swap places.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.971705,0.070716,-1.266724,-0.566069,0.615365,2.046327
B,0.120595,-0.514656,-1.471358,-0.235745,-0.533185,1.510082
C,0.164994,0.516765,-1.226137,0.287757,0.894591,0.618119
D,-0.251434,-1.691488,0.790024,0.297955,-0.786175,-0.161702


In [18]:
# sort_index(): sort by the labels along an axis
# (axis=1 with ascending=False orders the column labels D, C, B, A).
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.251434,0.164994,0.120595,-0.971705
2013-01-02,-1.691488,0.516765,-0.514656,0.070716
2013-01-03,0.790024,-1.226137,-1.471358,-1.266724
2013-01-04,0.297955,0.287757,-0.235745,-0.566069
2013-01-05,-0.786175,0.894591,-0.533185,0.615365
2013-01-06,-0.161702,0.618119,1.510082,2.046327


In [19]:
# sort_values(): sort by values (here the rows are ordered by column "B", ascending).
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-03,-1.266724,-1.471358,-1.226137,0.790024
2013-01-05,0.615365,-0.533185,0.894591,-0.786175
2013-01-02,0.070716,-0.514656,0.516765,-1.691488
2013-01-04,-0.566069,-0.235745,0.287757,0.297955
2013-01-01,-0.971705,0.120595,0.164994,-0.251434
2013-01-06,2.046327,1.510082,0.618119,-0.161702
