In [1]:
# Pandas basics: a tour of the core API

# Bring pandas into scope (NumPy is imported alongside it)
import pandas as pd
import numpy as np

# Build a Series from a plain list. No index is supplied, so the
# entries are labelled 0..3 automatically.
obj = pd.Series(data=[4, 3, 2, 1])
obj

0    4
1    3
2    2
3    1
dtype: int64

In [9]:
# Look at only the values stored in the Series

print(obj.values)
# Elements labelled 0 and 1, one per line
print(obj[0], obj[1], sep="\n")

[4 3 2 1]
4
3


In [11]:
# Look at only the index of the Series, then at its element dtype

print(obj.index)
obj.dtype

RangeIndex(start=0, stop=4, step=1)


dtype('int64')

In [12]:
# The index does not have to be 0..n-1: custom labels can be supplied.
obj2 = pd.Series(data=[4, 7, -5, 3], index=list("dbac"))
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [13]:
# A Python dictionary can be converted into a Series directly;
# each dictionary key becomes an index label of the Series.
sdata = dict(Kim=35000, Beomwoo=67000, Joan=12000, Choi=4000)
obj3 = pd.Series(data=sdata)
obj3

Kim        35000
Beomwoo    67000
Joan       12000
Choi        4000
dtype: int64

In [14]:
# Give the index a name, then name the Series itself; both names
# appear in the printed representation.
obj3.index.name = "Names"
obj3.name = 'Salary'
obj3

Names
Kim        35000
Beomwoo    67000
Joan       12000
Choi        4000
Name: Salary, dtype: int64

In [15]:
# Replace every index label at once (one new label per element).
obj3.index = list("ABCD")

In [16]:
# Display the Series to check its current index labels
obj3

A    35000
B    67000
C    12000
D     4000
Name: Salary, dtype: int64

In [18]:
# Defining a DataFrame
# The data that goes into the DataFrame must be defined first;
# a Python dictionary or a NumPy array can be used for this.
data = dict(
    name=['Beomwoo', 'Beomwoo', 'Beomwoo', 'Kim', 'Park'],
    year=[2013, 2014, 2015, 2016, 2015],
    points=[1.5, 1.7, 3.6, 2.4, 2.9],
)
print(data)
# Each dictionary key becomes a column; rows get a default 0..n-1 index.
df = pd.DataFrame(data=data)
df

{'name': ['Beomwoo', 'Beomwoo', 'Beomwoo', 'Kim', 'Park'], 'year': [2013, 2014, 2015, 2016, 2015], 'points': [1.5, 1.7, 3.6, 2.4, 2.9]}


Unnamed: 0,name,year,points
0,Beomwoo,2013,1.5
1,Beomwoo,2014,1.7
2,Beomwoo,2015,3.6
3,Kim,2016,2.4
4,Park,2015,2.9


In [19]:
# Print the Python type of df
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [21]:
# Row labels (index) of the DataFrame
df.index

RangeIndex(start=0, stop=5, step=1)

In [22]:
# Column labels of the DataFrame
df.columns

Index(['name', 'year', 'points'], dtype='object')

In [23]:
# Cell contents as a NumPy array, without the row/column labels
df.values

array([['Beomwoo', 2013, 1.5],
       ['Beomwoo', 2014, 1.7],
       ['Beomwoo', 2015, 3.6],
       ['Kim', 2016, 2.4],
       ['Park', 2015, 2.9]], dtype=object)

In [24]:
# Column order and row labels can be chosen while building a DataFrame.
# 'penalty' is requested as a column although `data` has no such key.
df2 = pd.DataFrame(
    data=data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'name', 'points', 'penalty'],
)
df2

Unnamed: 0,year,name,points,penalty
one,2013,Beomwoo,1.5,
two,2014,Beomwoo,1.7,
three,2015,Beomwoo,3.6,
four,2016,Kim,2.4,
five,2015,Park,2.9,


In [25]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
df2.describe()

Unnamed: 0,year,points
count,5.0,5.0
mean,2014.6,2.42
std,1.140175,0.864292
min,2013.0,1.5
25%,2014.0,1.7
50%,2015.0,2.4
75%,2015.0,2.9
max,2016.0,3.6


In [26]:
# Rebuild `data` and `df` with a new example; 'penalty' is listed as a
# column even though `data` has no such key.
data = dict(
    names=["Kilho", "Kilho", "Kilho", "Charles", "Charles"],
    year=[2014, 2015, 2016, 2015, 2016],
    points=[1.5, 1.7, 3.6, 2.4, 2.9],
)
df = pd.DataFrame(
    data=data,
    index=["one", "two", "three", "four", "five"],
    columns=["year", "names", "points", "penalty"],
)
df

Unnamed: 0,year,names,points,penalty
one,2014,Kilho,1.5,
two,2015,Kilho,1.7,
three,2016,Kilho,3.6,
four,2015,Charles,2.4,
five,2016,Charles,2.9,


In [27]:
# Select a single column by name
df['year']

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [28]:
# The same column, shown through print() as plain text
print(df['year'])

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64


In [33]:
# Select several columns at once by passing a list of column names
df[['year', 'points']]

Unnamed: 0,year,points
one,2014,1.5
two,2015,1.7
three,2016,3.6
four,2015,2.4
five,2016,2.9


In [34]:
# Fill the 'penalty' column. Whether one value or several values are
# assigned, keep in mind that the column is represented as a Series.
df['penalty'] = [1, 2, 3, 4, 5]

In [35]:
# Check the raw contents after filling 'penalty'
df.values

array([[2014, 'Kilho', 1.5, 1],
       [2015, 'Kilho', 1.7, 2],
       [2016, 'Kilho', 3.6, 3],
       [2015, 'Charles', 2.4, 4],
       [2016, 'Charles', 2.9, 5]], dtype=object)

In [37]:
# Add a new column.
# The values run 0..n-1 (despite the column being called 'zeros'); the
# length is taken from the frame so it always matches the row count
# instead of relying on a hard-coded 5.
df['zeros'] = np.arange(len(df))

In [38]:
# Check the raw contents after adding the 'zeros' column
df.values

array([[2014, 'Kilho', 1.5, 1, 0],
       [2015, 'Kilho', 1.7, 2, 1],
       [2016, 'Kilho', 3.6, 3, 2],
       [2015, 'Charles', 2.4, 4, 3],
       [2016, 'Charles', 2.9, 5, 4]], dtype=object)

In [39]:
# A Series can be added as a column as well. This one carries values
# for only three labels: 'two', 'four' and 'five'.
val = pd.Series({'two': -1.2, 'four': -1.5, 'five': -1.7})

In [40]:
# Assign the Series as a new column. Values are matched to rows by index
# label, so rows whose label is absent from `val` end up as NaN.
df['debt'] = val

In [41]:
# Display the DataFrame including the new 'debt' column
df

Unnamed: 0,year,names,points,penalty,zeros,debt
one,2014,Kilho,1.5,1,0,
two,2015,Kilho,1.7,2,1,-1.2
three,2016,Kilho,3.6,3,2,
four,2015,Charles,2.4,4,3,-1.5
five,2016,Charles,2.9,5,4,-1.7


In [42]:
# Derive new columns from existing ones:
#   net_points  = points minus penalty (element-wise)
#   high_points = whether net_points exceeds 2.0
df['net_points'] = df['points'].sub(df['penalty'])
df['high_points'] = df['net_points'].gt(2.0)

In [43]:
# Raw contents including the derived columns
df.values

array([[2014, 'Kilho', 1.5, 1, 0, nan, 0.5, False],
       [2015, 'Kilho', 1.7, 2, 1, -1.2, -0.30000000000000004, False],
       [2016, 'Kilho', 3.6, 3, 2, nan, 0.6000000000000001, False],
       [2015, 'Charles', 2.4, 4, 3, -1.5, -1.6, False],
       [2016, 'Charles', 2.9, 5, 4, -1.7, -2.1, False]], dtype=object)

In [44]:
# Display the DataFrame with the derived columns
df

Unnamed: 0,year,names,points,penalty,zeros,debt,net_points,high_points
one,2014,Kilho,1.5,1,0,,0.5,False
two,2015,Kilho,1.7,2,1,-1.2,-0.3,False
three,2016,Kilho,3.6,3,2,,0.6,False
four,2015,Charles,2.4,4,3,-1.5,-1.6,False
five,2016,Charles,2.9,5,4,-1.7,-2.1,False


In [45]:
# Remove columns: drop the three helper columns in one call
df = df.drop(columns=['high_points', 'net_points', 'zeros'])

In [46]:
# Show only the first row to check which columns remain
df.head(1)

Unnamed: 0,year,names,points,penalty,debt
one,2014,Kilho,1.5,1,


In [47]:
# Remove the 'debt' column as well
df = df.drop(columns='debt')

In [48]:
# First value of the 'year' column, selected by position.
# The index holds string labels ('one', 'two', ...), so an integer key
# passed to Series[...] would rely on pandas' deprecated positional
# fallback; .iloc states the positional intent explicitly.
df['year'].iloc[0]

2014

In [49]:
# Column labels that remain after the deletions
df.columns

Index(['year', 'names', 'points', 'penalty'], dtype='object')

In [50]:
# Name both axes: the column axis is called "info", the row index "Order"
df.columns.name = "info"
df.index.name = "Order"

In [51]:
# Display the DataFrame with the axis names set
df

info,year,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,2014,Kilho,1.5,1
two,2015,Kilho,1.7,2
three,2016,Kilho,3.6,3
four,2015,Charles,2.4,4
five,2016,Charles,2.9,5


In [52]:
# Take the rows at positions 0 through 2 (3 - 1).
# The row at the stop position itself is excluded.
df.iloc[:3]

info,year,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,2014,Kilho,1.5,1
two,2015,Kilho,1.7,2
three,2016,Kilho,3.6,3


In [55]:
# Slice rows by index label: everything from the row labelled "three" onward
df.loc["three":]

info,year,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
three,2016,Kilho,3.6,3
four,2015,Charles,2.4,4
five,2016,Charles,2.9,5


In [56]:
# Reference: https://doorbw.tistory.com/172