In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pylab as plt
import scipy as sp
import scipy.stats as stats

In [31]:
# Build a float32 Series holding 0..4; no index is passed, so pandas
# assigns the default integer index.
s = pd.Series(data=np.arange(5), dtype=np.float32)
s

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float32

In [42]:
# indexing, slicing
# s carries the default integer index 0..4, so label 0 and position 0 coincide.

# first value
print(s[0])
print(s.iloc[0])

# values at index 1 through 3 (the slice end, 4, is excluded)
print(s[1:4])
print(s.iloc[1:4])

# return only the values greater than or equal to the overall mean
print(s[s >= s.mean()])

# select index 4, 2, 0, in that order
print(s[[4, 2, 0]])
print(s.iloc[[4, 2, 0],])


0.0
0.0
1    1.0
2    2.0
3    3.0
dtype: float32
1    1.0
2    2.0
3    3.0
dtype: float32
2    2.0
3    3.0
4    4.0
dtype: float32
4    4.0
2    2.0
0    0.0
dtype: float32
4    4.0
2    2.0
0    0.0
dtype: float32


In [43]:
# Float32 Series holding 0..4, labelled with the letters a..e.
s1 = pd.Series(
    data=list(range(5)),
    index=list("abcde"),
    dtype="float32",
)
s1

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float32

In [58]:
# Label-based selection of a, b, e (a list of labels goes through .loc).
print(s1.loc[["a", "b", "e"]])

# c through e: label slices include the end label, positional slices do not.
print(s1.loc["c":"e"])
print(s1.iloc[2:5])
print(s1.loc["c":])

# Select a, b, c, first by position and then by label.
# Positions go through .iloc: s1 has a string index, and handing integers to
# get() relies on a positional fallback that pandas has deprecated; once the
# fallback is removed the lookup fails and get() silently returns None.
print(s1.iloc[[0, 1, 2]])
# get() takes labels and returns its default (None) instead of raising.
print(s1.get(["a", "b", "c"]))



a    0.0
b    1.0
e    4.0
dtype: float32
c    2.0
d    3.0
e    4.0
dtype: float32
c    2.0
d    3.0
e    4.0
dtype: float32
c    2.0
d    3.0
e    4.0
dtype: float32
a    0.0
b    1.0
c    2.0
dtype: float32
a    0.0
b    1.0
c    2.0
dtype: float32


In [60]:
# Append new entries by assigning to labels that do not exist yet.
# Both plain [] and .loc enlarge the Series when the label is missing.
s1["f"] = 100
s1.loc["g"] = 50
s1

a      0.0
b      1.0
c      2.0
d      3.0
e      4.0
f    100.0
g     50.0
dtype: float64

In [62]:
# Overwrite existing entries by label.

# a single label
s1.at["c"] = 777
print(s1)

# a label slice d..f (both ends included)
s1.loc[slice("d", "f")] = 111
print(s1)

a      0.0
b      1.0
c    777.0
d      3.0
e      4.0
f    100.0
g     50.0
dtype: float64
a      0.0
b      1.0
c    777.0
d    111.0
e    111.0
f    111.0
g     50.0
dtype: float64


In [66]:
# Membership tests on a Series.

# Is the label present? Checked against the index explicitly.
print("a" in s1.index)
print("x" in s1.index)

# Is the value present? Checked against the underlying array.
print(777 in s1.to_numpy())
print(-1241 in s1.to_numpy())

True
False
True
False


In [150]:
# DataFrame: creation, selection and removal of rows and columns

# Build a 4x3 float frame row by row; the last C3 value is missing (NaN).
df = pd.DataFrame(
    data=[
        [0., 4., 8.],
        [1., 5., 9.],
        [2., 6., 10.],
        [3., 7., np.nan],
    ],
    index=['R1', 'R2', 'R3', 'R4'],
    columns=['C1', 'C2', 'C3'],
)

# Row labels, column labels, then both together as a list.
print(df.index)
print(df.columns)
print(df.axes)
df



Index(['R1', 'R2', 'R3', 'R4'], dtype='object')
Index(['C1', 'C2', 'C3'], dtype='object')
[Index(['R1', 'R2', 'R3', 'R4'], dtype='object'), Index(['C1', 'C2', 'C3'], dtype='object')]


Unnamed: 0,C1,C2,C3
R1,0.0,4.0,8.0
R2,1.0,5.0,9.0
R3,2.0,6.0,10.0
R4,3.0,7.0,


In [70]:
# Build a new frame from df, keeping only the requested labels.

# Rows R1 and R3 (every column is kept).
df_r1r3 = pd.DataFrame(data=df, index=["R1", "R3"])
df_r1r3

Unnamed: 0,C1,C2,C3
R1,0.0,4.0,8.0
R3,2.0,6.0,10.0


In [73]:
# Columns C3 and C1, in that order (every row is kept).
df_c3c1 = pd.DataFrame(data=df, columns=["C3", "C1"])
df_c3c1

Unnamed: 0,C3,C1
R1,8.0,0.0
R2,9.0,1.0
R3,10.0,2.0
R4,,3.0


In [76]:
# Rows R4, R2 and columns C3, C1, each in the order listed.
df_r4r2_c3c1 = pd.DataFrame(
    data=df,
    columns=["C3", "C1"],
    index=["R4", "R2"],
)
df_r4r2_c3c1

Unnamed: 0,C3,C1
R4,,3.0
R2,9.0,1.0


In [79]:
# selecting columns

# c3, c1 -- three spellings of the same selection
print(df.loc[:, ["C3", "C1"]])  # by column label
print(df.iloc[:, [2, 0]])  # by column position
print(df[["C3", "C1"]])  # plain [] with a list of column labels

      C3   C1
R1   8.0  0.0
R2   9.0  1.0
R3  10.0  2.0
R4   NaN  3.0
      C3   C1
R1   8.0  0.0
R2   9.0  1.0
R3  10.0  2.0
R4   NaN  3.0
      C3   C1
R1   8.0  0.0
R2   9.0  1.0
R3  10.0  2.0
R4   NaN  3.0


In [99]:
# Select rows by index label: R1 and R3, with every column.
print(df.loc[["R1", "R3"]])


     C1   C2    C3
R1  0.0  4.0   8.0
R3  2.0  6.0  10.0


In [101]:
# selecting index and columns

# R1, R2 index, C1, C3 columns
# (rows come from a label slice, which includes the end label R2;
#  columns come from a list of labels)
print(df.loc["R1":"R2", ["C1", "C3"]])

     C1   C3
R1  0.0  8.0
R2  1.0  9.0


In [129]:
# Filter rows with a boolean mask.

# keep only the rows whose C1 is less than or equal to 1
print(df.loc[df["C1"] <= 1.0])

# keep only the rows whose C2 is greater than 5
print(df.loc[df["C2"] > 5.0])

     C1   C2   C3
R1  0.0  4.0  8.0
R2  1.0  5.0  9.0
     C1   C2    C3
R3  2.0  6.0  10.0
R4  3.0  7.0   NaN


In [130]:
# Create a new column.

# C5 = C1 + C2 (element-wise, aligned on the row index)
df["C5"] = df["C1"] + df["C2"]
df

Unnamed: 0,C1,C2,C3,C5
R1,0.0,4.0,8.0,4.0
R2,1.0,5.0,9.0,6.0
R3,2.0,6.0,10.0,8.0
R4,3.0,7.0,,10.0


In [151]:
# assign() returns a new frame; here C5 is (re)defined as C1 * C2
# and the result is bound back to df.
df = df.assign(C5=df["C1"] * df["C2"])
df

Unnamed: 0,C1,C2,C3,C5
R1,0.0,4.0,8.0,0.0
R2,1.0,5.0,9.0,5.0
R3,2.0,6.0,10.0,12.0
R4,3.0,7.0,,21.0


In [152]:
# assign() with a callable: the callable receives the frame being assigned to.
# The parameter is named `frame` so it does not shadow the outer `df`.
df = df.assign(C6=lambda frame: frame["C1"] * frame["C2"])
df

Unnamed: 0,C1,C2,C3,C5,C6
R1,0.0,4.0,8.0,0.0,0.0
R2,1.0,5.0,9.0,5.0,5.0
R3,2.0,6.0,10.0,12.0,12.0
R4,3.0,7.0,,21.0,21.0


In [141]:
# drop(), del df[] : removing columns
print(df)

# drop() builds a new frame without columns C5 and C6
df1 = df.drop(columns=["C5", "C6"])
df1


     C1   C2    C3    C5    C6
R1  0.0  4.0   8.0   0.0   0.0
R2  1.0  5.0   9.0   5.0   5.0
R3  2.0  6.0  10.0  12.0  12.0
R4  3.0  7.0   NaN  21.0  21.0


Unnamed: 0,C1,C2,C3
R1,0.0,4.0,8.0
R2,1.0,5.0,9.0
R3,2.0,6.0,10.0
R4,3.0,7.0,


In [143]:
# drop() on rows: a new frame without R2 and R4 (this rebinds df1)
df1 = df.drop(index=["R2", "R4"])
df1

Unnamed: 0,C1,C2,C3,C5,C6
R1,0.0,4.0,8.0,0.0,0.0
R3,2.0,6.0,10.0,12.0,12.0


In [156]:
# del df[...] : remove a column in place

# Keep an independent snapshot first. A plain `df2 = df` only binds a second
# name to the same object, so the deletion below would remove C2 from df2 too.
df2 = df.copy()
print(df2)

# delete column C2 from df (df2 keeps it)
del df["C2"]
df

     C1   C2    C3    C5    C6
R1  0.0  4.0   8.0   0.0   0.0
R2  1.0  5.0   9.0   5.0   5.0
R3  2.0  6.0  10.0  12.0  12.0
R4  3.0  7.0   NaN  21.0  21.0


Unnamed: 0,C1,C3,C5,C6
R1,0.0,8.0,0.0,0.0
R2,1.0,9.0,5.0,5.0
R3,2.0,10.0,12.0,12.0
R4,3.0,,21.0,21.0


In [167]:
# Select columns with a list of column labels kept in a variable.
col_selector = ["C1", "C5"]
df.loc[:, col_selector]

Unnamed: 0,C1,C5
R1,0.0,0.0
R2,1.0,5.0
R3,2.0,12.0
R4,3.0,21.0
