#### 분석용 함수 사용하기

In [2]:
import pandas as pd
import numpy as np

In [3]:
data = [
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75, -1.3]
]
data

[[1.4, nan], [7.1, -4.5], [nan, nan], [0.75, -1.3]]

In [None]:
df =\
    pd.DataFrame(
    data, 
    index = ['a', 'b', 'c', 'd'],
    columns = ['one', 'two'],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [9]:
df.info()

<class 'pandas.DataFrame'>
Index: 4 entries, a to d
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   one     3 non-null      float64
 1   two     2 non-null      float64
dtypes: float64(2)
memory usage: 96.0+ bytes


In [11]:
# 열방향 합계 구하기
# pandas 에서는 nan 을 빼고 계산을 함
df.sum(
    axis='rows'
)

one    9.25
two   -5.80
dtype: float64

In [12]:
# 행방향 합계 구하기
# pandas 에서는 nan 을 빼고 계산을 함
df.sum(
    axis='columns'
)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [None]:
# 특정 열의 합계 구하기
# df.one => series 1차원 데이터라서 방향을 줄 필요가 X
df.one.sum()

np.float64(9.25)

In [15]:
# 특정 행의 합계 구하기
df.loc['b'].sum()

np.float64(2.5999999999999996)

In [21]:

df.mean(
    axis = 'rows',
    skipna=True
)

one    3.083333
two   -2.900000
dtype: float64

In [27]:
# NaN 처리
df.one.fillna(value=df.one.mean(), inplace=True)
df

C:\Users\tjoeun\AppData\Local\Temp\ipykernel_14040\1853566580.py:2: ChainedAssignmentError: A value is being set on a copy of a DataFrame or Series through chained assignment using an inplace method.
Such inplace method never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy (due to Copy-on-Write).

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' instead, to perform the operation inplace on the original object, or try to avoid an inplace operation using 'df[col] = df[col].method(value)'.

See the documentation for a more detailed explanation: https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html
  df.one.fillna(value=df.one.mean(), inplace=True)


Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


-----
#### DataFrame Merging(병합) 
###### 키값 중심으로 데이터를 합치는것

In [31]:
df1 = pd.DataFrame(
    {
        'key': list('bbacaab'),
        'data1' : range(7)
    }
)
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [32]:
df2 = pd.DataFrame({
    'key': list('abd'),
    'data2':range(3)
})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [None]:
# 병합하기 : 기본값은 inner join
pd.merge(
    df1,
    df2,
    on='key'
) # 교집합. 

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,a,2,0
3,a,4,0
4,a,5,0
5,b,6,1


In [35]:
# 병합하기 : outer join
pd.merge(
    df1,
    df2,
    on='key',
    how='outer'
) # 교집합. 

Unnamed: 0,key,data1,data2
0,a,2.0,0.0
1,a,4.0,0.0
2,a,5.0,0.0
3,b,0.0,1.0
4,b,1.0,1.0
5,b,6.0,1.0
6,c,3.0,
7,d,,2.0


In [36]:
# 병합하기 : left join
pd.merge(
    df1,
    df2,
    on='key',
    how='left'
) # 교집합. 

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,1,1.0
2,a,2,0.0
3,c,3,
4,a,4,0.0
5,a,5,0.0
6,b,6,1.0


In [37]:
# 병합하기 : right join
pd.merge(
    df1,
    df2,
    on='key',
    how='right'
) # 교집합. 

Unnamed: 0,key,data1,data2
0,a,2.0,0
1,a,4.0,0
2,a,5.0,0
3,b,0.0,1
4,b,1.0,1
5,b,6.0,1
6,d,,2


In [45]:
df3 = pd.DataFrame(
    {
        'rkey': list('bbaaab'),
        'data1':range(6)
    }
)
df3

Unnamed: 0,rkey,data1
0,b,0
1,b,1
2,a,2
3,a,3
4,a,4
5,b,5


In [40]:
df4 = pd.DataFrame(
    {
        'rkey': list('abd'),
        'data2':range(3)
    }
)
df4

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [47]:
pd.merge(
    df3,
    df4,
    left_on='rkey',
    right_on='rkey',
    how='inner'
)

Unnamed: 0,rkey,data1,data2
0,b,0,1
1,b,1,1
2,a,2,0
3,a,3,0
4,a,4,0
5,b,5,1


----
#### Dataframe의 Concatenating


In [48]:
pd.concat(
    [df1, df2],
    axis='columns',
    sort=True
)

Unnamed: 0,key,data1,key.1,data2
0,b,0,a,0.0
1,b,1,b,1.0
2,a,2,d,2.0
3,c,3,,
4,a,4,,
5,a,5,,
6,b,6,,


In [56]:
df5 = pd.DataFrame(
    {
        'k1':['one']*3 + ['two'] *4,
        'k2':[1, 1, 2, 3, 3, 4,4]

    }
)
df5


Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [58]:
# 데이터프레임 중복값 제거
df5.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [63]:
# 중복값 제거
df5.drop_duplicates(inplace=True)
df5.reset_index(drop=True)

Unnamed: 0,k1,k2
0,one,1
1,one,2
2,two,3
3,two,4


In [64]:
# 데이터의 범위 구하기
ages = [20, 22, 25, 27, 23, 37, 31, 61 ,45,41,32]
bins = [18, 25, 35, 60, 100]

In [66]:
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 11
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [None]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [None]:
cats.codes  

array([0, 0, 0, 1, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [69]:
cats.value_counts()

(18, 25]     4
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

In [70]:
# catogory 이름 정하기
group_names = ['Youth', 'YoungAdult', 'MiddleAge', 'Senior']
cat2 = pd.cut(ages, bins, labels = group_names)
cat2.value_counts()

Youth         4
YoungAdult    3
MiddleAge     3
Senior        1
Name: count, dtype: int64

In [71]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')