In [3]:
import pandas as pd
import numpy as np

- 시리즈

In [4]:
# Odd multiples of ten below 100, produced directly as a stepped range.
items = list(range(10, 100, 20))
items

[10, 30, 50, 70, 90]

In [5]:
# Wrap the list in a Series; no index is passed, so the labels default to 0..4 (see output).
column = pd.Series(items)
column

0    10
1    30
2    50
3    70
4    90
dtype: int64

In [6]:
# Print the elements stored under labels 0, 2 and 4, one per line.
for label in (0, 2, 4):
    print(column[label])

10
50
90


In [7]:
# Underlying data of the Series as a NumPy array.
column.values

array([10, 30, 50, 70, 90], dtype=int64)

In [8]:
# Confirm the container type returned by .values.
type(column.values)

numpy.ndarray

In [9]:
# Series.tolist() converts each element to a built-in Python scalar, whereas
# list(column.values) keeps NumPy scalar objects (numpy.int64) inside the list.
mylist = column.tolist()
mylist

[10, 30, 50, 70, 90]

- 인덱스

In [10]:
# Index object holding the row labels of the Series.
column.index

RangeIndex(start=0, stop=5, step=1)

In [11]:
# Concrete class of the index object.
type(column.index)

pandas.core.indexes.range.RangeIndex

In [12]:
# Materialise the index labels as a plain Python list.
mylist = column.index.tolist()
mylist

[0, 1, 2, 3, 4]

- 시리즈 연산

In [13]:
# Week-1 values keyed by day label; the dict's insertion order becomes the index order.
week1 = pd.Series({'sat': 290000, 'sun': 310000})
week1

sat    290000
sun    310000
dtype: int64

In [14]:
# Week-2 values; 'sun' is deliberately listed before 'sat'.
week2 = pd.Series({'sun': 120000, 'sat': 220000})
week2

sun    120000
sat    220000
dtype: int64

In [15]:
# Addition aligns on index labels, not position: week2 lists 'sun' first,
# yet each day is summed with the value carrying the same label.
week1 + week2

sat    510000
sun    430000
dtype: int64

In [16]:
# Week-3 values; shares only the 'sat' label with week1.
week3 = pd.Series({'mon': 100000, 'sat': 500000})
week3

mon    100000
sat    500000
dtype: int64

In [17]:
# Labels present in only one operand ('mon', 'sun') come out as NaN,
# and the result dtype is float64 (see output).
week1 + week3

mon         NaN
sat    790000.0
sun         NaN
dtype: float64

In [18]:
# Take the first element by position with .iloc: the index here consists of
# string labels, and plain [0] on such a Series relies on the deprecated
# integer-as-position fallback of Series.__getitem__.
type((week1 + week3).iloc[0])

numpy.float64

In [19]:
# np.nan is an ordinary Python float, consistent with the float64 result of week1 + week3.
type(np.nan)

float

- 시리즈 활용

In [20]:
# Ten evenly spaced values 0, 10, ..., 90 (the integers 0..9 scaled by ten).
column = pd.Series(np.arange(10) * 10)
column

0     0
1    10
2    20
3    30
4    40
5    50
6    60
7    70
8    80
9    90
dtype: int32

- 조건식

In [21]:
# Select 30 <= value <= 70 with a single combined mask. Chaining two [] filters
# would apply the second mask (built from the full `column`) to an already
# filtered Series and depend on implicit index re-alignment.
in1 = column[(column >= 30) & (column <= 70)]
in1

3    30
4    40
5    50
6    60
7    70
dtype: int32

In [22]:
# Keep the values at either extreme: at most 10, or at least 90.
is_low = column <= 10
is_high = column >= 90
in2 = column[is_low | is_high]
in2

0     0
1    10
9    90
dtype: int32

In [23]:
# Element dtype of the Series.
column.dtype

dtype('int32')

In [24]:
# Print a summary: index range, non-null count, dtype and memory usage.
column.info()

<class 'pandas.core.series.Series'>
RangeIndex: 10 entries, 0 to 9
Series name: None
Non-Null Count  Dtype
--------------  -----
10 non-null     int32
dtypes: int32(1)
memory usage: 172.0 bytes


- 타입변환

In [25]:
# Cast to float; the result is bound to x and `column` itself is not reassigned.
x = column.astype('float')
print(x.dtype)
x

float64


0     0.0
1    10.0
2    20.0
3    30.0
4    40.0
5    50.0
6    60.0
7    70.0
8    80.0
9    90.0
dtype: float64

In [26]:
# Cast every element to str; the resulting dtype is reported as object (see output).
y = column.astype('str')
print(y.dtype)
y

object


0     0
1    10
2    20
3    30
4    40
5    50
6    60
7    70
8    80
9    90
dtype: object

- 집계

In [27]:
# Print each summary statistic of `column` next to its label.
summary = [
    ("합계 :", column.sum()),
    ("평균 :", column.mean()),
    ("최대 :", column.max()),
    ("최소 :", column.min()),
    ("분산 :", column.var()),
    ("표준편차 :", column.std()),
    ("중앙값 :", column.median()),
]
for label, value in summary:
    print(label, value)

합계 : 450
평균 : 45.0
최대 : 90
최소 : 0
분산 : 916.6666666666666
표준편차 : 30.276503540974915
중앙값 : 45.0


- 데이터프레임

In [28]:
# Grade table as a list of rows, one inner list per student:
# [grade level, gender, four subject scores]. Column labels are not part of
# this structure; they are supplied separately when the DataFrame is built.
grade_data = [
    [1, "남자", 98, 77, 88, 64],
    [2, "여자", 88, 90, 62, 72],
    [1, "남자", 92, 70, 83, 79],
    [3, "여자", 63, 60, 31, 70],
    [4, "남자", 75, 50, 90, 88],
]

In [29]:
# Without explicit labels, both rows and columns are numbered from 0 (see output).
df = pd.DataFrame(grade_data)
df

Unnamed: 0,0,1,2,3,4,5
0,1,남자,98,77,88,64
1,2,여자,88,90,62,72
2,1,남자,92,70,83,79
3,3,여자,63,60,31,70
4,4,남자,75,50,90,88


In [30]:
# Plain nested-list access: row 1, then element 0.
grade_data[1][0]

2

In [31]:
# DataFrame equivalent via chained indexing: column label 0 first, then row label 1.
df[0][1]

2

In [32]:
# Same cell by integer position: .iloc[row, column].
df.iloc[1, 0]

2

In [33]:
# Row labels (student names) and column labels for the grade table.
# NOTE(review): the frame read from grade.csv later in this notebook shows the same
# scores under 국어, 영어, 수학, 과학 (영어 before 수학) — confirm which subject order is intended.
i_names = ['철수', '영희', '민철', '수현', '호영']
c_names = ['학년', '성별', '국어', '수학', '영어', '과학']

In [34]:
# Rebuild the frame from the row lists, this time with explicit column and index labels.
df = pd.DataFrame(grade_data, columns = c_names, index = i_names)
df

Unnamed: 0,학년,성별,국어,수학,영어,과학
철수,1,남자,98,77,88,64
영희,2,여자,88,90,62,72
민철,1,남자,92,70,83,79
수현,3,여자,63,60,31,70
호영,4,남자,75,50,90,88


In [35]:
# Same table as a dict mapping each column label to its list of values (one entry per student).
grade_dict = {'학년': [1, 2, 1, 3, 4],
 '성별': ['남자', '여자', '남자', '여자', '남자'],
 '국어': [98, 88, 92, 63, 75],
 '수학': [77, 90, 70, 60, 50],
 '영어': [88, 62, 83, 31, 90],
 '과학': [64, 72, 79, 70, 88]}

In [36]:
# The dict keys become the columns; the row labels still come from i_names.
df = pd.DataFrame(grade_dict, index = i_names)
df

Unnamed: 0,학년,성별,국어,수학,영어,과학
철수,1,남자,98,77,88,64
영희,2,여자,88,90,62,72
민철,1,남자,92,70,83,79
수현,3,여자,63,60,31,70
호영,4,남자,75,50,90,88


In [37]:
# Give the index a name; it is rendered as a header for the row labels (see output).
df.index.name = "이름"
df

Unnamed: 0_level_0,학년,성별,국어,수학,영어,과학
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
철수,1,남자,98,77,88,64
영희,2,여자,88,90,62,72
민철,1,남자,92,70,83,79
수현,3,여자,63,60,31,70
호영,4,남자,75,50,90,88


- 딕셔너리를 원소로 갖는 리스트로 생성하기

In [38]:
# Same table as a list of per-student records (one dict per row);
# here the student's name is an ordinary field rather than a row label.
grade_list = [
{'이름' : '철수', '학년': 1, '성별': '남자', '국어': 98, '수학': 77, '영어': 88, '과학': 64},
{'이름' : '영희', '학년': 2, '성별': '여자', '국어': 88, '수학': 90, '영어': 62, '과학': 72},
{'이름' : '민철', '학년': 1, '성별': '남자', '국어': 92, '수학': 70, '영어': 83, '과학': 79},
{'이름' : '수현', '학년': 3, '성별': '여자', '국어': 63, '수학': 60, '영어': 31, '과학': 70},
{'이름' : '호영', '학년': 4, '성별': '남자', '국어': 75, '수학': 50, '영어': 90, '과학': 88}]

In [39]:
# The dict keys become columns; no index is given, so rows are labelled 0..4 (see output).
df = pd.DataFrame(grade_list)
df

Unnamed: 0,이름,학년,성별,국어,수학,영어,과학
0,철수,1,남자,98,77,88,64
1,영희,2,여자,88,90,62,72
2,민철,1,남자,92,70,83,79
3,수현,3,여자,63,60,31,70
4,호영,4,남자,75,50,90,88


In [40]:
# Promote the '이름' column to the index; the result is bound to df2 and df is not reassigned.
df2 = df.set_index('이름')
df2

Unnamed: 0_level_0,학년,성별,국어,수학,영어,과학
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
철수,1,남자,98,77,88,64
영희,2,여자,88,90,62,72
민철,1,남자,92,70,83,79
수현,3,여자,63,60,31,70
호영,4,남자,75,50,90,88


In [41]:
# Reverse step: move the index back into a regular '이름' column and return to integer row labels.
df3 = df2.reset_index()
df3

Unnamed: 0,이름,학년,성별,국어,수학,영어,과학
0,철수,1,남자,98,77,88,64
1,영희,2,여자,88,90,62,72
2,민철,1,남자,92,70,83,79
3,수현,3,여자,63,60,31,70
4,호영,4,남자,75,50,90,88


In [42]:
# drop=True discards the current index instead of turning it into a column.
# The result is only displayed here, not assigned to a name.
df3.reset_index(drop = True)

Unnamed: 0,이름,학년,성별,국어,수학,영어,과학
0,철수,1,남자,98,77,88,64
1,영희,2,여자,88,90,62,72
2,민철,1,남자,92,70,83,79
3,수현,3,여자,63,60,31,70
4,호영,4,남자,75,50,90,88


In [43]:
# Remote location of the grade data in CSV form.
url = "https://data.hossam.kr/pydata/grade.csv"

In [44]:
# Read the CSV, decoding it as EUC-KR, and use the '이름' column as the index.
df = pd.read_csv(url, encoding = 'euc-kr', index_col = '이름')
df

Unnamed: 0_level_0,학년,성별,국어,영어,수학,과학
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
철수,1,남자,98,77,88,64
영희,2,여자,88,90,62,72
민철,1,남자,92,70,83,79
수현,3,여자,63,60,31,70
호영,4,남자,75,50,90,88


In [45]:
# Read the Excel workbook from its URL, again using the '이름' column as the index.
df = pd.read_excel('https://data.hossam.kr/pydata/grade.xlsx', index_col = '이름')
df

Unnamed: 0_level_0,학년,성별,국어,영어,수학,과학
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
철수,1,남자,98,77,88,64
영희,2,여자,88,90,62,72
민철,1,남자,92,70,83,79
수현,3,여자,63,60,31,70
호영,4,남자,75,50,90,88


- 데이터 정보 확인

In [46]:
# Number of dimensions of the frame.
df.ndim

2

In [47]:
# (number of rows, number of columns).
df.shape

(5, 6)

In [48]:
# dtype of each column.
df.dtypes

학년     int64
성별    object
국어     int64
영어     int64
수학     int64
과학     int64
dtype: object

- 상세 정보

In [49]:
# Print a summary: index, columns, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 철수 to 호영
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   학년      5 non-null      int64 
 1   성별      5 non-null      object
 2   국어      5 non-null      int64 
 3   영어      5 non-null      int64 
 4   수학      5 non-null      int64 
 5   과학      5 non-null      int64 
dtypes: int64(5), object(1)
memory usage: 280.0+ bytes


- 데이터 내용 확인

In [50]:
# Selecting a single column label returns that column.
df['국어']

이름
철수    98
영희    88
민철    92
수현    63
호영    75
Name: 국어, dtype: int64

In [51]:
# A single selected column is a Series.
type(df['국어'])

pandas.core.series.Series

In [52]:
# .loc with a row label returns that row.
df.loc['철수']

학년     1
성별    남자
국어    98
영어    77
수학    88
과학    64
Name: 철수, dtype: object

In [53]:
# A single row selected with .loc is also a Series.
type(df.loc['철수'])

pandas.core.series.Series

In [54]:
# One cell via chained indexing: column first, then row label.
df['국어']['철수']

98

In [55]:
# Same cell in a single .loc[row, column] lookup.
df.loc['철수', '국어']

98

In [56]:
# Display the current frame.
df

Unnamed: 0_level_0,학년,성별,국어,영어,수학,과학
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
철수,1,남자,98,77,88,64
영희,2,여자,88,90,62,72
민철,1,남자,92,70,83,79
수현,3,여자,63,60,31,70
호영,4,남자,75,50,90,88


In [57]:
# Overwrite a single cell in place via .loc[row, column], then display the updated frame.
df.loc['철수', '국어'] = 100
df

Unnamed: 0_level_0,학년,성별,국어,영어,수학,과학
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
철수,1,남자,100,77,88,64
영희,2,여자,88,90,62,72
민철,1,남자,92,70,83,79
수현,3,여자,63,60,31,70
호영,4,남자,75,50,90,88


In [58]:
# First two rows.
df.head(2)

Unnamed: 0_level_0,학년,성별,국어,영어,수학,과학
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
철수,1,남자,100,77,88,64
영희,2,여자,88,90,62,72


In [59]:
# Last two rows.
df.tail(2)

Unnamed: 0_level_0,학년,성별,국어,영어,수학,과학
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
수현,3,여자,63,60,31,70
호영,4,남자,75,50,90,88


In [60]:
# Count how often each distinct combination of values across all columns occurs.
df.value_counts()

학년  성별  국어   영어  수학  과학
1   남자  92   70  83  79    1
        100  77  88  64    1
2   여자  88   90  62  72    1
3   여자  63   60  31  70    1
4   남자  75   50  90  88    1
dtype: int64

In [61]:
# Frequency of each distinct value in the '성별' column.
df['성별'].value_counts()

남자    3
여자    2
Name: 성별, dtype: int64

- 데이터 내보내기

In [62]:
# Write the frame to an Excel file using the default options.
df.to_excel('grade.xlsx')

In [63]:
# Write the frame to a CSV file encoded as UTF-8.
df.to_csv('grade.csv', encoding = 'utf-8')

In [64]:
# Write only three of the columns, without the index, and replace their
# labels in the output file with the names given in `header`.
df.to_excel('grade2.xlsx',
            index = False,
            columns = ['학년', '성별', '국어'], 
            header = ['level', 'sex', 'kor'])

In [1]:
from my_scaler import my_scaler
import pandas as pd

In [2]:
# Directory holding the Part3 data files; defined once so the location can be
# changed in a single place instead of inside three separate path strings.
DATA_DIR = "../../../script/Study/Exam/Book/yemoonsaBigdata/datasets/Part3"

# Load the three CSV files of dataset 304 from that directory.
train = pd.read_csv(f"{DATA_DIR}/304_x_train.csv")
test = pd.read_csv(f"{DATA_DIR}/304_x_test.csv")
Y = pd.read_csv(f"{DATA_DIR}/304_y_train.csv")

In [3]:
# Preview the first rows of train.
train.head()

Unnamed: 0,ID,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad
0,1704,26,Private Sector/Self Employed,Yes,1400000,3,1,No,Yes
1,491,28,Private Sector/Self Employed,Yes,1100000,4,1,No,No
2,414,33,Private Sector/Self Employed,Yes,1400000,4,0,No,Yes
3,120,28,Private Sector/Self Employed,Yes,800000,3,1,No,No
4,1268,33,Government Sector,Yes,1000000,5,0,No,Yes


In [4]:
# Summary (count / unique / top / freq) restricted to the object-dtype columns.
train.describe(include = 'O')

Unnamed: 0,Employment Type,GraduateOrNot,FrequentFlyer,EverTravelledAbroad
count,1490,1490,1490,1490
unique,2,2,2,2
top,Private Sector/Self Employed,Yes,No,No
freq,1070,1270,1183,1197


In [11]:
# NOTE(review): my_scaler is a project-local helper that is not shown here. From the
# argument names it presumably scales numeric columns ('mm') and encodes object
# columns ('le'), returning the transformed train/test pair — TODO confirm, and
# consider giving a, b descriptive names.
a, b = my_scaler(train, test, scale = 'mm', obj = 'le')