### Python intro

In [None]:
# Suppress warning messages for the rest of the session.
# NOTE: action='ignore' is given without any category/module filter, so every warning is silenced.
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Print the version of each core library used in this notebook so the
# recorded outputs can be matched to a specific environment.
import pandas as pd
print(f"pandas 버전: {pd.__version__}")

import matplotlib
print(f"matplotlib 버전: {matplotlib.__version__}")

import numpy as np
print(f"NumPy 버전: {np.__version__}")

import sklearn
print(f"scikit-learn 버전: {sklearn.__version__}")

pandas 버전: 1.3.4
matplotlib 버전: 3.4.3
NumPy 버전: 1.23.2
scikit-learn 버전: 0.24.2


## Pandas

데이터분석을 위해 사용하는 패키지(Package).  
파이썬을 이용해 엑셀과 같은 역할을 수행한다고 생각하면 편하다.  
판다스는 대용량 데이터를 효율적으로 다룰 수 있기 때문에 빅데이터 분석에 유리하며  
여러가지 복잡한 기능을 구현할 수 있다.

### 1. Load Data

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# CSV files stored in Drive can be read by path in the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Read the College CSV without choosing an index column.
dataframe_no_index = pd.read_csv("/content/drive/MyDrive/machine_learning/1weeks/Practice-01/College.csv")
# shape is the (rows, columns) size of the loaded table.
print(f"data의 shape {dataframe_no_index.shape}")
# Show the first five rows.
dataframe_no_index.head(5)

data의 shape (777, 19)


Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [None]:
# To use a specific column of the CSV (here "Unnamed: 0") as the row index,
# pass index_col="Unnamed: 0" to read_csv.

# Load the College data with the "Unnamed: 0" column as the index.
dataframe = pd.read_csv("/content/drive/MyDrive/machine_learning/1weeks/Practice-01/College.csv", index_col = "Unnamed: 0")
# "Unnamed: 0" is the first column of this CSV, so index_col = 0 would select the same column by position.
# (index_col = False is the option that tells pandas NOT to use the first column as the index.)
dataframe.head(5)

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [None]:
# Get the column labels of dataframe.
dataframe.columns

Index(['Private', 'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc',
       'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books',
       'Personal', 'PhD', 'Terminal', 'S.F.Ratio', 'perc.alumni', 'Expend',
       'Grad.Rate'],
      dtype='object')

In [None]:
# Get the row index labels of dataframe (the college names, since "Unnamed: 0" was used as the index).
dataframe.index

Index(['Abilene Christian University', 'Adelphi University', 'Adrian College',
       'Agnes Scott College', 'Alaska Pacific University', 'Albertson College',
       'Albertus Magnus College', 'Albion College', 'Albright College',
       'Alderson-Broaddus College',
       ...
       'Winthrop University', 'Wisconsin Lutheran College',
       'Wittenberg University', 'Wofford College',
       'Worcester Polytechnic Institute', 'Worcester State College',
       'Xavier University', 'Xavier University of Louisiana',
       'Yale University', 'York College of Pennsylvania'],
      dtype='object', length=777)

### 2. Matrix, Row, Column

#### Column Slicing

In [None]:
# Select a single column by name; the result is a Series.
dataframe['Private']

Abilene Christian University      Yes
Adelphi University                Yes
Adrian College                    Yes
Agnes Scott College               Yes
Alaska Pacific University         Yes
                                 ... 
Worcester State College            No
Xavier University                 Yes
Xavier University of Louisiana    Yes
Yale University                   Yes
York College of Pennsylvania      Yes
Name: Private, Length: 777, dtype: object

특정 column 여러개 가져오기

In [None]:
# To select several columns at once, either (1) collect the column names in a list,
# or (2) write the column names inside double brackets [[...]].
# 1. Collect the column names in a list.
# NOTE(review): col_list is not used anywhere in the visible notebook; dataframe[col_list] would perform the selection.
col_list = ["Private", "Apps" ,"Accept", "Enroll"]


In [None]:
# 2. Use double brackets [[...]]: the inner list names the columns to keep.
df_double = dataframe[["Private", "Apps" ,"Accept", "Enroll"]]
df_double.head()

Unnamed: 0,Private,Apps,Accept,Enroll
Abilene Christian University,Yes,1660,1232,721
Adelphi University,Yes,2186,1924,512
Adrian College,Yes,1428,1097,336
Agnes Scott College,Yes,417,349,137
Alaska Pacific University,Yes,193,146,55
...,...,...,...,...
Worcester State College,No,2197,1515,543
Xavier University,Yes,1959,1805,695
Xavier University of Louisiana,Yes,2097,1915,695
Yale University,Yes,10705,2453,1317


#### Row Slicing

In [None]:
# Rows are sliced by index label through .loc, using the row labels themselves.
# A label slice includes BOTH endpoints.

# Rows at positions 0 through 3 (four rows): 'Abilene Christian University' up to and
# including 'Agnes Scott College'. Despite the variable name, position 4 is not part of the result.
dataframe_0_4 = dataframe.loc['Abilene Christian University':'Agnes Scott College']
dataframe_0_4

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59


In [None]:
# When the index is numeric, .loc still slices by label, so 0:3 returns the rows labelled 0, 1, 2 and 3.
dataframe_no_index.loc[0:3]

Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59


In [None]:
# To pick specific rows, pass a list of index labels to .loc (the same double-bracket form used for columns).
dataframe_no_index.loc[[0, 2, 4]]

Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
2,Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
4,Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


#### Extract Feature

In [None]:
# Boolean filtering: keep only the rows whose 'Private' value equals 'No'.
dataframe[dataframe['Private'] == 'No']

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Angelo State University,No,3540,2001,1016,24,54,4190,1512,5130,3592,500,2000,60,62,23.1,5,4010,34
Appalachian State University,No,7313,4664,1910,20,63,9940,1035,6806,2540,96,2000,83,96,18.3,14,5854,70
Arizona State University Main campus,No,12809,10308,3761,24,49,22593,7585,7434,4850,700,2100,88,93,18.9,5,4602,48
Arkansas Tech University,No,1734,1729,951,12,52,3602,939,3460,2650,450,1000,57,60,19.6,5,4739,48
Auburn University-Main Campus,No,7548,6791,3070,25,57,16262,1716,6300,3933,600,1908,85,91,16.7,18,6642,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Westfield State College,No,3100,2150,825,3,20,3234,941,5542,3788,500,1300,75,79,15.7,20,4222,65
Westmont College,No,950,713,351,42,72,1276,9,14320,5304,490,1410,77,77,14.9,17,8837,87
Winona State University,No,3325,2047,1301,20,45,5800,872,4200,2700,300,1200,53,60,20.2,18,5318,58
Winthrop University,No,2320,1805,769,24,61,3395,670,6400,3392,580,2150,71,80,12.8,26,6729,59


In [None]:
# Boolean indexing:
# keep only the rows where a given column holds a given value.
# NOTE: despite the variable name, this holds the first five rows with Private == 'No'.

dataframe_private = dataframe[dataframe['Private'] == 'No'].head(5)
dataframe_private

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Angelo State University,No,3540,2001,1016,24,54,4190,1512,5130,3592,500,2000,60,62,23.1,5,4010,34
Appalachian State University,No,7313,4664,1910,20,63,9940,1035,6806,2540,96,2000,83,96,18.3,14,5854,70
Arizona State University Main campus,No,12809,10308,3761,24,49,22593,7585,7434,4850,700,2100,88,93,18.9,5,4602,48
Arkansas Tech University,No,1734,1729,951,12,52,3602,939,3460,2650,450,1000,57,60,19.6,5,4739,48
Auburn University-Main Campus,No,7548,6791,3070,25,57,16262,1716,6300,3933,600,1908,85,91,16.7,18,6642,69


In [None]:
# Combine two conditions with & (each one wrapped in parentheses): Private == 'No' AND PhD > 95.
dataframe[(dataframe['Private']== 'No') & (dataframe['PhD'] > 95)]

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
New Mexico Institute of Mining and Tech.,No,787,601,233,40,73,1017,411,5376,3214,600,1100,99,100,13.7,11,9241,34
Texas A&M University at Galveston,No,529,481,243,22,47,1206,134,4860,3122,600,650,103,88,17.4,16,6415,43
University of Alabama at Birmingham,No,1797,1260,938,24,35,6960,4698,4440,5175,750,2200,96,96,6.7,16,16352,33
University of California at Irvine,No,15698,10775,2478,85,100,12677,864,12024,5302,790,1818,96,96,16.1,11,15934,66
University of North Dakota,No,2777,2249,1652,20,54,8334,1435,5634,2703,450,1200,97,97,15.9,16,9424,49
University of Washington,No,12749,7025,3343,40,81,20356,4582,8199,4218,708,2172,96,94,9.0,10,16527,65


In [None]:
# Rows where Private is 'No' and the PhD column is strictly greater than 95.
# NOTE(review): the original comment said "95 or more", but the code uses > 95, so rows
# with PhD == 95 are excluded — confirm which one is intended.
dataframe_pri_phd = dataframe[(dataframe['Private'] == 'No') & (dataframe['PhD'] > 95)]
dataframe_pri_phd

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
New Mexico Institute of Mining and Tech.,No,787,601,233,40,73,1017,411,5376,3214,600,1100,99,100,13.7,11,9241,34
Texas A&M University at Galveston,No,529,481,243,22,47,1206,134,4860,3122,600,650,103,88,17.4,16,6415,43
University of Alabama at Birmingham,No,1797,1260,938,24,35,6960,4698,4440,5175,750,2200,96,96,6.7,16,16352,33
University of California at Irvine,No,15698,10775,2478,85,100,12677,864,12024,5302,790,1818,96,96,16.1,11,15934,66
University of North Dakota,No,2777,2249,1652,20,54,8334,1435,5634,2703,450,1200,97,97,15.9,16,9424,49
University of Washington,No,12749,7025,3343,40,81,20356,4582,8199,4218,708,2172,96,94,9.0,10,16527,65


In [None]:
# Rows with Private == 'No', restricted to four columns, selected with one
# .loc call (boolean row mask + list of column labels).
dataframe.loc[dataframe['Private'] == 'No', ['Apps', 'Accept', 'Enroll', 'Top10perc']]

Unnamed: 0,Apps,Accept,Enroll,Top10perc
Angelo State University,3540,2001,1016,24
Appalachian State University,7313,4664,1910,20
Arizona State University Main campus,12809,10308,3761,24
Arkansas Tech University,1734,1729,951,12
Auburn University-Main Campus,7548,6791,3070,25
...,...,...,...,...
Westfield State College,3100,2150,825,3
Westmont College,950,713,351,42
Winona State University,3325,2047,1301,20
Winthrop University,2320,1805,769,24


In [None]:
# Select the rows where Private == 'No' and keep only the
# "Apps", "Accept", "Enroll", "Top10perc" columns.
# The selection is computed once with a single .loc call (boolean row mask +
# column list) and reused, instead of evaluating the same chained filter twice.
dataframe_pri_col = dataframe.loc[dataframe['Private'] == 'No', ['Apps', 'Accept', 'Enroll', 'Top10perc']]

# shape is the (rows, columns) size of the selection.
print(dataframe_pri_col.shape)

# Show the first five rows of the selection.
dataframe_pri_col.head()

(212, 4)


Unnamed: 0,Apps,Accept,Enroll,Top10perc
Angelo State University,3540,2001,1016,24
Appalachian State University,7313,4664,1910,20
Arizona State University Main campus,12809,10308,3761,24
Arkansas Tech University,1734,1729,951,12
Auburn University-Main Campus,7548,6791,3070,25


#### Add and remove columns

In [None]:
# Assigning a scalar to a new column name adds that column to dataframe in place
# and fills every row with the value.
dataframe['master'] = 50
# Show the new column next to 'Private'.
print(dataframe[['Private','master']])

                               Private  master
Abilene Christian University       Yes      50
Adelphi University                 Yes      50
Adrian College                     Yes      50
Agnes Scott College                Yes      50
Alaska Pacific University          Yes      50
...                                ...     ...
Worcester State College             No      50
Xavier University                  Yes      50
Xavier University of Louisiana     Yes      50
Yale University                    Yes      50
York College of Pennsylvania       Yes      50

[777 rows x 2 columns]


In [None]:
# add and remove columns

# Add a column with the form: dataframe[new_column_name] = value
dataframe['master'] = 50

# Show only the master column:
# every row of the column has been filled with the value 50.
dataframe[['master']].head()

Unnamed: 0,master
Abilene Christian University,50
Adelphi University,50
Adrian College,50
Agnes Scott College,50
Alaska Pacific University,50


column을 제거하는 방법은 두 가지가 있다. 첫번째는 남길 column만 골라 새 데이터프레임을 만드는 것이고, <br>
두번째는 drop 메서드를 직접 사용하는 것이다.<br>

다른 데이터를 불러와서 좀 더 빠른 이해를 해보자<br>
(기존에 쓰던 데이터는 column 명이 너무 복잡하다)

In [None]:
# Load the iris dataset; 'iris.csv' is a relative path, so it is resolved against the current working directory.
iris = pd.read_csv('iris.csv')
iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [None]:
# 1. Re-select only the columns to keep.
# List the wanted column names and index the DataFrame with that list;
# 'variety' is left out, so new_iris does not contain it.

print("variety 컬럼 삭제")
new_col = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
new_iris = iris[new_col]
new_iris.head()

variety 컬럼 삭제


Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# Same technique on the College data: keep only the listed columns.
New_col = ['Private', 'Apps','Accept']
new_dataframe = dataframe[New_col]
print(new_dataframe.head())

                             Private  Apps  Accept
Abilene Christian University     Yes  1660    1232
Adelphi University               Yes  2186    1924
Adrian College                   Yes  1428    1097
Agnes Scott College              Yes   417     349
Alaska Pacific University        Yes   193     146


In [None]:
# Drop the 'Private' column; axis = 1 means the label refers to a column.
# The result of drop is stored in a new variable, drop_dataframe.
drop_dataframe = new_dataframe.drop('Private', axis = 1)
print(drop_dataframe)

                                 Apps  Accept
Abilene Christian University     1660    1232
Adelphi University               2186    1924
Adrian College                   1428    1097
Agnes Scott College               417     349
Alaska Pacific University         193     146
...                               ...     ...
Worcester State College          2197    1515
Xavier University                1959    1805
Xavier University of Louisiana   2097    1915
Yale University                 10705    2453
York College of Pennsylvania     2989    1855

[777 rows x 2 columns]


In [None]:
# 2. Use the drop method.
# To drop a column by label this way, pass axis=1; axis=0 (the default) removes rows instead.

new_iris1 = iris.drop('sepal.length', axis = 1)
new_iris1.head()

Unnamed: 0,sepal.width,petal.length,petal.width,variety
0,3.5,1.4,0.2,Setosa
1,3.0,1.4,0.2,Setosa
2,3.2,1.3,0.2,Setosa
3,3.1,1.5,0.2,Setosa
4,3.6,1.4,0.2,Setosa
