# 머신러닝에 필요한 공부들
1. 확률, 통계, 수열
- 확률(조건부 확률) : 베이즈 정리 (베이지안 필터 : 스팸 메일 분류기) -> 햄/스팸 분류기 (R 머신러닝)

2. 벡터와 행렬
- 벡터 : 신경망
- 행렬 : 다차원 데이터

3. 함수와 미분 
- 미분 : 최적값, optimal(*), 편미분 ( 여러 변수를 가진 함수를 한 변수에 대해서만 미분 )
- 함수 : 미분 가능한 함수
- cost(loss) function을 미분하여 cost가 최소가 되는 가중치(weight), 편향(bias)을 구할 수 있음

4. 예측과 최적화
- 회귀분석 -> 연속형 예측, 최적화 

5. 신경망과 딥러닝 
- 단일 / 멀티 퍼셉트론 -> 깊은 신경망 
- 순전파(forward propagation) 방식으로 입력 신호가 전달
- 역전파(back propagation) 방식으로 가중치 업데이트 

6. 강화학습
- 최적해
- 몬테카를로 기법
- 마르코프 결정과정 

===========================================================================================================
## 데이터베이스
- 데이터베이스 생성, 테이블 생성, 데이터 저장/삭제

## 데이터 웨어하우스 (data warehouse)
- 빅데이터를 저장해놓고, 빅데이터 기반으로 분석하고 정보를 가공(분석하기 용이하게 특화)
- 하둡(hadoop, 빅데이터 분산 처리 프레임워크)과 연결

## 데이터 마트
- 데이터 웨어하우스의 일부분, 사용자에게 데이터 웨어하우스에서 데이터를 꺼내서 제공하는 역할 


In [149]:
# json : 웹 형식 문서 표현 방법 중 하나, 가볍고 속도가 빠르다. 딕셔너리 구조로 작성된 문서
import pandas as pd 
import numpy as np 
import json 
import matplotlib.pyplot as plt


In [150]:
obj = '''
    {"name":"Wes",
    "places_lived":["United States","Spain","Germany"],
    "pet":null,
    "siblings":[{"name":"Kim","age":25,"pets":["ba","ka"]},
                {"name":"Lee","age":22,"pets":["aa","bb","cc"]}]
    }
'''
print(obj)


    {"name":"Wes",
    "places_lived":["United States","Spain","Germany"],
    "pet":null,
    "siblings":[{"name":"Kim","age":25,"pets":["ba","ka"]},
                {"name":"Lee","age":22,"pets":["aa","bb","cc"]}]
    }



In [151]:
res = json.loads(obj)
res

{'name': 'Wes',
 'pet': None,
 'places_lived': ['United States', 'Spain', 'Germany'],
 'siblings': [{'age': 25, 'name': 'Kim', 'pets': ['ba', 'ka']},
  {'age': 22, 'name': 'Lee', 'pets': ['aa', 'bb', 'cc']}]}

In [152]:
ajson = json.dumps(res) # 파이썬 형태로 읽어진 객체를 진짜 json 형식으로 변환
ajson 

'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Kim", "age": 25, "pets": ["ba", "ka"]}, {"name": "Lee", "age": 22, "pets": ["aa", "bb", "cc"]}]}'

In [153]:
res['siblings']

[{'age': 25, 'name': 'Kim', 'pets': ['ba', 'ka']},
 {'age': 22, 'name': 'Lee', 'pets': ['aa', 'bb', 'cc']}]

In [154]:
pd.DataFrame(res['siblings'])

Unnamed: 0,name,age,pets
0,Kim,25,"[ba, ka]"
1,Lee,22,"[aa, bb, cc]"


In [155]:
df = pd.DataFrame(res['siblings'],columns = ['name','age'])
df

Unnamed: 0,name,age
0,Kim,25
1,Lee,22


In [156]:
df.to_json() # json 문서 형식으로 변환 

'{"name":{"0":"Kim","1":"Lee"},"age":{"0":25,"1":22}}'

In [157]:
df.to_json('myjson.json') # 문서저장

In [158]:
pd.read_json('myjson.json')

Unnamed: 0,name,age
0,Kim,25
1,Lee,22


In [159]:
# Data cleaning: sample Series of strings with one missing entry (NaN).
stringData = pd.Series(data=['aaa', 'bbb', np.nan, 'ccc'])
stringData

0    aaa
1    bbb
2    NaN
3    ccc
dtype: object

In [160]:
stringData.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [161]:
stringData[stringData.isnull()]

2    NaN
dtype: object

In [162]:
stringData[0] = None # None 값은 NA와 같음 
stringData

0    None
1     bbb
2     NaN
3     ccc
dtype: object

In [163]:
stringData.isnull() 

0     True
1    False
2     True
3    False
dtype: bool

In [164]:
# NA 처리 메서드
# dropna : 누락된 데이터가 있는 축(행,열)을 제외
# fillna : 누락된 데이터를 대신할 값으로 채움 (ffill / bfill : 앞 / 뒤 값으로 채움)
# isnull : 각 값이 누락 데이터인지 여부를 불리언(True/False)으로 반환
# notnull : 각 값이 누락 데이터가 아닌지 여부를 불리언(True/False)으로 반환 (isnull의 반대)

In [165]:
# Alias numpy's NaN as NA so missing entries read clearly inside literals.
from numpy import nan as NA

# Float Series with two missing values (positions 1 and 3).
data = pd.Series(data=[1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [166]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [167]:
data.notnull()

0     True
1    False
2     True
3    False
4     True
dtype: bool

In [168]:
# 4x3 frame mixing a complete row, partially missing rows and an all-NA row.
data = pd.DataFrame(
    data=[
        [1, 6, 3],
        [1, NA, NA],
        [NA, NA, NA],
        [NA, 5, 2],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.0,3.0
1,1.0,,
2,,,
3,,5.0,2.0


In [169]:
data.dropna(axis= 0) # 디폴트 axis = 0 

Unnamed: 0,0,1,2
0,1.0,6.0,3.0


In [170]:
data.dropna(axis = 1)

0
1
2
3


In [171]:
data.dropna(how= 'all') # 행의 값들이 모두 nan일 때 제거 

Unnamed: 0,0,1,2
0,1.0,6.0,3.0
1,1.0,,
3,,5.0,2.0


In [172]:
# Duplicate removal: sample frame whose last row repeats the row before it.
data = pd.DataFrame(
    data={
        'a': ['one', 'two'] * 3 + ['two'],
        'b': [1, 1, 2, 3, 3, 4, 4],
    }
)
data.info()
data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   a       7 non-null      object
 1   b       7 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 240.0+ bytes


Unnamed: 0,a,b
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [173]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [174]:
data.drop_duplicates() # duplicated 함수결과가 False인 데이터 프레임을 리턴 

Unnamed: 0,a,b
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [175]:
data['v1'] = range(7)
data

Unnamed: 0,a,b,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [176]:
# 특정 컬럼에 대한 중복값을 제외
data.drop_duplicates(['a'])

Unnamed: 0,a,b,v1
0,one,1,0
1,two,1,1


In [177]:
data.drop_duplicates(['a','b'],keep='last')

Unnamed: 0,a,b,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [178]:
# 데이터 치환 : map함수, replace함수 (딕셔너리)

In [179]:
# Ages to bucket; a value outside every bin (111 here) becomes NaN.
ages = [15, 20, 25, 28, 39, 29, 22, 37, 61, 44, 46, 33, 111]
# Bin edges for the age ranges.
bins = [10, 20, 30, 40, 60, 100]
# Age data -> split into age ranges -> one category per range.
res = pd.cut(x=ages, bins=bins)

In [180]:
res.codes # -1은 nan을 지칭 

array([ 0,  0,  1,  1,  2,  1,  1,  2,  4,  3,  3,  2, -1], dtype=int8)

In [181]:
res.categories

IntervalIndex([(10, 20], (20, 30], (30, 40], (40, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [182]:
res.value_counts() # pd.value_counts(res)

(10, 20]     2
(20, 30]     4
(30, 40]     3
(40, 60]     2
(60, 100]    1
dtype: int64

In [183]:
pd.cut(ages,[15,26,36,61,100]) # the bin edges can also be passed directly as a list literal

[NaN, (15.0, 26.0], (15.0, 26.0], (26.0, 36.0], (36.0, 61.0], ..., (36.0, 61.0], (36.0, 61.0], (36.0, 61.0], (26.0, 36.0], NaN]
Length: 13
Categories (4, interval[int64]): [(15, 26] < (26, 36] < (36, 61] < (61, 100]]

In [184]:
# NOTE(review): right=True is what pd.cut uses by default, so this call yields
# the same left-open / right-closed (a, b] intervals as the call without it.
# To switch to left-closed / right-open [a, b) intervals, pass right=False.
pd.cut(ages,[15,26,36,61,100],right = True)

[NaN, (15.0, 26.0], (15.0, 26.0], (26.0, 36.0], (36.0, 61.0], ..., (36.0, 61.0], (36.0, 61.0], (36.0, 61.0], (26.0, 36.0], NaN]
Length: 13
Categories (4, interval[int64]): [(15, 26] < (26, 36] < (36, 61] < (61, 100]]

In [185]:
gn = ['youth','youngyouth','middleaged','senior']
pd.cut(ages,[15,26,36,61,100],labels=gn)

[NaN, 'youth', 'youth', 'youngyouth', 'middleaged', ..., 'middleaged', 'middleaged', 'middleaged', 'youngyouth', NaN]
Length: 13
Categories (4, object): ['youth' < 'youngyouth' < 'middleaged' < 'senior']

In [186]:
res = pd.qcut(ages,4)
res

[(14.999, 25.0], (14.999, 25.0], (14.999, 25.0], (25.0, 33.0], (33.0, 44.0], ..., (44.0, 111.0], (33.0, 44.0], (44.0, 111.0], (25.0, 33.0], (44.0, 111.0]]
Length: 13
Categories (4, interval[float64]): [(14.999, 25.0] < (25.0, 33.0] < (33.0, 44.0] < (44.0, 111.0]]

In [187]:
res.value_counts() # 구간에 데이터를 동등하게 넣는다.하지만 구간의 길이는 다르다

(14.999, 25.0]    4
(25.0, 33.0]      3
(33.0, 44.0]      3
(44.0, 111.0]     3
dtype: int64

In [188]:
# 그룹별 집계 : groupby() 그룹단위로 집계 (요약)
# 전체 데이터 -> 그룹별로 분할 -> 각 그룹별로 집계(요약)함수 적용 -> 각 그룹별 집계 결과물 -> 합침
# http://archive.ics.uci.edu/ml/datasets/Abalone

# NOTE(review): google.colab is presumably only importable inside a Google
# Colab runtime - confirm before running this cell elsewhere.
from google.colab import files

# Ask Colab for a file upload; the recorded run returned an empty dict ({}).
files.upload()

{}

In [189]:
# Sex / nominal / -- / M, F, and I (infant)
# Length / continuous / mm / Longest shell measurement
# Diameter / continuous / mm / perpendicular to length
# Height / continuous / mm / with meat in shell
# Whole weight / continuous / grams / whole abalone
# Shucked weight / continuous / grams / weight of meat
# Viscera weight / continuous / grams / gut weight (after bleeding)
# Shell weight / continuous / grams / after being dried
# Rings / integer / -- / +1.5 gives the age in years

In [190]:
# Load the abalone data file, supplying the column names explicitly.
abalon = pd.read_csv(
    'abalone.txt',
    names=[
        'Sex',
        'Length',
        'Diameter',
        'Height',
        'Whole weight',
        'Shucked weight',
        'Viscera weight',
        'Shell weight',
        'Rings',
    ],
)
abalon

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [191]:
# Count the missing values in every column. Calling the DataFrame's own sum()
# reduces column-wise explicitly instead of depending on how np.sum forwards
# its axis argument to pandas.
abalon.isnull().sum()

Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Shucked weight    0
Viscera weight    0
Shell weight      0
Rings             0
dtype: int64

In [192]:
# Group the 'Whole weight' values by the Sex column.
grouped = abalon.groupby('Sex')['Whole weight']
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fd0b6988518>

In [193]:
grouped.size() # group abalon by Sex -> size() of the 'Whole weight' values in each group

Sex
F    1307
I    1342
M    1528
Name: Whole weight, dtype: int64

In [194]:
grouped.mean()# group abalon by Sex -> mean() of the 'Whole weight' values in each group

Sex
F    1.046532
I    0.431363
M    0.991459
Name: Whole weight, dtype: float64

In [195]:
abalon.groupby(abalon['Sex']).sum()

Unnamed: 0_level_0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,756.875,594.335,206.52,1367.8175,583.1675,301.51,394.727,14546
I,574.035,438.155,144.93,578.8885,256.369,123.4775,172.0205,10589
M,857.805,671.23,231.31,1514.95,661.5415,329.352,430.849,16358


In [196]:
abalon.groupby(['Sex']).sum()

Unnamed: 0_level_0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,756.875,594.335,206.52,1367.8175,583.1675,301.51,394.727,14546
I,574.035,438.155,144.93,578.8885,256.369,123.4775,172.0205,10589
M,857.805,671.23,231.31,1514.95,661.5415,329.352,430.849,16358


In [197]:
# Label each row by whether its Length is above the median Length.
abalon['length_med'] = np.where(
    abalon['Length'] > abalon['Length'].median(),
    'length_long',
    'length_short',
)

In [198]:
abalon[['Length','length_med']]

Unnamed: 0,Length,length_med
0,0.455,length_short
1,0.350,length_short
2,0.530,length_short
3,0.440,length_short
4,0.330,length_short
...,...,...
4172,0.565,length_long
4173,0.590,length_long
4174,0.600,length_long
4175,0.625,length_long


In [199]:
# Mean 'Whole weight' for every (Sex, length_med) combination.
mean_weight = abalon.groupby(['Sex', 'length_med'])['Whole weight'].mean()
mean_weight

Sex  length_med  
F    length_long     1.261330
     length_short    0.589702
I    length_long     0.923215
     length_short    0.351234
M    length_long     1.255182
     length_short    0.538157
Name: Whole weight, dtype: float64

In [200]:
mean_weight.unstack()

length_med,length_long,length_short
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,1.26133,0.589702
I,0.923215,0.351234
M,1.255182,0.538157


In [201]:
# 그룹별로 특정 작업을 반복
abalon

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,length_med
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,length_short
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,length_short
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,length_short
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,length_short
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,length_short
...,...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,length_long
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,length_long
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,length_long
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,length_long


In [202]:
# Iterating a groupby yields (group key, sub-frame) pairs; show the key and
# the first five rows of each Sex group. The two prints are separate
# statements rather than a comma-joined tuple expression.
for sex, group_data in abalon[['Sex', 'length_med', 'Whole weight']].groupby('Sex'):
    print(sex)
    print(group_data[:5])

F
   Sex    length_med  Whole weight
2    F  length_short        0.6770
6    F  length_short        0.7775
7    F  length_short        0.7680
9    F   length_long        0.8945
10   F  length_short        0.6065
I
   Sex    length_med  Whole weight
4    I  length_short        0.2050
5    I  length_short        0.3515
16   I  length_short        0.2905
21   I  length_short        0.2255
42   I  length_short        0.0700
M
   Sex    length_med  Whole weight
0    M  length_short        0.5140
1    M  length_short        0.2255
3    M  length_short        0.5160
8    M  length_short        0.5095
11   M  length_short        0.4060


In [203]:
# Grouping on two columns yields ((Sex, length_med), sub-frame) pairs; show
# the key pair and the first five rows of each group. The two prints are
# separate statements rather than a comma-joined tuple expression.
for (sex, length_med), group_data in abalon[['Sex', 'length_med', 'Whole weight', 'Rings']].groupby(['Sex', 'length_med']):
    print(sex, length_med)
    print(group_data[:5])

F length_long
   Sex   length_med  Whole weight  Rings
9    F  length_long        0.8945     19
22   F  length_long        0.9395     12
23   F  length_long        0.7635      9
24   F  length_long        1.1615     10
25   F  length_long        0.9285     11
F length_short
   Sex    length_med  Whole weight  Rings
2    F  length_short        0.6770      9
6    F  length_short        0.7775     20
7    F  length_short        0.7680     16
10   F  length_short        0.6065     14
13   F  length_short        0.6845     10
I length_long
    Sex   length_med  Whole weight  Rings
509   I  length_long        0.8735     16
510   I  length_long        1.1095     10
549   I  length_long        0.8750     11
550   I  length_long        1.1625     17
551   I  length_long        0.9885     13
I length_short
   Sex    length_med  Whole weight  Rings
4    I  length_short        0.2050      7
5    I  length_short        0.3515      8
16   I  length_short        0.2905      7
21   I  length_short    

In [204]:
# Group the first 10 rows by Sex: each Sex value becomes a dict key and the
# matching sub-frame becomes its value.
aba_group = {
    sex_key: sex_frame
    for sex_key, sex_frame in abalon[:10][['Sex', 'length_med', 'Whole weight', 'Rings']].groupby('Sex')
}

In [205]:
aba_group['M'] # 상위 10개 데이터에 대해 'Sex'이 'M'인 데이터 

Unnamed: 0,Sex,length_med,Whole weight,Rings
0,M,length_short,0.514,15
1,M,length_short,0.2255,7
3,M,length_short,0.516,10
8,M,length_short,0.5095,9


In [206]:
# Rows of the first 10 records whose Sex is 'M'. The mask is computed from the
# 10-row slice itself (via a callable passed to .loc) instead of from the full
# frame, so its index matches the rows being filtered and pandas does not have
# to reindex a longer boolean Series.
abalon[:10].loc[lambda frame: frame['Sex'] == 'M', ['Sex', 'length_med', 'Whole weight', 'Rings']]

  """Entry point for launching an IPython kernel.


Unnamed: 0,Sex,length_med,Whole weight,Rings
0,M,length_short,0.514,15
1,M,length_short,0.2255,7
3,M,length_short,0.516,10
8,M,length_short,0.5095,9


In [207]:
# 특정 문자열을 매핑 규칙에 따른 변환 -> dict.get()

# Sample frame: the same names in mixed casing, plus two numeric columns.
df = pd.DataFrame(
    data={
        'name': ['kim', 'KIM', 'Kim', 'lee', 'LEE', 'Lee', 'cho', 'choi'],
        'value1': list(range(1, 9)),
        'value2': [100, 200, 300, 100, 200, 100, 300, 500],
    }
)
df

Unnamed: 0,name,value1,value2
0,kim,1,100
1,KIM,2,200
2,Kim,3,300
3,lee,4,100
4,LEE,5,200
5,Lee,6,100
6,cho,7,300
7,choi,8,500


In [208]:
# Maps raw name spellings to the label they should be grouped under.
nameMaping = dict(
    KIM='kim',
    Kim='kim',
    LEE='lee',
    Lee='lee',
    cho='others',
    woo='others',
)

In [209]:
def func(x):
    """Return the mapped label for *x*, or *x* itself when it has no mapping."""
    return nameMaping.get(x, x)


# Derive a normalised name column by applying the lookup to every name.
df['name2'] = df.name.map(func)

In [210]:
df

Unnamed: 0,name,value1,value2,name2
0,kim,1,100,kim
1,KIM,2,200,kim
2,Kim,3,300,kim
3,lee,4,100,lee
4,LEE,5,200,lee
5,Lee,6,100,lee
6,cho,7,300,others
7,choi,8,500,choi


In [211]:
df.groupby('name2').sum()

Unnamed: 0_level_0,value1,value2
name2,Unnamed: 1_level_1,Unnamed: 2_level_1
choi,8,500
kim,6,600
lee,15,400
others,7,300


In [212]:
df.groupby(['name2','name']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
name2,name,Unnamed: 2_level_1,Unnamed: 3_level_1
choi,choi,8,500
kim,KIM,2,200
kim,Kim,3,300
kim,kim,1,100
lee,LEE,5,200
lee,Lee,6,100
lee,lee,4,100
others,cho,7,300


In [213]:
# Sample frame with integer ids of differing widths.
df = pd.DataFrame(
    data={
        'id': [1, 2, 10, 20, 100, 200],
        'name': ['aa', 'aa2', 'aa3', 'aa4', 'aa5', 'aa6'],
    }
)
df

Unnamed: 0,id,name
0,1,aa
1,2,aa2
2,10,aa3
3,20,aa4
4,100,aa5
5,200,aa6


In [214]:
# Left-pad every id with '0' up to a width of 5 characters.
df['id'].apply(lambda value: str(value).rjust(5, '0'))

0    00001
1    00002
2    00010
3    00020
4    00100
5    00200
Name: id, dtype: object

In [215]:
df.id.apply(lambda x: "%05d"%x)

0    00001
1    00002
2    00010
3    00020
4    00100
5    00200
Name: id, dtype: object

In [216]:
# Store the ids as 5-character strings, right-aligned and zero-filled.
df['id2'] = df['id'].apply("{:0>5d}".format)

In [217]:
df

Unnamed: 0,id,name,id2
0,1,aa,1
1,2,aa2,2
2,10,aa3,10
3,20,aa4,20
4,100,aa5,100
5,200,aa6,200


In [218]:
# https://mkaz.blog/code/python-string-format-cookbook/
# format 형식 설명 

In [219]:
abalon

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,length_med
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,length_short
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,length_short
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,length_short
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,length_short
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,length_short
...,...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,length_long
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,length_long
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,length_long
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,length_long


In [220]:
abalon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   object 
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole weight    4177 non-null   float64
 5   Shucked weight  4177 non-null   float64
 6   Viscera weight  4177 non-null   float64
 7   Shell weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
 9   length_med      4177 non-null   object 
dtypes: float64(7), int64(1), object(2)
memory usage: 326.5+ KB


In [221]:
abalon.describe()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0
