In [11]:
import pandas as pd
import numpy as np

In [12]:
df = pd.read_csv('../../../05000266/part5/auto-mpg.csv')
df.head()

Unnamed: 0,18.0,8,307.0,130.0,3504.,12.0,70,1,chevrolet chevelle malibu
0,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
1,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
2,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
3,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
4,15.0,8,429.0,198.0,4341.0,10.0,70,1,ford galaxie 500


In [13]:
df.columns = ['mpg','cylinders','displacement','horsepower','weight','acceleration','model year','origin','name']

In [14]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name
0,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
1,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
2,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
3,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
4,15.0,8,429.0,198.0,4341.0,10.0,70,1,ford galaxie 500


### 단위 환산

In [15]:
# mile per gallon -> km per liter 으로 환산
mpg_to_kpl = 1.6093 / 3.78541
df['kpl'] = df['mpg'] * mpg_to_kpl
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name,kpl
0,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,6.376984
1,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,7.652381
2,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,6.802117
3,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,7.227249
4,15.0,8,429.0,198.0,4341.0,10.0,70,1,ford galaxie 500,6.376984


In [16]:
df['kpl'] = df['kpl'].round(2)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name,kpl
0,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,6.38
1,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,7.65
2,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,6.8
3,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,7.23
4,15.0,8,429.0,198.0,4341.0,10.0,70,1,ford galaxie 500,6.38


### 자료형 변환

In [17]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model year        int64
origin            int64
name             object
kpl             float64
dtype: object

In [18]:
df['horsepower'].unique()

array(['165.0', '150.0', '140.0', '198.0', '220.0', '215.0', '225.0',
       '190.0', '170.0', '160.0', '95.00', '97.00', '85.00', '88.00',
       '46.00', '87.00', '90.00', '113.0', '200.0', '210.0', '193.0', '?',
       '100.0', '105.0', '175.0', '153.0', '180.0', '110.0', '72.00',
       '86.00', '70.00', '76.00', '65.00', '69.00', '60.00', '80.00',
       '54.00', '208.0', '155.0', '130.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)

In [19]:
# ? -> NaN 변환
df['horsepower'].replace('?', np.nan, inplace=True)

# NaN drop
df.dropna(subset=['horsepower'], axis=0, inplace=True)

# object -> float
df['horsepower'] = df['horsepower'].astype('float')

In [20]:
df['horsepower'].dtypes

dtype('float64')

In [22]:
# origin 에서 1,2,3은 사실 USA, EU, JPN을 뜻한다.
df['origin'].replace({1:'USA', 2:'EU', 3:'JPN'}, inplace=True)

df['origin'].head()

0    USA
1    USA
2    USA
3    USA
4    USA
Name: origin, dtype: object

In [23]:
print(df['origin'].unique())
print(df['origin'].dtypes)

['USA' 'JPN' 'EU']
object


In [26]:
df['origin'] = df['origin'].astype('category')
print(df['origin'].dtypes)

df['origin'] = df['origin'].astype('str')
print(df['origin'].dtypes)

category
object


In [34]:
# model year
df['model year'].unique()

array([70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82])

In [37]:
print(df['model year'].sample(3))
df['model year'] = df['model year'].astype('category')
print(df['model year'].sample(3))

313    80
239    77
150    74
Name: model year, dtype: category
Categories (13, int64): [70, 71, 72, 73, ..., 79, 80, 81, 82]
139    74
227    77
225    77
Name: model year, dtype: category
Categories (13, int64): [70, 71, 72, 73, ..., 79, 80, 81, 82]
