In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
df = pd.read_csv('auto-mpg.csv')

In [7]:
df.columns = ['mpg', 'cylinders', 'displacement', 'HP', 'weight', 'acceleration', 'model_year', 'origin', 'name']

In [8]:
df.head(3)

Unnamed: 0,mpg,cylinders,displacement,HP,weight,acceleration,model_year,origin,name
0,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
1,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
2,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst


In [9]:
mpg_to_kpl = 1.60934 / 3.78541

In [12]:
df['kpl'] = df['mpg'] * mpg_to_kpl

In [13]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,HP,weight,acceleration,model_year,origin,name,kpl
0,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,6.377143
1,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,7.652571
2,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,6.802286
3,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,7.227428
4,15.0,8,429.0,198.0,4341.0,10.0,70,1,ford galaxie 500,6.377143


In [15]:
print(df['HP'].unique())

['165.0' '150.0' '140.0' '198.0' '220.0' '215.0' '225.0' '190.0' '170.0'
 '160.0' '95.00' '97.00' '85.00' '88.00' '46.00' '87.00' '90.00' '113.0'
 '200.0' '210.0' '193.0' '?' '100.0' '105.0' '175.0' '153.0' '180.0'
 '110.0' '72.00' '86.00' '70.00' '76.00' '65.00' '69.00' '60.00' '80.00'
 '54.00' '208.0' '155.0' '130.0' '112.0' '92.00' '145.0' '137.0' '158.0'
 '167.0' '94.00' '107.0' '230.0' '49.00' '75.00' '91.00' '122.0' '67.00'
 '83.00' '78.00' '52.00' '61.00' '93.00' '148.0' '129.0' '96.00' '71.00'
 '98.00' '115.0' '53.00' '81.00' '79.00' '120.0' '152.0' '102.0' '108.0'
 '68.00' '58.00' '149.0' '89.00' '63.00' '48.00' '66.00' '139.0' '103.0'
 '125.0' '133.0' '138.0' '135.0' '142.0' '77.00' '62.00' '132.0' '84.00'
 '64.00' '74.00' '116.0' '82.00']


In [16]:
df.HP.replace('?', np.nan, inplace = True)
df.dropna(subset=['HP'], axis = 0, inplace = True)
df.HP = df.HP.astype('float')

In [17]:
df.HP.dtypes

dtype('float64')

In [18]:
df.origin.unique()

array([1, 3, 2])

In [20]:
df.origin.replace({1 : 'USA', 2: 'EU', 3:'JPN'}, inplace = True)

In [21]:
print(df.origin.unique())
print(df.origin.dtypes)

['USA' 'JPN' 'EU']
object


In [24]:
count, bin_dividers = np.histogram(df.HP, bins = 3)

In [25]:
print(bin_dividers)

[ 46.         107.33333333 168.66666667 230.        ]


In [27]:
bin_names = ['저출력', '보통', '고출력']

In [30]:
df['HP_bin'] = pd.cut(x = df.HP, bins=bin_dividers, labels=bin_names, include_lowest=True)

In [31]:
print(df[['HP', 'HP_bin']].head())

      HP HP_bin
0  165.0     보통
1  150.0     보통
2  150.0     보통
3  140.0     보통
4  198.0    고출력


In [32]:
print(df.HP.describe())

count    391.000000
mean     104.404092
std       38.518732
min       46.000000
25%       75.000000
50%       93.000000
75%      125.000000
max      230.000000
Name: HP, dtype: float64


In [33]:
df.HP = df.HP / abs(df.HP.max())

In [34]:
print(df.HP.head())

0    0.717391
1    0.652174
2    0.652174
3    0.608696
4    0.860870
Name: HP, dtype: float64


In [35]:
print(df.HP.describe())

count    391.000000
mean       0.453931
std        0.167473
min        0.200000
25%        0.326087
50%        0.404348
75%        0.543478
max        1.000000
Name: HP, dtype: float64


In [36]:
min_x = df.HP - df.HP.min()
min_max = df.HP.max() - df.HP.min()
df.HP = min_x / min_max

In [37]:
df

Unnamed: 0,mpg,cylinders,displacement,HP,weight,acceleration,model_year,origin,name,kpl,HP_bin
0,15.0,8,350.0,0.646739,3693.0,11.5,70,USA,buick skylark 320,6.377143,보통
1,18.0,8,318.0,0.565217,3436.0,11.0,70,USA,plymouth satellite,7.652571,보통
2,16.0,8,304.0,0.565217,3433.0,12.0,70,USA,amc rebel sst,6.802286,보통
3,17.0,8,302.0,0.510870,3449.0,10.5,70,USA,ford torino,7.227428,보통
4,15.0,8,429.0,0.826087,4341.0,10.0,70,USA,ford galaxie 500,6.377143,고출력
...,...,...,...,...,...,...,...,...,...,...,...
392,27.0,4,140.0,0.217391,2790.0,15.6,82,USA,ford mustang gl,11.478857,저출력
393,44.0,4,97.0,0.032609,2130.0,24.6,82,EU,vw pickup,18.706285,저출력
394,32.0,4,135.0,0.206522,2295.0,11.6,82,USA,dodge rampage,13.604571,저출력
395,28.0,4,120.0,0.179348,2625.0,18.6,82,USA,ford ranger,11.904000,저출력


In [38]:
def fl(x,y ):
    return x + y

In [39]:
fl(3, 4)

7

In [41]:
f2 = lambda x, y : x + y

In [42]:
f2 (4, 6)

10

In [43]:
f3 = lambda x : x ** 2

In [44]:
data = [1, 2, 3, 4]

In [45]:
list(map(f3, data))

[1, 4, 9, 16]

In [46]:
def kim(x) :
    return x + 5

In [50]:
s1 = pd.Series(np.arange(10))

In [51]:
s1

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [52]:
s1.map(kim)

0     5
1     6
2     7
3     8
4     9
5    10
6    11
7    12
8    13
9    14
dtype: int64

In [53]:
s2 = pd.Series(np.arange(10,30))

In [54]:
s2

0     10
1     11
2     12
3     13
4     14
5     15
6     16
7     17
8     18
9     19
10    20
11    21
12    22
13    23
14    24
15    25
16    26
17    27
18    28
19    29
dtype: int64

In [55]:
s1.map(s2)

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64

In [56]:
import seaborn as sns
titanic = sns.load_dataset('titanic')
df= titanic.loc[:, ['age', 'fare']]
df['ten'] = 10

In [57]:
df.head()

Unnamed: 0,age,fare,ten
0,22.0,7.25,10
1,38.0,71.2833,10
2,26.0,7.925,10
3,35.0,53.1,10
4,35.0,8.05,10


In [60]:
def add_10 (n):
    return n + 10

def add_two_obj(a, b):
    return a + b

In [61]:
print(add_10(10))
print(add_two_obj(10, 10))

20
20


In [62]:
df_map = df.applymap(add_10)

In [63]:
df_map

Unnamed: 0,age,fare,ten
0,32.0,17.2500,20
1,48.0,81.2833,20
2,36.0,17.9250,20
3,45.0,63.1000,20
4,45.0,18.0500,20
...,...,...,...
886,37.0,23.0000,20
887,29.0,40.0000,20
888,,33.4500,20
889,36.0,40.0000,20


In [64]:
def missing_value(series):
    return series.isnull()

titanic = sns.load_dataset('titanic')
df = titanic.loc[:, ['age', 'fare']]

result = df.apply(missing_value, axis=0)

In [65]:
result

Unnamed: 0,age,fare
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
886,False,False
887,False,False
888,True,False
889,False,False
