In [1]:
import pandas as pd
import numpy as np

## Day 9: Data Manipulation with Pandas - Hands On

### I. Create DataFrame

In [43]:
pd.__version__

'1.1.5'

Kita bisa membuat dataframe dengan menuliskan dictionary seperti ini.

In [19]:
# Column-wise construction: every key is a column name and every list holds
# that column's values, one entry per customer.
customer_columns = {
    "cust_id": ["A1", "A2", "A3", "A4", "A5"],
    "item_bought": [10, 15, 20, 35, 4],
    "city": ["Bandung", "Surabaya", "Bekasi", "Tangerang", "Denpasar"],
}
df = pd.DataFrame(customer_columns)
df

Unnamed: 0,cust_id,item_bought,city
0,A1,10,Bandung
1,A2,15,Surabaya
2,A3,20,Bekasi
3,A4,35,Tangerang
4,A5,4,Denpasar


Bisa juga pakai nested list, jadi datanya per baris, bukan per kolom.

In [33]:
# Row-wise construction: each inner list is one customer record, and the
# column labels are handed straight to the constructor.
customer_rows = [
    ['A1', 10, 'Bandung'],
    ['A2', 15, 'Surabaya'],
    ['A3', 20, 'Bekasi'],
    ['A4', 35, 'Tangerang'],
    ['A5', 4, 'Denpasar'],
]
df_2 = pd.DataFrame(customer_rows, columns=['cust_id', 'quantity', 'city'])
df_2

Unnamed: 0,cust_id,quantity,city
0,A1,10,Bandung
1,A2,15,Surabaya
2,A3,20,Bekasi
3,A4,35,Tangerang
4,A5,4,Denpasar


### II. Basic Pandas Operation

#### II.1. Basic

Basic overviews:

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   cust_id      5 non-null      object
 1   item_bought  5 non-null      int64 
 2   city         5 non-null      object
dtypes: int64(1), object(2)
memory usage: 248.0+ bytes


In [28]:
df.describe()

Unnamed: 0,item_bought
count,5.0
mean,16.8
std,11.777096
min,4.0
25%,10.0
50%,15.0
75%,20.0
max,35.0


In [21]:
df.columns

Index(['cust_id', 'item_bought', 'city'], dtype='object')

Changing column names:

In [22]:
# Assigning to .columns renames positionally: the list supplies one new name
# per existing column, in the current column order.
df.columns = ['cust_id', 'quantity', 'center_city']

In [23]:
df

Unnamed: 0,cust_id,quantity,center_city
0,A1,10,Bandung
1,A2,15,Surabaya
2,A3,20,Bekasi
3,A4,35,Tangerang
4,A5,4,Denpasar


DataFrame shape (row, column)

In [34]:
df.shape

(5, 3)

In [35]:
row = df.shape[0]

In [36]:
print(row)

5


In [38]:
column = df.shape[1]

In [39]:
column

3

Kita bisa mengubah `data type` dari sebuah kolom dengan `.astype`.

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   cust_id      5 non-null      object
 1   quantity     5 non-null      int64 
 2   center_city  5 non-null      object
dtypes: int64(1), object(2)
memory usage: 248.0+ bytes


In [27]:
df['quantity'] = df['quantity'].astype(float)

In [28]:
df

Unnamed: 0,cust_id,quantity,center_city
0,A1,10.0,Bandung
1,A2,15.0,Surabaya
2,A3,20.0,Bekasi
3,A4,35.0,Tangerang
4,A5,4.0,Denpasar


Maka akan terlihat bahwa kolom `quantity` sekarang menjadi dalam bentuk desimal (float).

#### II.2. Filtering

Selecting first few rows

In [47]:
df.head(2)

Unnamed: 0,cust_id,quantity,center_city
0,A1,10.0,Bandung
1,A2,15.0,Surabaya


Selecting last rows

In [48]:
df.tail(3)

Unnamed: 0,cust_id,quantity,center_city
2,A3,20.0,Bekasi
3,A4,35.0,Tangerang
4,A5,4.0,Denpasar


Selecting several columns

In [52]:
df_selected = df[['cust_id', 'quantity']]

In [53]:
df_selected

Unnamed: 0,cust_id,quantity
0,A1,10.0
1,A2,15.0
2,A3,20.0
3,A4,35.0
4,A5,4.0


In [55]:
## If you don't wish to create a new dataframe to store the selection, you can just
df[['cust_id', 'quantity']]

Unnamed: 0,cust_id,quantity
0,A1,10.0
1,A2,15.0
2,A3,20.0
3,A4,35.0
4,A5,4.0


Filtering rows that have quantity > 10

In [59]:
df

Unnamed: 0,cust_id,quantity,center_city
0,A1,10.0,Bandung
1,A2,15.0,Surabaya
2,A3,20.0,Bekasi
3,A4,35.0,Tangerang
4,A5,4.0,Denpasar


In [62]:
df['quantity']>10

0    False
1     True
2     True
3     True
4    False
Name: quantity, dtype: bool

In [56]:
df[df['quantity']>10]

Unnamed: 0,cust_id,quantity,center_city
1,A2,15.0,Surabaya
2,A3,20.0,Bekasi
3,A4,35.0,Tangerang


Filtering rows that have (quantity >= 10 and a center city of Bandung / Surabaya), or that have quantity < 5.

In [84]:
# Each rule is a boolean Series with one True/False per row of df.
rule_pertama = df['quantity'] >= 10  # first rule: quantity of at least 10
rule_kedua = df['center_city'].isin(['Bandung', 'Surabaya'])  # second rule: city is Bandung or Surabaya
rule_ketiga = df['quantity'] < 5  # third rule: quantity below 5

# (first rule AND second rule) OR (third rule)

df[(rule_pertama & rule_kedua) | rule_ketiga]

Unnamed: 0,cust_id,quantity,center_city
0,A1,10.0,Bandung
1,A2,15.0,Surabaya
4,A5,4.0,Denpasar


Selecting all customers who are NOT in Bandung

In [86]:
df[~df['center_city'].isin(['Bandung'])]

Unnamed: 0,cust_id,quantity,center_city
1,A2,15.0,Surabaya
2,A3,20.0,Bekasi
3,A4,35.0,Tangerang
4,A5,4.0,Denpasar


In [40]:
df[(df['quantity'] >= 10) & (df['center_city'].isin(['Bandung', 'Surabaya']))]

Unnamed: 0,cust_id,quantity,center_city
0,A1,10,Bandung
1,A2,15,Surabaya


Filtering with OR conditionals

In [63]:
df[(df["quantity"]>15) | (df["center_city"]!= "Tangerang")]

Unnamed: 0,cust_id,quantity,center_city
0,A1,10,Bandung
1,A2,15,Surabaya
2,A3,20,Bekasi
3,A4,35,Tangerang
4,A5,4,Denpasar


Saving a filtered dataframe

In [88]:
df_more_than_10 = df[df['quantity']>10]

In [89]:
df_more_than_10

Unnamed: 0,cust_id,quantity,center_city
1,A2,15.0,Surabaya
2,A3,20.0,Bekasi
3,A4,35.0,Tangerang


Restore the index

In [90]:
df_more_than_10 = df[df['quantity']>10].reset_index(drop = True)

In [91]:
df_more_than_10

Unnamed: 0,cust_id,quantity,center_city
0,A2,15.0,Surabaya
1,A3,20.0,Bekasi
2,A4,35.0,Tangerang


Sample random rows: run the cell below several times and you'll get different results each time.

In [118]:
df.sample(3)

Unnamed: 0,cust_id,quantity,center_city
1,A2,15.0,Surabaya
4,A5,4.0,Denpasar
2,A3,20.0,Bekasi


Selecting specific cell in the dataframe.

In [53]:
df

Unnamed: 0,cust_id,quantity,center_city
0,A1,10,Bandung
1,A2,15,Surabaya
2,A3,20,Bekasi
3,A4,35,Tangerang
4,A5,4,Denpasar


In [119]:
### the quantity value of the row whose index label is 2 (.loc selects by label)
df.loc[2, 'quantity']

20.0

In [120]:
### the cell value of row index 3, and column index 2
df.iloc[3,2]

'Tangerang'

Selecting a range of cells

In [126]:
df.iloc[0:2,1:3]

## rows from position 0 up to (but not including) position 2
## columns from position 1 up to (but not including) position 3

Unnamed: 0,quantity,center_city
0,10.0,Bandung
1,15.0,Surabaya


In [127]:
df.iloc[0:2,0:3]

Unnamed: 0,cust_id,quantity,center_city
0,A1,10.0,Bandung
1,A2,15.0,Surabaya


In [124]:
a = [0,1,2,3,4,5,6,7,8]

In [125]:
a[0:5]

[0, 1, 2, 3, 4]

#### Materi Tambahan: Sorting Values

In [132]:
df.sort_values(['quantity'], ascending = False)

Unnamed: 0,cust_id,quantity,center_city
3,A4,35.0,Tangerang
2,A3,20.0,Bekasi
1,A2,15.0,Surabaya
0,A1,10.0,Bandung
4,A5,4.0,Denpasar


In [135]:
df_3 = pd.DataFrame({
    'quantity':[20,10,30,30,25],
    'center_city':['Bandung', 'Bandung', 'Bandung', 'Surabaya', 'Surabaya']
})

df_3

Unnamed: 0,quantity,center_city
0,20,Bandung
1,10,Bandung
2,30,Bandung
3,30,Surabaya
4,25,Surabaya


In [136]:
df_3.sort_values(['quantity', 'center_city'], ascending = [False, True])

Unnamed: 0,quantity,center_city
2,30,Bandung
3,30,Surabaya
4,25,Surabaya
0,20,Bandung
1,10,Bandung


#### II.3. Reorder Columns

In [137]:
df

Unnamed: 0,cust_id,quantity,center_city
0,A1,10.0,Bandung
1,A2,15.0,Surabaya
2,A3,20.0,Bekasi
3,A4,35.0,Tangerang
4,A5,4.0,Denpasar


In [138]:
df_reorder = df[['quantity', 'center_city', 'cust_id']]

In [139]:
df_reorder

Unnamed: 0,quantity,center_city,cust_id
0,10.0,Bandung,A1
1,15.0,Surabaya,A2
2,20.0,Bekasi,A3
3,35.0,Tangerang,A4
4,4.0,Denpasar,A5


### III. Group-By Data

This topic will be discussed further on Day 14.

In [140]:
iris = pd.read_csv("https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/639388c2cbc2120a14dcf466e85730eb8be498bb/iris.csv")

In [141]:
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


First, we can use `value_counts` to see how many rows of each species there are in the whole dataset.

In [143]:
iris['species'].value_counts()

setosa        50
virginica     50
versicolor    50
Name: species, dtype: int64

In [144]:
iris['species'].value_counts(normalize = True)

setosa        0.333333
virginica     0.333333
versicolor    0.333333
Name: species, dtype: float64

Next, we can groupby each species and find out a few important things!

In [71]:
iris.groupby('species').max() #to know the maximum length/width of the flowers in each species

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.8,4.4,1.9,0.6
versicolor,7.0,3.4,5.1,1.8
virginica,7.9,3.8,6.9,2.5


However, we can specify in details what we want to get from each columns, as such:

In [145]:
iris.groupby('species').agg({'sepal_length':'max', 
                             'sepal_width':'mean',
                            'petal_length':['median', 'mean']})

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_length
Unnamed: 0_level_1,max,mean,median,mean
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
setosa,5.8,3.418,1.5,1.464
versicolor,7.0,2.77,4.35,4.26
virginica,7.9,2.974,5.55,5.552


In [147]:
groupby_result = iris.groupby('species').agg({'sepal_length':'max', 'sepal_width':'mean',
                            'petal_length':['median', 'mean']})

Perhatikan! Index kolomnya itu 'multi-index', sehingga jika kita ingin memilih beberapa kolom saja, bentuknya harus seperti ini:

In [149]:
groupby_result[[('sepal_length','max'), ('petal_length', 'median')]]

Unnamed: 0_level_0,sepal_length,petal_length
Unnamed: 0_level_1,max,median
species,Unnamed: 1_level_2,Unnamed: 2_level_2
setosa,5.8,1.5
versicolor,7.0,4.35
virginica,7.9,5.55


### IV. Pivot Table

In [159]:
# Employee records used by the pivot-table examples below
# (YoE = years of experience).
# NOTE: this rebinds `df`; the customer frame built in the earlier sections
# is no longer reachable under this name after the cell runs.
df = pd.DataFrame({'First Name': ['Aryan', 'Rohan', 'Riya', 'Yash', 'Siddhant','Orang' ],
                   'Last Name': ['Singh', 'Agarwal', 'Shah', 'Bhatia', 'Khanna', 'Baru'],
                   'Type': ['Full-time Employee', 'Intern', 'Full-time Employee', 
                            'Part-time Employee', 'Full-time Employee', 'Full-time Employee'],
                   'Department': ['Administration', 'Technical', 'Administration', 
                                  'Technical', 'Management', 'Management'],
                   'YoE': [2, 3, 5, 7, 6, 6],
                   'Salary': [20000, 5000, 10000, 10000, 20000,10000]})

df

Unnamed: 0,First Name,Last Name,Type,Department,YoE,Salary
0,Aryan,Singh,Full-time Employee,Administration,2,20000
1,Rohan,Agarwal,Intern,Technical,3,5000
2,Riya,Shah,Full-time Employee,Administration,5,10000
3,Yash,Bhatia,Part-time Employee,Technical,7,10000
4,Siddhant,Khanna,Full-time Employee,Management,6,20000
5,Orang,Baru,Full-time Employee,Management,6,10000


In [160]:
df.pivot_table(index = ['Type', 'Department'], values = 'Salary', 
              aggfunc = ['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
Unnamed: 0_level_1,Unnamed: 1_level_1,Salary,Salary
Type,Department,Unnamed: 2_level_2,Unnamed: 3_level_2
Full-time Employee,Administration,15000,2
Full-time Employee,Management,15000,2
Intern,Technical,5000,1
Part-time Employee,Technical,10000,1


- Ada 2 orang yang 'Full Time Employee' di bagian 'Administration', gaji rata-rata mereka 15 000
- Ada 1 Intern, dan dia bekerja di bagian 'Technical', dan gaji rata-rata dia 5 000

In [161]:
df.pivot_table(index = ['Department', 'Type'], values = 'Salary', 
              aggfunc = ['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
Unnamed: 0_level_1,Unnamed: 1_level_1,Salary,Salary
Department,Type,Unnamed: 2_level_2,Unnamed: 3_level_2
Administration,Full-time Employee,15000,2
Management,Full-time Employee,15000,2
Technical,Intern,5000,1
Technical,Part-time Employee,10000,1


In [162]:
df.pivot_table(index = ['Department', 'Type'], values = 'Salary')

Unnamed: 0_level_0,Unnamed: 1_level_0,Salary
Department,Type,Unnamed: 2_level_1
Administration,Full-time Employee,15000
Management,Full-time Employee,15000
Technical,Intern,5000
Technical,Part-time Employee,10000


Nah, kalau ditukar seperti ini Index-nya, maka yang kita lihat lebih dulu itu Departmentnya, bukan status ketenagakerjaannya.

Bagaimana kalau sekarang sintaks kita seperti ini?

In [173]:
df.pivot_table(index = ['Department', 'Type'], 
               columns = ['YoE'], 
               values = 'Salary')

Unnamed: 0_level_0,YoE,2,3,5,6,7
Department,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Administration,Full-time Employee,20000.0,,10000.0,,
Management,Full-time Employee,,,,15000.0,
Technical,Intern,,5000.0,,,
Technical,Part-time Employee,,,,,10000.0


Sekarang, kita memiliki kolom 'YoE', artinya (Years of Experience). Dengan melihat tabel di atas, kita bisa belajar bahwa:
1. Ada ketimpangan gaji di Department Administration untuk para employee yang Full Time! Yang punya 2 tahun pengalaman dapet gaji 20 000, tapi yang punya 5 tahun pengalaman dapet gaji 10 000. 

2. Untuk department Management, kedua employee Full Time sama-sama memiliki 6 tahun pengalaman, sehingga angka 15 000 di tabel adalah rata-rata gaji mereka (20 000 dan 10 000).
3. dst

Pivot table memberikan kita kesempatan untuk melihat keadaan data secara 'garis besar', dan kita bisa memilih aspek-aspek apa saja yang ingin di highlight. 

### V. Data Transformation

Kita bisa melakukan operasi kolom (atau baris) di dataframe menggunakan fitur `apply` pada `pandas`. 

In [176]:
# Seed NumPy's global RNG so this random frame, and every column derived from
# it in the cells below, is identical on each Restart & Run All.
# NOTE: the outputs saved alongside these cells came from an earlier,
# unseeded run, so their numbers will differ until the notebook is re-run.
np.random.seed(42)

# 100 rows of random integers; `high` is exclusive in np.random.randint.
# NOTE: this rebinds `df` to a new, unrelated dataset.
df = pd.DataFrame({
    'A': np.random.randint(low=1, high=10, size=100),    # 1..9
    'B': np.random.randint(low=10, high=100, size=100),  # 10..99
    'C': np.random.randint(low=20, high=45, size=100),   # 20..44
})
df

Unnamed: 0,A,B,C
0,1,35,34
1,2,64,35
2,8,20,34
3,1,23,44
4,6,50,31
...,...,...,...
95,4,82,31
96,9,49,40
97,7,30,30
98,1,92,23


Misal kita ingin membuat kolom baru, kolom 'D', dengan rumus sebagai berikut:
- D = 10 * A + (0.5 * B) - Akar(C)

Maka bagaimana agar kita cukup menulis 1 fungsi saja, dan langsung bisa menghitung seluruh baris di dataframe kita?

In [177]:
def custom_operation(a, b, c):
    """Return 10*a + 0.5*b - sqrt(c).

    The arithmetic is plain operators plus np.sqrt, so the arguments may be
    scalars or anything NumPy can broadcast (arrays, pandas Series).
    """
    weighted_sum = 10 * a + 0.5 * b
    root_of_c = np.sqrt(c)
    return weighted_sum - root_of_c

In [181]:
df['D'] = df.apply(lambda x: custom_operation(x['A'], x['B'], x['C']), axis = 1)

In [182]:
# NOTE(review): this cell repeats the previous cell verbatim. Re-running it is
# harmless (it recomputes D from A, B and C), but one of the two can be deleted.
df['D'] = df.apply(lambda x: custom_operation(x['A'], x['B'], x['C']), axis = 1)

In [185]:
### Check rows 0 and 95: recompute the formula by hand from the A/B/C values
### actually stored in df, so the check stays valid whatever random numbers
### the frame holds. Compare the printed values with column D.
for row_index in (0, 95):
    a, b, c = df.loc[row_index, ['A', 'B', 'C']]
    print(10*a + 0.5*b - np.sqrt(c))

21.6690481051547
75.43223563716998


In [196]:
df

Unnamed: 0,A,B,C,D,E,F
0,1,35,34,21.669048,3,36.0
1,2,64,35,46.083920,4,66.0
2,8,20,34,84.169048,10,28.0
3,1,23,44,14.866750,3,24.0
4,6,50,31,79.432236,8,56.0
...,...,...,...,...,...,...
95,4,82,31,75.432236,6,86.0
96,9,49,40,108.175445,11,58.0
97,7,30,30,79.522774,9,37.0
98,1,92,23,51.204168,3,93.0


In [201]:
df['product_ABC' ] = df[['A','B','C']].product(axis = 1)

In [202]:
df

Unnamed: 0,A,B,C,D,E,F,product_ABC
0,1,35,34,21.669048,3,36.0,1190
1,2,64,35,46.083920,4,66.0,4480
2,8,20,34,84.169048,10,28.0,5440
3,1,23,44,14.866750,3,24.0,1012
4,6,50,31,79.432236,8,56.0,9300
...,...,...,...,...,...,...,...
95,4,82,31,75.432236,6,86.0,10168
96,9,49,40,108.175445,11,58.0,17640
97,7,30,30,79.522774,9,37.0,6300
98,1,92,23,51.204168,3,93.0,2116


In [186]:
# apply on a single column only: the lambda receives one value of A at a time
df['E'] = df['A'].apply(lambda x: x + 2)

In [189]:
# apply across two columns: with axis=1 the lambda receives one row at a time
# NOTE(review): F is displayed as float (e.g. 36.0) although A and B hold
# integers; presumably each row is upcast to float because the float column
# D already exists when this runs. Confirm.
df['F'] = df.apply(lambda x: x['A'] + x['B'], axis = 1)

In [190]:
df

Unnamed: 0,A,B,C,D,E,F
0,1,35,34,21.669048,3,36.0
1,2,64,35,46.083920,4,66.0
2,8,20,34,84.169048,10,28.0
3,1,23,44,14.866750,3,24.0
4,6,50,31,79.432236,8,56.0
...,...,...,...,...,...,...
95,4,82,31,75.432236,6,86.0
96,9,49,40,108.175445,11,58.0
97,7,30,30,79.522774,9,37.0
98,1,92,23,51.204168,3,93.0


Nah, kalo kita cuma pengen tau rata-rata per barisnya gimana? Apakah perlu se-ribet itu? Nggak pastinya. Kalau operasinya itu yang umum-umum saja, tidak usah pakai `apply lambda`. 

In [203]:
df['rata-rata_abc'] = df[['A','B','C']].mean(axis = 1)

In [205]:
np.mean([1,35,34])

23.333333333333332

In [206]:
df['cumsum_a'] = df['A'].cumsum()

In [208]:
df['B_bagi_A'] = df['B'] / df['A']

In [209]:
df

Unnamed: 0,A,B,C,D,E,F,product_ABC,rata-rata_abc,cumsum_a,B_bagi_A
0,1,35,34,21.669048,3,36.0,1190,23.333333,1,35.000000
1,2,64,35,46.083920,4,66.0,4480,33.666667,3,32.000000
2,8,20,34,84.169048,10,28.0,5440,20.666667,11,2.500000
3,1,23,44,14.866750,3,24.0,1012,22.666667,12,23.000000
4,6,50,31,79.432236,8,56.0,9300,29.000000,18,8.333333
...,...,...,...,...,...,...,...,...,...,...
95,4,82,31,75.432236,6,86.0,10168,39.000000,484,20.500000
96,9,49,40,108.175445,11,58.0,17640,32.666667,493,5.444444
97,7,30,30,79.522774,9,37.0,6300,22.333333,500,4.285714
98,1,92,23,51.204168,3,93.0,2116,38.666667,501,92.000000


Next, bagaimana jika kita ingin melakukan pemetaan? Misalnya:
- Jika kolom A bernilai 1-3, kita beri label 'Kurang'
- Jika kolom A bernilai 4-7, kita beri label 'Cukup'
- Jika kolom A bernilai 8-10, kita beri label 'Surplus'

In [213]:
def label(a):
    """Map a numeric value to a category label.

    Returns 'Kurang' when a < 4, 'Cukup' when 4 <= a < 8, and 'Surplus'
    for everything else.
    """
    if a < 4:
        return 'Kurang'
    return 'Cukup' if a < 8 else 'Surplus'

In [214]:
df['label_A'] = df['A'].apply(label)

In [215]:
df

Unnamed: 0,A,B,C,D,E,F,product_ABC,rata-rata_abc,cumsum_a,B_bagi_A,label_A
0,1,35,34,21.669048,3,36.0,1190,23.333333,1,35.000000,Kurang
1,2,64,35,46.083920,4,66.0,4480,33.666667,3,32.000000,Kurang
2,8,20,34,84.169048,10,28.0,5440,20.666667,11,2.500000,Surplus
3,1,23,44,14.866750,3,24.0,1012,22.666667,12,23.000000,Kurang
4,6,50,31,79.432236,8,56.0,9300,29.000000,18,8.333333,Cukup
...,...,...,...,...,...,...,...,...,...,...,...
95,4,82,31,75.432236,6,86.0,10168,39.000000,484,20.500000,Cukup
96,9,49,40,108.175445,11,58.0,17640,32.666667,493,5.444444,Surplus
97,7,30,30,79.522774,9,37.0,6300,22.333333,500,4.285714,Cukup
98,1,92,23,51.204168,3,93.0,2116,38.666667,501,92.000000,Kurang


Nah, selain `apply`, sebenarnya ada juga yang namanya pemetaan menggunakan dictionary. Contohnya:

In [228]:
# Lookup table: colour name -> price (mapped onto the 'Warna' column below)
warna_harga = dict(Kuning=15000, Hijau=10000, Biru=11000)

# One (colour, quantity) record per report row
df_laporan = pd.DataFrame(
    [
        ('Kuning', 10), ('Kuning', 10), ('Hijau', 20), ('Kuning', 15),
        ('Biru', 8), ('Biru', 20), ('Kuning', 12), ('Hijau', 18),
    ],
    columns=['Warna', 'Kuantitas'],
)

In [229]:
df_laporan

Unnamed: 0,Warna,Kuantitas
0,Kuning,10
1,Kuning,10
2,Hijau,20
3,Kuning,15
4,Biru,8
5,Biru,20
6,Kuning,12
7,Hijau,18


In [230]:
df_laporan['harga_satuan'] = df_laporan['Warna'].map(warna_harga)

In [231]:
df_laporan['total_harga'] = df_laporan['Kuantitas'] * df_laporan['harga_satuan']

In [232]:
df_laporan

Unnamed: 0,Warna,Kuantitas,harga_satuan,total_harga
0,Kuning,10,15000,150000
1,Kuning,10,15000,150000
2,Hijau,20,10000,200000
3,Kuning,15,15000,225000
4,Biru,8,11000,88000
5,Biru,20,11000,220000
6,Kuning,12,15000,180000
7,Hijau,18,10000,180000


### VI. Data Standardization

In [233]:
from sklearn import preprocessing

In [234]:
iris['sepal_length'].mean()

5.843333333333334

In [235]:
iris['sepal_length'].std()

0.828066127977863

In [240]:
iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [238]:
# Numeric measurement columns to standardize ('species' is a label, not a feature).
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# preprocessing.scale returns a bare NumPy array, so the column names and the
# row index are re-attached explicitly here. Without `columns=` the frame
# would carry integer labels 0..3 until renamed positionally later on.
# NOTE: scale divides by the population standard deviation (ddof=0), while
# pandas' .std() defaults to the sample standard deviation (ddof=1). That is
# why .std() on a scaled column prints ~1.0034 (= sqrt(150/149)), not exactly 1.
standardized_iris = pd.DataFrame(
    preprocessing.scale(iris[feature_columns]),
    columns=feature_columns,
    index=iris.index,
)
standardized_iris['species'] = iris['species']

In [241]:
standardized_iris.columns = iris.columns

In [242]:
standardized_iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,-0.900681,1.032057,-1.341272,-1.312977,setosa
1,-1.143017,-0.124958,-1.341272,-1.312977,setosa
2,-1.385353,0.337848,-1.398138,-1.312977,setosa
3,-1.506521,0.106445,-1.284407,-1.312977,setosa
4,-1.021849,1.263460,-1.341272,-1.312977,setosa
...,...,...,...,...,...
145,1.038005,-0.124958,0.819624,1.447956,virginica
146,0.553333,-1.281972,0.705893,0.922064,virginica
147,0.795669,-0.124958,0.819624,1.053537,virginica
148,0.432165,0.800654,0.933356,1.447956,virginica


In [246]:
standardized_iris['sepal_length'].mean()

-4.736951571734001e-16

In [244]:
standardized_iris['sepal_length'].std()

1.0033500931359767