# Pengenalan Dasar Library Pandas

Notes:


*   Dokumentasi Pandas library: https://pandas.pydata.org/docs/index.html
*   Tutorial Basic Pandas: https://pandas.pydata.org/docs/user_guide/10min.html#min



In [21]:
# import libraries
import numpy as np
import pandas as pd

# Struktur Data pada Pandas

Pandas menyediakan dua jenis struktur data:


*   **Series**: Array satu dimensi dengan berbagai tipe data seperti integers, strings, Python objects, dll.
*   **DataFrame**: Array dua dimensi yang terdiri dari baris dan kolom.


In [22]:
# Membuat data array satu dimensi dengan Pandas
ser = pd.Series([1, 3, 5, np.nan, 6, 8])
print(ser)

# Membuat data array dua dimensi dengan isi random
df = pd.DataFrame(np.random.randn(6, 4), columns=list("ABCD"))
print('\n')
print(df)

# Membuat data dua dimensi dari dictionary
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": np.array([3] * 4, dtype="int32"),
        "C": pd.Categorical(["test", "train", "test", "train"]),
        "D": "foo",
    }
)

print('\n', df2)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


          A         B         C         D
0 -0.386651  0.191213 -1.350897 -0.210658
1 -0.173246 -0.231973 -1.482147 -1.486960
2  0.806852 -0.140569  0.429632 -0.692796
3  0.747919 -1.195263 -0.560867 -0.651959
4 -0.789774  0.403004 -1.630550 -0.511864
5  0.828345 -0.439653 -1.895943 -0.671595

      A  B      C    D
0  1.0  3   test  foo
1  1.0  3  train  foo
2  1.0  3   test  foo
3  1.0  3  train  foo


# Bermain dengan Pandas

In [23]:
# Print beberapa data teratas
print('Data Teratas:')
print(df.head())
print(df.head(3))

# Print beberapa data terakhir
print('\nData Terakhir:')
print(df.tail())
print(df.tail(3))

Data Teratas:
          A         B         C         D
0 -0.386651  0.191213 -1.350897 -0.210658
1 -0.173246 -0.231973 -1.482147 -1.486960
2  0.806852 -0.140569  0.429632 -0.692796
3  0.747919 -1.195263 -0.560867 -0.651959
4 -0.789774  0.403004 -1.630550 -0.511864
          A         B         C         D
0 -0.386651  0.191213 -1.350897 -0.210658
1 -0.173246 -0.231973 -1.482147 -1.486960
2  0.806852 -0.140569  0.429632 -0.692796

Data Terakhir:
          A         B         C         D
1 -0.173246 -0.231973 -1.482147 -1.486960
2  0.806852 -0.140569  0.429632 -0.692796
3  0.747919 -1.195263 -0.560867 -0.651959
4 -0.789774  0.403004 -1.630550 -0.511864
5  0.828345 -0.439653 -1.895943 -0.671595
          A         B         C         D
3  0.747919 -1.195263 -0.560867 -0.651959
4 -0.789774  0.403004 -1.630550 -0.511864
5  0.828345 -0.439653 -1.895943 -0.671595


In [24]:
# Print nama kolom
print(df.columns)

Index(['A', 'B', 'C', 'D'], dtype='object')


In [25]:
# Konversi dari DataFrame ke array NumPy
df_np = df.to_numpy()
print(df_np)

[[-0.38665124  0.19121251 -1.35089704 -0.21065767]
 [-0.17324611 -0.23197347 -1.48214694 -1.48695955]
 [ 0.80685179 -0.14056857  0.42963165 -0.6927961 ]
 [ 0.74791867 -1.19526278 -0.5608673  -0.65195944]
 [-0.78977371  0.40300389 -1.63055048 -0.51186376]
 [ 0.82834492 -0.43965252 -1.89594337 -0.67159459]]


# Selection pada Pandas

In [26]:
# Menampilkan data pada kolom tertentu
a = df['A']
print('A\n', a)

# Menampilkan data pada baris tertentu
brs = df[2:]
print('\nbaris\n', brs)

A
 0   -0.386651
1   -0.173246
2    0.806852
3    0.747919
4   -0.789774
5    0.828345
Name: A, dtype: float64

baris
           A         B         C         D
2  0.806852 -0.140569  0.429632 -0.692796
3  0.747919 -1.195263 -0.560867 -0.651959
4 -0.789774  0.403004 -1.630550 -0.511864
5  0.828345 -0.439653 -1.895943 -0.671595


In [27]:
# Menampilkan data menggunakan label, berdasarkan baris
ab = df.loc[:, ['A', 'B']]
print('AB\n', ab)

# Menampilkan data pada baris tertentu,  berdasarkan baris
brs = df.loc[0:2, ['A']]
print('\nBaris\n', brs)

AB
           A         B
0 -0.386651  0.191213
1 -0.173246 -0.231973
2  0.806852 -0.140569
3  0.747919 -1.195263
4 -0.789774  0.403004
5  0.828345 -0.439653

Baris
           A
0 -0.386651
1 -0.173246
2  0.806852


In [31]:
# Menampilkan data menggunakan iloc (index)
ab = df.iloc[:, 0:3]
print('AB\n', ab)

# Menampilkan data pada baris tertentu dengan iloc (index)
brs = df.iloc[0:2, 0]
print('\nBaris\n', brs)

AB
           A         B         C
0 -0.386651  0.191213 -1.350897
1 -0.173246 -0.231973 -1.482147
2  0.806852 -0.140569  0.429632
3  0.747919 -1.195263 -0.560867
4 -0.789774  0.403004 -1.630550
5  0.828345 -0.439653 -1.895943

Baris
 0   -0.386651
1   -0.173246
Name: A, dtype: float64


# Statistika pada Pandas

In [32]:
print(df)

          A         B         C         D
0 -0.386651  0.191213 -1.350897 -0.210658
1 -0.173246 -0.231973 -1.482147 -1.486960
2  0.806852 -0.140569  0.429632 -0.692796
3  0.747919 -1.195263 -0.560867 -0.651959
4 -0.789774  0.403004 -1.630550 -0.511864
5  0.828345 -0.439653 -1.895943 -0.671595


In [33]:
# Menghitung nilai rata-rata
print(df.mean())              # per kolom
print('\n', df.mean(axis=1))  # per baris

A    0.172241
B   -0.235540
C   -1.081796
D   -0.704305
dtype: float64
0   -0.439248
1   -0.843582
2    0.100780
3   -0.415043
4   -0.632296
5   -0.544711
dtype: float64


# Explore Data

In [35]:
print(df)

          A         B         C         D
0 -0.386651  0.191213 -1.350897 -0.210658
1 -0.173246 -0.231973 -1.482147 -1.486960
2  0.806852 -0.140569  0.429632 -0.692796
3  0.747919 -1.195263 -0.560867 -0.651959
4 -0.789774  0.403004 -1.630550 -0.511864
5  0.828345 -0.439653 -1.895943 -0.671595


# Menyimpan DataFrame

In [34]:
# Menyimpan dalam file .csv
df.to_csv("foo.csv")

# Menyimpan dalam file excel
df.to_excel("foo.xlsx", sheet_name="Sheet1")