In [212]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

## **1. Akses Dataset**

Akses dataset serangan hama dengan 7 jenis hama berbeda yang disertai kategori :

(T) -> jumlah area yang terkena serangan

(P) -> jumlah area yang mengalami kerusakan lebih dari 75%

In [213]:
dataset_dir = "C:\\Users\\Diputra_W\\Documents\\Campus\\Study\\Mata Kuliah\\Semester 7\\PKL\\FInal Project\\src\\penerapan-pengelolaan-hama-terpadu-tanaman-pangan.xlsx"

In [214]:
df = pd.ExcelFile(dataset_dir)

# Akses tiap halaman pada dataset (jika ada)
df.sheet_names

['padi',
 'jagung',
 'kedelai',
 'kacang tanah',
 'kacang hijau',
 'ubi kayu',
 'ubi jalar']

In [215]:
# Load data untuk halaman tertentu & tampilkan
padi_data = df.parse('ubi jalar')

print(f"Shape data : {padi_data}")
padi_data.head(178)

Shape data :           TAHUN  NO PROV            PROV   Tikus Unnamed: 4  Babi Unnamed: 6   
0           NaN      NaN             NaN       T          P     T          P  \
1          2018      1.0            Aceh       6          0    13          0   
2          2018      2.0  Sumatera Utara   158.9          0     0          0   
3          2018      3.0  Sumatera Barat     NaN        NaN   NaN        NaN   
4          2018      4.0            Riau     NaN        NaN     5          0   
..          ...      ...             ...     ...        ...   ...        ...   
171        2022     31.0          Maluku     NaN        NaN   4.8          0   
172        2022     32.0    Maluku Utara     NaN        NaN   NaN        NaN   
173        2022     33.0     Papua Barat      39          0   NaN        NaN   
174        2022     34.0           Papua     NaN        NaN   NaN        NaN   
175  2022 Total      NaN             NaN  146.58          0  31.5          0   

    Bercak Daun Coklat Unn

Unnamed: 0,TAHUN,NO PROV,PROV,Tikus,Unnamed: 4,Babi,Unnamed: 6,Bercak Daun Coklat,Unnamed: 8,Boleng,Unnamed: 10,TOTAL OPUT,Unnamed: 12
0,,,,T,P,T,P,T,P,T,P,T,P
1,2018,1.0,Aceh,6,0,13,0,,,3.5,0,22.5,0
2,2018,2.0,Sumatera Utara,158.9,0,0,0,0,0,7,0,165.9,0
3,2018,3.0,Sumatera Barat,,,,,,,,,,
4,2018,4.0,Riau,,,5,0,,,,,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,2022,31.0,Maluku,,,4.8,0,0.6,0,,,5.4,0
172,2022,32.0,Maluku Utara,,,,,,,,,,
173,2022,33.0,Papua Barat,39,0,,,,,37.5,0,76.5,0
174,2022,34.0,Papua,,,,,,,,,,


## **2. Preprocessing Data**

Membersihkan dataset dan mempersiapkan untuk dilakukan regresi linear :

- Perbaiki header

- Isi kolom -, null, nan, dengan nilai 0

- Pilih kolom/baris yang ingin digunakan dalam sistem

In [216]:
# Perbaikan header data
data = pd.read_excel(dataset_dir, header=[0, 1])
data.columns = [" ".join(col).strip() for col in data.columns.values]

# Cleaning kolom data
data.columns = data.columns.str.strip()
data.columns = data.columns.str.replace(" ", "_")
data.columns = data.columns.str.replace("/", "_")
data.columns = data.columns.str.replace("-", "_")

# Ganti nama kolom (tujuan untuk mempermudah proses kedepannya)
data.rename(columns={
    "TAHUN_Unnamed:_0_level_1": "TAHUN",
    "NO_PROV_Unnamed:_1_level_1": "KODE_PROVINSI",
    "PROV_Unnamed:_2_level_1": "PROVINSI",
}, inplace=True)

# Penanganan missing values
data.fillna(0, inplace=True)

# Menghilangkan baris yang tidak diperlukan
# rows_to_drop = [34, 69, 104, 139, 174]
# data = data.drop(index=rows_to_drop)

# Filtering kolom yang akan digunakan (kode provinsi, tahun, dan kolom _T)
columns_to_keep = ['KODE_PROVINSI'] + [col for col in data.columns if '_T' in col]
data_T = data[columns_to_keep]

data_T.head(40)

Unnamed: 0,KODE_PROVINSI,PENGGEREK_BATANG_PADI_T,WBC_T,TIKUS_T,BLAS_T,KRESEK_T,TUNGRO_T,KR_KH_T,TOTAL_OPUT_T
0,1.0,2818.7,2663.48,2722.45,1409.0,2738.0,0.0,0.0,12351.63
1,2.0,1799.0,478.56,2030.15,3042.95,1831.5,35.1,68.4,9285.66
2,3.0,116.15,430.7,2044.21,465.01,31.25,104.85,57.45,3249.62
3,4.0,952.9,435.35,658.25,485.65,117.55,25.0,0.0,2674.7
4,5.0,345.98,70.4,448.36,169.23,61.49,4.41,0.0,1099.87
5,6.0,4681.48,1728.57,8608.06,3076.3,2313.05,83.27,135.9,20626.63
6,7.0,837.95,60.25,796.24,614.1,285.75,34.5,0.5,2629.29
7,8.0,4717.0,1842.0,6765.0,3071.0,2885.0,1.0,26.0,19307.0
8,9.0,17.42,424.1,214.95,175.7,40.43,0.0,0.0,872.6
9,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Sesuai catatan sebelumnya, model akan mengguakan data kategori (T) dan dijumlahkan total untuk 1 provinsi untuk setiap tahunnya.

In [217]:
# Jumlahkan data dalam 5 tahun
data_T = data_T.groupby('KODE_PROVINSI').sum().reset_index()

print(f"Shape Data Final : {data_T.shape}")
data_T.head(40)

Shape Data Final : (35, 9)


Unnamed: 0,KODE_PROVINSI,PENGGEREK_BATANG_PADI_T,WBC_T,TIKUS_T,BLAS_T,KRESEK_T,TUNGRO_T,KR_KH_T,TOTAL_OPUT_T
0,0.0,490200.2575,214772.458483,437165.715689,228334.584,233929.871056,28655.665333,6600.11,1639659.0
1,1.0,14862.8,6480.21,12005.84,6582.85,11568.29,11.25,0.0,51511.24
2,2.0,9210.12,3167.51,8017.13,22837.48,16576.65,94.9,163.9,60067.69
3,3.0,1033.85,3836.66,8177.15,1776.77,274.8,552.82,128.65,15780.7
4,4.0,5297.481,2927.9,3236.9,2804.37,671.43,26.0,0.0,14964.08
5,5.0,2314.74,541.64,3304.04,1574.82,719.88,222.13,0.0,8677.25
6,6.0,18632.835,10267.38,30994.953083,13635.435,10670.095,128.72,694.65,85024.07
7,7.0,3391.05,3308.4,4065.74,3462.75,1421.95,229.25,0.5,15879.64
8,8.0,28428.77,20087.695,31846.199,16314.12,15140.7,159.25,1939.25,113916.0
9,9.0,588.09,1326.54,1116.22,2146.02,122.22,1.0,0.0,5300.09


## **3. Filtering Data**

Filtering bagian data yang akan digunakan dalam model regresi linear. Tentukan mana variabel bebas dan terikat.

In [218]:
# Variabel dependen
Y_T = data_T["TOTAL_OPUT_T"]

X_T = [col for col in data_T.columns if '_T' in col and col != "TOTAL_OPUT_T"]
X_T = data_T[X_T]

In [219]:
print(X_T)
print(X_T.shape)

    PENGGEREK_BATANG_PADI_T          WBC_T        TIKUS_T      BLAS_T   
0               490200.2575  214772.458483  437165.715689  228334.584  \
1                14862.8000    6480.210000   12005.840000    6582.850   
2                 9210.1200    3167.510000    8017.130000   22837.480   
3                 1033.8500    3836.660000    8177.150000    1776.770   
4                 5297.4810    2927.900000    3236.900000    2804.370   
5                 2314.7400     541.640000    3304.040000    1574.820   
6                18632.8350   10267.380000   30994.953083   13635.435   
7                 3391.0500    3308.400000    4065.740000    3462.750   
8                28428.7700   20087.695000   31846.199000   16314.120   
9                  588.0900    1326.540000    1116.220000    2146.020   
10                   0.0000       0.000000       0.000000       0.000   
11                 246.3600     162.290000     217.750000       6.200   
12               80673.0000   64329.100900   60967.

In [220]:
print(Y_T)
print(Y_T.shape)

0     1.639659e+06
1     5.151124e+04
2     6.006769e+04
3     1.578070e+04
4     1.496408e+04
5     8.677250e+03
6     8.502407e+04
7     1.587964e+04
8     1.139160e+05
9     5.300090e+03
10    0.000000e+00
11    7.931000e+02
12    3.086601e+05
13    2.439690e+05
14    3.411381e+04
15    1.580321e+05
16    3.840276e+04
17    2.002347e+04
18    2.905025e+04
19    1.703235e+04
20    3.097560e+04
21    1.728691e+04
22    1.104317e+04
23    3.395016e+04
24    5.625700e+02
25    1.069611e+04
26    5.570522e+04
27    7.667145e+04
28    9.386750e+04
29    1.396476e+04
30    3.946148e+04
31    1.441624e+04
32    4.387460e+03
33    7.147800e+03
34    8.324540e+03
Name: TOTAL_OPUT_T, dtype: float64
(35,)


## **4. Perhitungan Regresi Linear**

In [221]:
X_T = X_T.values
Y_T = Y_T.values.reshape(-1, 1)

#### 4.1. Train Test Split

In [222]:
X_train_T, X_test_T, Y_train_T, Y_test_T = train_test_split(X_T, Y_T, test_size=0.2, random_state=42)

In [223]:
print(f"Shape data latih sumbu X (Variabel Independen) : {X_train_T.shape}")
print(f"Shape data latih sumbu Y (Variabel dependen) : {Y_train_T.shape}")
print("\n============================================================\n")
print(f"Shape data uji sumbu X (Variabel Independen) : {X_test_T.shape}")
print(f"Shape data uji sumbu Y (Variabel dependen) : {Y_test_T.shape}")

Shape data latih sumbu X (Variabel Independen) : (28, 7)
Shape data latih sumbu Y (Variabel dependen) : (28, 1)


Shape data uji sumbu X (Variabel Independen) : (7, 7)
Shape data uji sumbu Y (Variabel dependen) : (7, 1)


#### 4.2. Model Regresi Linear

In [224]:
# Menambahkan variabel tambahan sebagai intercept/konstanta sesuai pada rumus regresi linear
X_train_T_intercept = np.hstack((np.ones((X_train_T.shape[0], 1)), X_train_T))
X_test_T_intercept = np.hstack((np.ones((X_test_T.shape[0], 1)), X_test_T))

In [225]:
# Cari nllai transpose dari matriks X
X_transpose = X_train_T_intercept.T

print(X_transpose)
print(X_transpose.shape)

[[1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
  1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
  1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
  1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
  1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
  1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
  1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00]
 [8.06730000e+04 2.84287700e+04 9.10950000e+03 5.88090000e+02
  4.59990000e+03 4.90200258e+05 5.29748100e+03 4.96123000e+03
  2.65092900e+04 2.31474000e+03 2.46360000e+02 1.48628000e+04
  9.21012000e+03 2.67796000e+03 1.03385000e+03 1.96995700e+04
  1.61802300e+04 1.00269200e+04 0.00000000e+00 5.61431000e+02
  1.09109500e+04 9.04021500e+03 1.86328350e+04 1.05501670e+04
  2.09264000e+03 3.39105000e+03 1.05255900e+04 3.29615000e+04]
 [6.43291009e+04 2.00876950e+04 1.26228650e+04 1.32654000e+03
  1.74960000e+03 2.14772458e+05 2.92790000e+03 1.56422000e+03
  9.62

In [226]:
# Kalikan Matriks Transpose X dengan Matriks X itu sendiri
multiplication_X = X_transpose @ X_train_T_intercept

print(multiplication_X)
print(multiplication_X.shape)

[[2.80000000e+01 8.25286446e+05 3.58583292e+05 7.47183084e+05
  3.99751350e+05 3.84387067e+05 4.59504107e+04 1.16227700e+04]
 [8.25286446e+05 2.51415033e+11 1.11701627e+11 2.24360259e+11
  1.17642373e+11 1.20271151e+11 1.43317912e+10 3.46850937e+09]
 [3.58583292e+05 1.11701627e+11 5.10422175e+10 9.91444600e+10
  5.28528503e+10 5.41616398e+10 6.35458605e+09 1.59499761e+09]
 [7.47183084e+05 2.24360259e+11 9.91444600e+10 2.01573867e+11
  1.04812929e+11 1.06935432e+11 1.27543157e+10 3.09421584e+09]
 [3.99751350e+05 1.17642373e+11 5.28528503e+10 1.04812929e+11
  5.57364973e+10 5.68607277e+10 6.71510039e+09 1.64827484e+09]
 [3.84387067e+05 1.20271151e+11 5.41616398e+10 1.06935432e+11
  5.68607277e+10 5.82583795e+10 6.87047645e+09 1.68545219e+09]
 [4.59504107e+04 1.43317912e+10 6.35458605e+09 1.27543157e+10
  6.71510039e+09 6.87047645e+09 8.83404217e+08 1.95339465e+08]
 [1.16227700e+04 3.46850937e+09 1.59499761e+09 3.09421584e+09
  1.64827484e+09 1.68545219e+09 1.95339465e+08 5.19240156e+07]]

In [227]:
# Cari invers dari perkalian matriks sebelumnya
inversed_multiplication_X = np.linalg.inv(multiplication_X)

print(inversed_multiplication_X)
print(inversed_multiplication_X.shape)

[[ 4.38623151e-02 -6.88558938e-07  2.18861086e-07  5.10781176e-07
  -4.62669550e-06  4.63421776e-06  4.29208199e-07 -6.15527441e-06]
 [-6.88558938e-07  1.86416180e-09 -7.36231035e-10 -1.30097510e-09
   3.29246114e-10 -1.31451863e-09 -1.03457924e-09  1.18806139e-08]
 [ 2.18861086e-07 -7.36231035e-10  2.64814974e-09  6.26605550e-10
   5.67171085e-11 -1.42226186e-09  2.20324927e-10 -2.60175183e-08]
 [ 5.10781176e-07 -1.30097510e-09  6.26605550e-10  1.14841204e-09
  -6.41434900e-10  8.53021090e-10  3.04147101e-10 -9.36442685e-09]
 [-4.62669550e-06  3.29246114e-10  5.67171085e-11 -6.41434900e-10
   5.54953690e-09 -4.77737747e-09 -2.74743403e-10 -4.53347749e-09]
 [ 4.63421776e-06 -1.31451863e-09 -1.42226186e-09  8.53021090e-10
  -4.77737747e-09  7.11978436e-09 -5.72086856e-10  2.32564314e-09]
 [ 4.29208199e-07 -1.03457924e-09  2.20324927e-10  3.04147101e-10
  -2.74743403e-10 -5.72086856e-10  1.58494533e-08  1.17864274e-08]
 [-6.15527441e-06  1.18806139e-08 -2.60175183e-08 -9.36442685e-09
  -

In [228]:
# Kalikan matriks transpose X dengan matriks output Y_T
multiplication_Y = X_transpose @ Y_train_T

print(multiplication_Y)
print(multiplication_Y.shape)

[[2.77276442e+06]
 [8.43190743e+11]
 [3.76852378e+11]
 [7.52675478e+11]
 [3.96268753e+11]
 [4.05043258e+11]
 [4.81050135e+10]
 [1.17387133e+10]]
(8, 1)


In [229]:
# Kalikan semua nilai tadi untuk memperoleh nilai koefisien untuk masing - masing variabel independen serta konstanta
coefficient = inversed_multiplication_X @ multiplication_Y

In [230]:
print(coefficient)
print(coefficient.shape)

[[-9.77888703e-09]
 [ 1.00000000e+00]
 [ 1.00000000e+00]
 [ 1.00000000e+00]
 [ 1.00000000e+00]
 [ 1.00000000e+00]
 [ 1.00000000e+00]
 [ 1.00000000e+00]]
(8, 1)


In [231]:
Y_pred = X_test_T_intercept @ coefficient

#### 4.3. Evaluasi Model

In [232]:
mse = mean_squared_error(Y_test_T, Y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test_T, Y_pred)

In [233]:
print("\nEvaluasi Model:")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-Squared (R²): {r2}")

# Menampilkan hasil prediksi dan nilai aktual
print("\nPerbandingan Y aktual vs Y prediksi:")
for actual, pred in zip(Y_test_T, Y_pred):
    print(f"Y aktual: {actual[0]}, Y prediksi: {pred[0]}")


Evaluasi Model:
Mean Squared Error (MSE): 1.1816215606314677e-16
Root Mean Squared Error (RMSE): 1.087024176654534e-08
R-Squared (R²): 1.0

Perbandingan Y aktual vs Y prediksi:
Y aktual: 55705.22, Y prediksi: 55705.21999998188
Y aktual: 243968.986, Y prediksi: 243968.98599999922
Y aktual: 562.5699999999999, Y prediksi: 562.5699999900839
Y aktual: 17286.910000000003, Y prediksi: 17286.909999990556
Y aktual: 158032.10838888888, Y prediksi: 158032.10838889267
Y aktual: 13964.755499999997, Y prediksi: 13964.755499989336
Y aktual: 17032.354, Y prediksi: 17032.35399998649
