In [245]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

## **1. Akses Dataset**

Akses dataset serangan hama dengan 7 jenis hama berbeda yang disertai kategori :

(T) -> jumlah area yang terkena serangan

(P) -> jumlah area yang mengalami kerusakan lebih dari 75%

In [246]:
dataset_dir = "src\\penerapan-pengelolaan-hama-terpadu-tanaman-pangan.xlsx"

In [247]:
df = pd.ExcelFile(dataset_dir)

# Akses tiap halaman pada dataset (jika ada)
df.sheet_names

['padi',
 'jagung',
 'kedelai',
 'kacang tanah',
 'kacang hijau',
 'ubi kayu',
 'ubi jalar']

In [248]:
# Load data untuk halaman tertentu & tampilkan
data = df.parse('padi')
data.head()

Unnamed: 0,TAHUN,NO PROV,PROV,PENGGEREK BATANG PADI,Unnamed: 4,WBC,Unnamed: 6,TIKUS,Unnamed: 8,BLAS,Unnamed: 10,KRESEK,Unnamed: 12,TUNGRO,Unnamed: 14,KR/KH,Unnamed: 16,TOTAL OPUT,Unnamed: 18
0,,,,T,P,T,P,T,P,T,P,T,P,T,P,T,P,T,P
1,2018.0,1.0,Aceh,2818.7,0,2663.48,95.25,2722.45,0,1409,0,2738,0,0,0,0,0,12351.63,95.25
2,2018.0,2.0,Sumatera Utara,1799.0,0,478.56,3.18,2030.15,22.5,3042.95,0,1831.5,0,35.1,0,68.4,0,9285.66,25.68
3,2018.0,3.0,Sumatera Barat,116.15,0,430.7,11.6,2044.21,136.25,465.01,14.05,31.25,0.5,104.85,5.25,57.45,14.5,3249.62,182.15
4,2018.0,4.0,Riau,952.9,0,435.35,0,658.25,1,485.65,0,117.55,0,25,0,,,2674.7,1


## **2. Preprocessing Data**

Membersihkan dataset dan mempersiapkan untuk dilakukan regresi linear :

- Perbaiki header

- Isi kolom -, null, nan, dengan nilai 0

- Pilih kolom/baris yang ingin digunakan dalam sistem

In [249]:
# Perbaikan header data
data = pd.read_excel(dataset_dir, sheet_name="padi", header=[0, 1])
data.columns = [" ".join(col).strip() for col in data.columns.values]

# Cleaning kolom data
data.columns = data.columns.str.strip()
data.columns = data.columns.str.replace(" ", "_")
data.columns = data.columns.str.replace("/", "_")
data.columns = data.columns.str.replace("-", "_")

# Ganti nama kolom (tujuan untuk mempermudah proses kedepannya)
data.rename(columns={
    "TAHUN_Unnamed:_0_level_1": "TAHUN",
    "NO_PROV_Unnamed:_1_level_1": "KODE_PROVINSI",
    "PROV_Unnamed:_2_level_1": "PROVINSI",
}, inplace=True)

# Penanganan missing values
data.fillna(0, inplace=True)

# Menghilangkan baris yang tidak diperlukan
rows_to_drop = [34, 69, 104, 139, 174]
data = data.drop(index=rows_to_drop)

# Filtering kolom yang akan digunakan (kode provinsi, tahun, dan kolom _T)
columns_to_keep = ['TAHUN'] + ['KODE_PROVINSI'] + [col for col in data.columns if '_T' in col]
data_T = data[columns_to_keep]
data_T['TAHUN'] = pd.to_numeric(data_T['TAHUN'], errors='coerce')

# data_T.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_T['TAHUN'] = pd.to_numeric(data_T['TAHUN'], errors='coerce')


Sesuai catatan sebelumnya, model akan mengguakan data kategori (T). Lalu kolom tahun tetap digunakan untuk memproses data secara historis.

## **3. Filtering Data**

Filtering bagian data yang akan digunakan dalam model regresi linear. Tentukan mana variabel bebas dan terikat.

In [250]:
# Variabel dependen
Y_T = data_T["TOTAL_OPUT_T"]

X_T = ['TAHUN'] + [col for col in data_T.columns if '_T' in col and col != "TOTAL_OPUT_T"]
X_T = data_T[X_T]

In [251]:
print(X_T)
print(X_T.shape) # Berisi seluruh jenis hama + kolom tahun untuk data trend historis

     TAHUN  PENGGEREK_BATANG_PADI_T    WBC_T  TIKUS_T   BLAS_T  KRESEK_T   
0     2018                  2818.70  2663.48  2722.45  1409.00   2738.00  \
1     2018                  1799.00   478.56  2030.15  3042.95   1831.50   
2     2018                   116.15   430.70  2044.21   465.01     31.25   
3     2018                   952.90   435.35   658.25   485.65    117.55   
4     2018                   345.98    70.40   448.36   169.23     61.49   
..     ...                      ...      ...      ...      ...       ...   
169   2022                  2027.76    37.45   995.96   356.16    826.90   
170   2022                  1422.75   203.50   295.00    48.00     26.75   
171   2022                   282.80     9.00   151.50     8.50      0.00   
172   2022                   645.70   229.80    34.40    48.50     61.00   
173   2022                   259.40     9.00    18.25   333.90    626.50   

     TUNGRO_T  KR_KH_T  
0        0.00     0.00  
1       35.10    68.40  
2      104.8

In [252]:
print(Y_T)
print(Y_T.shape)

0      12351.63
1       9285.66
2       3249.62
3       2674.70
4       1099.87
         ...   
169     4283.73
170     2001.50
171      465.20
172     1019.40
173     2426.30
Name: TOTAL_OPUT_T, Length: 170, dtype: float64
(170,)


## **4.1. Perhitungan Regresi Linear (LEAST SQUARE)**

In [253]:
X_T = X_T.values
Y_T = Y_T.values.reshape(-1, 1)

#### 4.1.1. Train Test Split

In [254]:
X_train_T, X_test_T, Y_train_T, Y_test_T = train_test_split(X_T, Y_T, test_size=0.2, random_state=42)

In [255]:
print(f"Shape data latih sumbu X (Variabel Independen) : {X_train_T.shape}")
print(f"Shape data latih sumbu Y (Variabel dependen) : {Y_train_T.shape}")
print("\n============================================================\n")
print(f"Shape data uji sumbu X (Variabel Independen) : {X_test_T.shape}")
print(f"Shape data uji sumbu Y (Variabel dependen) : {Y_test_T.shape}")

Shape data latih sumbu X (Variabel Independen) : (136, 8)
Shape data latih sumbu Y (Variabel dependen) : (136, 1)


Shape data uji sumbu X (Variabel Independen) : (34, 8)
Shape data uji sumbu Y (Variabel dependen) : (34, 1)


#### 4.1.2. Model Regresi Linear

In [256]:
# Menambahkan variabel tambahan sebagai intercept/konstanta sesuai pada rumus regresi linear
X_train_T_intercept = np.hstack((np.ones((X_train_T.shape[0], 1)), X_train_T))
X_test_T_intercept = np.hstack((np.ones((X_test_T.shape[0], 1)), X_test_T))

In [257]:
# Cari nllai transpose dari matriks X
X_transpose = X_train_T_intercept.T

# print(X_transpose)
# print(X_transpose.shape)

In [258]:
# Kalikan Matriks Transpose X dengan Matriks X itu sendiri
multiplication_X = X_transpose @ X_train_T_intercept

# print(multiplication_X)
# print(multiplication_X.shape)

In [259]:
# Cari invers dari perkalian matriks sebelumnya
inversed_multiplication_X = np.linalg.inv(multiplication_X)

# print(inversed_multiplication_X)
# print(inversed_multiplication_X.shape)

In [260]:
# Kalikan matriks transpose X dengan matriks output Y_T
multiplication_Y = X_transpose @ Y_train_T

print(multiplication_Y)
print(multiplication_Y.shape)

[[1.27458887e+06]
 [2.57477141e+09]
 [1.02898124e+10]
 [6.87532541e+09]
 [8.99224270e+09]
 [5.34147030e+09]
 [6.07997439e+09]
 [3.60742270e+08]
 [1.94042724e+08]]
(9, 1)


In [261]:
# Kalikan semua nilai tadi untuk memperoleh nilai koefisien untuk masing - masing variabel independen serta konstanta
coefficient = inversed_multiplication_X @ multiplication_Y

In [262]:
print(coefficient)
print(coefficient.shape)

[[-2.00988323e-05]
 [ 9.37036082e-09]
 [ 1.00000000e+00]
 [ 1.00000000e+00]
 [ 1.00000000e+00]
 [ 1.00000000e+00]
 [ 1.00000000e+00]
 [ 1.00000000e+00]
 [ 1.00000000e+00]]
(9, 1)


In [263]:
# Prediksi Data Uji Menggunakan Koefisien Yang Diperoleh
Y_pred = X_test_T_intercept @ coefficient

#### 4.1.3. Evaluasi Model

In [264]:
mse = mean_squared_error(Y_test_T, Y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test_T, Y_pred)

In [265]:
print("\nEvaluasi Model:")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-Squared (R²): {r2}")

# Menampilkan hasil prediksi dan nilai aktual
print("\nPerbandingan Y aktual vs Y prediksi:")
for actual, pred in zip(Y_test_T, Y_pred):
    print(f"Y aktual: {actual[0]}, Y prediksi: {pred[0]}")


Evaluasi Model:
Mean Squared Error (MSE): 1.3774707031982975e-12
Root Mean Squared Error (RMSE): 1.1736569785070497e-06
R-Squared (R²): 1.0

Perbandingan Y aktual vs Y prediksi:
Y aktual: 2599.311, Y prediksi: 2599.3109988485044
Y aktual: 1685.6, Y prediksi: 1685.599998810462
Y aktual: 7750.799999999999, Y prediksi: 7750.799998835563
Y aktual: 9727.28, Y prediksi: 9727.279998807346
Y aktual: 363.45, Y prediksi: 363.4499988483011
Y aktual: 16403.0, Y prediksi: 16402.999998850042
Y aktual: 2001.5, Y prediksi: 2001.499998847893
Y aktual: 3290.63, Y prediksi: 3290.6299988194946
Y aktual: 3686.2499999999973, Y prediksi: 3686.2499988393984
Y aktual: 10700.970000000001, Y prediksi: 10700.969998821922
Y aktual: 11845.215000000002, Y prediksi: 11845.214998808793
Y aktual: 5237.34, Y prediksi: 5237.339998849027
Y aktual: 1907.77, Y prediksi: 1907.7699988352433
Y aktual: 49154.0, Y prediksi: 49153.999998818246
Y aktual: 9350.89, Y prediksi: 9350.889998825443
Y aktual: 6952.469999999997, Y predik

## **4.2. Perhitungan Regresi Linear (GAUSS ELIMINATION)**

#### 4.2.1. Function Eliminasi Gauss



In [266]:
# Function Eliminasi Gauss
def gauss_elimination(A, b):
    n = len(b)
    for i in range(n):
        # Partial pivoting
        max_row = max(range(i, n), key=lambda x: abs(A[x][i]))
        A[[i, max_row]] = A[[max_row, i]]
        b[[i, max_row]] = b[[max_row, i]]
        
        # Matriks Segitiga Atas
        for j in range(i + 1, n):
            factor = A[j][i] / A[i][i]
            A[j, i:] -= factor * A[i, i:]
            b[j] -= factor * b[i]

    x = np.zeros(n)
    for i in range(n - 1, -1, -1):
        x[i] = (b[i] - np.dot(A[i, i + 1:], x[i + 1:])) / A[i][i]
    return x

#### 4.2.2. Aplikasi Function Gauss Pada Data

In [267]:
A = X_train_T_intercept.T @ X_train_T_intercept
b = X_train_T_intercept.T @ Y_train_T

In [268]:
coefficient_gauss = gauss_elimination(A.copy(), b.copy())
print(coefficient_gauss)

[-1.25620905e-05  6.21871519e-09  1.00000000e+00  1.00000000e+00
  1.00000000e+00  1.00000000e+00  1.00000000e+00  1.00000000e+00
  1.00000000e+00]


#### 4.2.3 Evaluasi Model

In [269]:
Y_pred_gauss = X_test_T_intercept @ coefficient_gauss

In [270]:
if Y_pred_gauss.ndim == 1:
    Y_pred_gauss = Y_pred_gauss.reshape(-1, 1)

if Y_test_T.ndim == 1:
    Y_test_T = Y_test_T.reshape(-1, 1)


In [271]:
# Evaluasi Model 
mse = mean_squared_error(Y_test_T, Y_pred_gauss)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test_T, Y_pred_gauss)

In [272]:
print("\nEvaluasi Model:")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-Squared (R²): {r2}")

# Menampilkan hasil prediksi dan nilai aktual
print("\nPerbandingan Y aktual vs Y prediksi:")
for actual, pred in zip(Y_test_T, Y_pred_gauss):
    print(f"Y aktual: {actual[0]}, Y prediksi: {pred[0]}")



Evaluasi Model:
Mean Squared Error (MSE): 9.084364441806396e-17
Root Mean Squared Error (RMSE): 9.531193231598233e-09
R-Squared (R²): 1.0

Perbandingan Y aktual vs Y prediksi:
Y aktual: 2599.311, Y prediksi: 2599.3110000125344
Y aktual: 1685.6, Y prediksi: 1685.599999987227
Y aktual: 7750.799999999999, Y prediksi: 7750.800000004956
Y aktual: 9727.28, Y prediksi: 9727.279999985787
Y aktual: 363.45, Y prediksi: 363.4500000123312
Y aktual: 16403.0, Y prediksi: 16403.000000012904
Y aktual: 2001.5, Y prediksi: 2001.500000012062
Y aktual: 3290.63, Y prediksi: 3290.6299999936136
Y aktual: 3686.2499999999973, Y prediksi: 3686.250000006453
Y aktual: 10700.970000000001, Y prediksi: 10700.969999994564
Y aktual: 11845.215000000002, Y prediksi: 11845.214999987136
Y aktual: 5237.34, Y prediksi: 5237.340000012898
Y aktual: 1907.77, Y prediksi: 1907.7700000043835
Y aktual: 49154.0, Y prediksi: 49153.99999999527
Y aktual: 9350.89, Y prediksi: 9350.889999998313
Y aktual: 6952.469999999997, Y prediksi: 

# **LIBRARY**

Selain perhitungan manual berikut juga disertakan perhitungan koefisien regresi dengan bantuan library scikit-learn 

In [273]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score, KFold

In [274]:
X_train_T, X_test_T, y_train_t, y_test_t = train_test_split(X_T, Y_T, test_size=0.2, random_state=42)

In [275]:
# Inisialisasi Model & Latih Model 
model_t = LinearRegression()
model_t.fit(X_train_T, y_train_t)

In [276]:
# Nilai intercept untuk model TOTAL_T
intercept_t = model_t.intercept_
coefficients_t = model_t.coef_

print("Intercept untuk TOTAL_T:", intercept_t)
print("Koefisien untuk TOTAL_T:", coefficients_t)


Intercept untuk TOTAL_T: [8.31278157e-10]
Koefisien untuk TOTAL_T: [[-4.10918687e-13  1.00000000e+00  1.00000000e+00  1.00000000e+00
   1.00000000e+00  1.00000000e+00  1.00000000e+00  1.00000000e+00]]


In [277]:
y_pred_t = model_t.predict(X_test_T)

# Evaluate for Total_T
print("== Model Performance for TOTAL_OPUT_T ==")
print(f"MSE: {mean_squared_error(y_test_t, y_pred_t):.2f}, \nR2: {r2_score(y_test_t, y_pred_t):.2f}")



== Model Performance for TOTAL_OPUT_T ==
MSE: 0.00, 
R2: 1.00


In [278]:
# Function to preprocess the data from the selected sheet
def preprocess_data(dataset, sheet_name):
    # Load the data for the selected plant type
    data = pd.read_excel(dataset, sheet_name=sheet_name, header=[0, 1])
    data.columns = [" ".join(col).strip() for col in data.columns.values]
    data.columns = (
        data.columns.str.strip()
        .str.replace(" ", "_")
        .str.replace("/", "_")
        .str.replace("-", "_")
    )
    data.rename(
        columns={
            "TAHUN_Unnamed:_0_level_1": "TAHUN",
            "NO_PROV_Unnamed:_1_level_1": "KODE_PROVINSI",
            "PROV_Unnamed:_2_level_1": "PROVINSI",
        },
        inplace=True,
    )
    data.fillna(0, inplace=True)
    return data