In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the air pollution dataset from its CSV file and print the resulting frame
dataset_path = "global air pollution dataset.csv"
df = pd.read_csv(dataset_path)
print(df)

                        Country              City  AQI Value AQI Category  \
0            Russian Federation        Praskoveya         51     Moderate   
1                        Brazil  Presidente Dutra         41         Good   
2                         Italy   Priolo Gargallo         66     Moderate   
3                        Poland         Przasnysz         34         Good   
4                        France          Punaauia         22         Good   
...                         ...               ...        ...          ...   
23458                     India      Gursahaiganj        184    Unhealthy   
23459                    France            Sceaux         50         Good   
23460                     India          Mormugao         50         Good   
23461  United States of America       Westerville         71     Moderate   
23462                  Malaysia            Marang         70     Moderate   

       CO AQI Value CO AQI Category  Ozone AQI Value Ozone AQI Category  \


In [None]:
# Descriptive statistics for the 'AQI Value' column
kolom_aqi_value = df['AQI Value']

# Central tendency
mean = np.mean(kolom_aqi_value)
median = np.median(kolom_aqi_value)
# Mode: position of the largest count in the per-value histogram
# (np.bincount only accepts non-negative integers)
modus = np.bincount(kolom_aqi_value).argmax()

# First and third quartiles, computed in a single call
quartile_1, quartile_3 = np.percentile(kolom_aqi_value, [25, 75])

# Spread, using NumPy's default ddof=0
variance = np.var(kolom_aqi_value)
std_deviation = np.std(kolom_aqi_value)

In [None]:
# Print the descriptive statistics, one "label value" line each
print("Statistik Deskriptif pada AQI Value:")
for stat_label, stat_value in (
    ("Mean:", mean),
    ("Modus:", modus),
    ("Median:", median),
    ("Quartile 1:", quartile_1),
    ("Quartile 3:", quartile_3),
    ("Standar Deviasi:", std_deviation),
    ("Variance:", variance),
):
    print(stat_label, stat_value)

Statistik Deskriptif pada AQI Value:
Mean: 72.01086817542513
Modus: 50
Median: 55.0
Quartile 1: 39.0
Quartile 3: 79.0
Standar Deviasi: 56.054025694475605
Variance: 3142.053796556931


In [None]:
# Preprocessing: drop the columns not used as model inputs, then rescale the
# four pollutant sub-index columns with Min-Max scaling.
# NOTE(review): the scaler is fit on the whole frame, before the train/test
# split performed in a later cell, so test-set min/max values influence the
# scaling. Consider fitting on the training split only.
# NOTE(review): `df` is overwritten here, so re-running this cell on the
# already-reduced frame raises KeyError for the dropped columns.
dropped_columns = [
    'Country',
    'City',
    'AQI Category',
    'CO AQI Category',
    'Ozone AQI Category',
    'NO2 AQI Category',
    'PM2.5 AQI Category',
]
scaled_columns = ['CO AQI Value', 'Ozone AQI Value', 'NO2 AQI Value', 'PM2.5 AQI Value']

df = df.drop(columns=dropped_columns)
scaler = MinMaxScaler()
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

In [None]:
# Separate the target (y) from the feature columns (X)
target_column = 'AQI Value'
y = df[target_column]
X = df.drop(columns=target_column)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Hyperparameter tuning with GridSearchCV.
# The grid has 3 x 3 x 3 = 27 candidates; with 5-fold cross-validation that is
# 135 forest fits plus one final refit on the full training set.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'max_features': [1.0, 'sqrt', 'log2']
}
# Fixed seed so every candidate forest is reproducible.
model = RandomForestRegressor(random_state=42)
# n_jobs=-1 runs the candidate/fold fits on all available cores. Because the
# estimator's random_state is fixed, the selected model is the same as in a
# serial run; only wall-clock time changes.
# No `scoring` is given, so the estimator's default score (R^2 for a
# regressor) ranks the candidates.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# Take the best estimator found by the grid search (the fitted search object's
# `best_estimator_` attribute) for use in the prediction step.
best_model = grid_search.best_estimator_

In [None]:
# Predict the target for the held-out test features with the tuned model
y_pred = best_model.predict(X_test)

In [None]:
# Model evaluation on the test split: MAE, MSE and R^2, in that order
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [None]:
# Print the evaluation metrics, one "label value" line each
for metric_label, metric_value in (
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("R-squared:", r2),
):
    print(metric_label, metric_value)

Mean Absolute Error: 0.22601001491583209
Mean Squared Error: 6.939175703174943
R-squared: 0.9978224182456765
