In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import VarianceThreshold

In [11]:
# Load the dataset from the CSV file "airdata.csv" (path is relative to the working directory)
air_data = pd.read_csv("airdata.csv")  # raw frame as read from disk
air_data  # last expression of the cell, so the notebook renders the frame

Unnamed: 0,Avg_Wind,Avg_Temp,Avg_Humidity,Avg_Barometer,Avg_Visbility
0,6.13,28.88,0.72,1012.00,15.00
1,5.00,25.88,0.86,1012.63,10.13
2,4.75,26.38,0.85,1011.75,11.63
3,3.88,26.88,0.78,1010.00,13.13
4,4.38,26.88,0.81,1010.63,13.88
...,...,...,...,...,...
1456,4.50,28.38,0.80,1013.38,12.63
1457,5.50,28.88,0.80,1012.13,13.88
1458,4.75,28.50,0.80,1010.50,13.25
1459,4.38,28.88,0.82,1010.25,13.50


In [12]:
# Count the missing (NaN) entries in every column of the raw frame
air_data.isna().sum()

Avg_Wind         4
Avg_Temp         4
Avg_Humidity     4
Avg_Barometer    4
Avg_Visbility    4
dtype: int64

Remove the rows that contain missing values (each column has 4 missing entries).

In [None]:
# Drop every row that has at least one missing value; `air_data` itself is left untouched
df = air_data.dropna(axis=0, how="any")

In [None]:
# Sanity check: confirm that no missing values remain after dropping rows
df.isna().sum() # per-column NaN count

Avg_Wind         0
Avg_Temp         0
Avg_Humidity     0
Avg_Barometer    0
Avg_Visbility    0
dtype: int64

In [None]:
# Input features: every column of the cleaned frame except the target, Avg_Humidity
X = df.drop("Avg_Humidity", axis="columns")
X

Unnamed: 0,Avg_Wind,Avg_Temp,Avg_Barometer,Avg_Visbility
0,6.13,28.88,1012.00,15.00
1,5.00,25.88,1012.63,10.13
2,4.75,26.38,1011.75,11.63
3,3.88,26.88,1010.00,13.13
4,4.38,26.88,1010.63,13.88
...,...,...,...,...
1456,4.50,28.38,1013.38,12.63
1457,5.50,28.88,1012.13,13.88
1458,4.75,28.50,1010.50,13.25
1459,4.38,28.88,1010.25,13.50


In [None]:
# Outcome variable: the Avg_Humidity column of the cleaned frame
y = df.loc[:, "Avg_Humidity"]
y

0       0.72
1       0.86
2       0.85
3       0.78
4       0.81
        ... 
1456    0.80
1457    0.80
1458    0.80
1459    0.82
1460    0.80
Name: Avg_Humidity, Length: 1457, dtype: float64

### Data Splitting

In [17]:


# Bin the continuous target into (up to) ten quantile groups so the split can be
# stratified on it; duplicate bin edges are dropped rather than raising.
humidity_bins = pd.qcut(y, q=10, duplicates="drop")

# 80/20 shuffled split with a fixed seed, stratified on the humidity bins
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12019,
    shuffle=True,
    stratify=humidity_bins,
)

In [18]:
# Number of non-missing observations per feature in the training split
X_train.count()

Avg_Wind         1165
Avg_Temp         1165
Avg_Barometer    1165
Avg_Visbility    1165
dtype: int64

In [19]:
# Number of non-missing observations per feature in the test split
X_test.count()

Avg_Wind         292
Avg_Temp         292
Avg_Barometer    292
Avg_Visbility    292
dtype: int64

### Data Preprocessing

In [20]:
# Two-step pipeline: standardize the features, then fit a KNN regressor.
# Keeping the scaler inside the pipeline means it is fitted on whatever data
# the pipeline itself is fitted on.
knn_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor()),
])


### Hyperparameter Grid

In [None]:
# Hyperparameter grid. Keys follow the '<step>__<parameter>' convention so they
# reach the 'knn' step of the pipeline.
# NOTE(review): the recorded run selected k=24, the top of this range —
# consider widening the range to check whether a larger k does better.
param_grid = dict(
    knn__n_neighbors=list(range(1, 25)),  # k = 1 .. 24
    knn__weights=['distance'],            # single weighting scheme searched
    knn__p=[2],                           # distance metric power: 2 = Euclidean
)

### Grid Search

In [None]:
# Grid search over `param_grid` with 10-fold cross-validation, ranking the
# candidates by R²; n_jobs=-1 requests all available CPU cores.
grid_search = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)

In [23]:
# Run the grid search on the training split
grid_search.fit(X_train, y_train)

# Keep the best pipeline found by the search for evaluation on the test split
knn_best = grid_search.best_estimator_

# Report the winning hyperparameters and their cross-validated R²
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print(f"Best R² Score:{grid_search.best_score_:.4f}")

Fitting 10 folds for each of 24 candidates, totalling 240 fits
Best Parameters: {'knn__n_neighbors': 24, 'knn__p': 2, 'knn__weights': 'distance'}
Best R² Score:0.6254


### RMSE Results

In [24]:
# Grid search over the same pipeline and grid, now ranked by negative mean
# squared error (scikit-learn negates MSE so that larger scores are better).
grid_search = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
    verbose=1,
)

In [25]:
# Fit the pipeline with GridSearch (scored by negative mean squared error)
grid_search.fit(X_train, y_train)

# Best Parameters & Model Training
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# The scorer returns negated MSE, so take the magnitude to recover the MSE itself.
best_mse = np.abs(grid_search.best_score_)
print("Best MSE:", best_mse)

# Derive RMSE from the fitted search rather than from a pasted literal, so the
# reported RMSE always matches the MSE printed above.
# Note: this is the square root of the mean fold MSE, not the mean of fold RMSEs.
RMSE = np.sqrt(best_mse)
print(f"Best RMSE score: {RMSE:.4f}")

Fitting 10 folds for each of 24 candidates, totalling 240 fits
Best Parameters: {'knn__n_neighbors': 24, 'knn__p': 2, 'knn__weights': 'distance'}
Best MSE: 0.0008401405849885408
Best RMSE score: 0.0290


### Model Validation (Held-Out Test Set)

In [None]:
# Predict on the held-out test split with the best pipeline from the grid search.
# (A bare `y_test_pred` expression here displayed nothing, because only the last
# expression of a cell is rendered, so it has been dropped.)
y_test_pred = knn_best.predict(X_test)
# Metrics Evaluation Function
def evaluate(y_true, y_pred, set_name):
    """Summarise regression performance as a one-row DataFrame.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    set_name : str
        Label stored in the ``set`` column (e.g. ``'Test'``).

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``set``, ``rmse`` and ``rsq``.
    """
    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword is deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return pd.DataFrame({
        'set': [set_name],
        'rmse': [rmse],
        'rsq': [r2_score(y_true, y_pred)]
    })


# Score the test-set predictions
metrics_test = evaluate(y_true=y_test, y_pred=y_test_pred, set_name='Test')

# Collect the per-set metric frames into one table and print it
metric_frames = [metrics_test]
knn_metrics = pd.concat(metric_frames)
print(knn_metrics)

    set     rmse       rsq
0  Test  0.02831  0.671625
