In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from scipy.stats import normaltest, zscore, pearsonr
import pickle
import joblib

In [33]:
# Load dataset
df = pd.read_csv('weatherHistory.csv')

In [34]:
# Drop the 'Formatted Date' column
df.drop('Formatted Date', axis=1, inplace=True)

In [35]:
df

Unnamed: 0,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.
...,...,...,...,...,...,...,...,...,...,...,...
96448,Partly Cloudy,rain,26.016667,26.016667,0.43,10.9963,31.0,16.1000,0.0,1014.36,Partly cloudy starting in the morning.
96449,Partly Cloudy,rain,24.583333,24.583333,0.48,10.0947,20.0,15.5526,0.0,1015.16,Partly cloudy starting in the morning.
96450,Partly Cloudy,rain,22.038889,22.038889,0.56,8.9838,30.0,16.1000,0.0,1015.66,Partly cloudy starting in the morning.
96451,Partly Cloudy,rain,21.522222,21.522222,0.60,10.5294,20.0,16.1000,0.0,1015.95,Partly cloudy starting in the morning.


In [36]:
# 1. Null Value Treatment
df['Precip Type'].fillna(df['Precip Type'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Precip Type'].fillna(df['Precip Type'].mode()[0], inplace=True)


In [37]:
# 2. Categorical Variable Treatment
le = LabelEncoder()
df['Precip Type'] = le.fit_transform(df['Precip Type'])
df['Summary'] = le.fit_transform(df['Summary'])
df['Daily Summary'] = le.fit_transform(df['Daily Summary'])

In [38]:
df

Unnamed: 0,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,19,0,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,197
1,19,0,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,197
2,17,0,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,197
3,19,0,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,197
4,17,0,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,197
...,...,...,...,...,...,...,...,...,...,...,...
96448,19,0,26.016667,26.016667,0.43,10.9963,31.0,16.1000,0.0,1014.36,170
96449,19,0,24.583333,24.583333,0.48,10.0947,20.0,15.5526,0.0,1015.16,170
96450,19,0,22.038889,22.038889,0.56,8.9838,30.0,16.1000,0.0,1015.66,170
96451,19,0,21.522222,21.522222,0.60,10.5294,20.0,16.1000,0.0,1015.95,170


In [39]:

def perform_feature_selection(data, target_column, corr_threshold=0.0, p_threshold=0.05):
    selected_features = []
    numeric_data = data.select_dtypes(include='number')  # Only numeric features
    correlation_matrix = numeric_data.corr()
    target_correlations = correlation_matrix[target_column]

    print(f"\nCorrelation and p-values with target: '{target_column}'\n")
    print(f"{'Feature':<30}{'r-value':>10}{'p-value':>15}{'Included?':>15}")
    print("-" * 70)

    for feature in target_correlations.index:
        if feature != target_column:
            corr, p_value = pearsonr(data[feature], data[target_column])
            include = abs(corr) > corr_threshold and p_value < p_threshold
            status = "Included" if include else "Excluded"
            print(f"{feature:<30}{corr:>10.4f}{p_value:>15.4f}{status:>15}")
            if include:
                selected_features.append(feature)

    # Final output summary
    print("\n✅ Selected Features (based on correlation > "
          f"{corr_threshold} and p-value < {p_threshold}):")
    for feature in selected_features:
        print(f"- {feature}")

    return selected_features, correlation_matrix

selected_features, corr_matrix = perform_feature_selection(df, 'Temperature (C)')


Correlation and p-values with target: 'Temperature (C)'

Feature                          r-value        p-value      Included?
----------------------------------------------------------------------
Summary                           0.1467         0.0000       Included
Precip Type                      -0.5626         0.0000       Included
Apparent Temperature (C)          0.9926         0.0000       Included
Humidity                         -0.6323         0.0000       Included
Wind Speed (km/h)                 0.0090         0.0054       Included
Wind Bearing (degrees)            0.0300         0.0000       Included
Visibility (km)                   0.3928         0.0000       Included
Loud Cover                           nan            nan       Excluded
Pressure (millibars)             -0.0054         0.0907       Excluded
Daily Summary                     0.4389         0.0000       Included

✅ Selected Features (based on correlation > 0.0 and p-value < 0.05):
- Summary
- Precip T

  corr, p_value = pearsonr(data[feature], data[target_column])


In [40]:
# Final features and target
X = df[selected_features]
y = df['Temperature (C)']

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
# Model Training
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [43]:
# Predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [44]:

# Evaluation
print("\n=== Training Performance ===")
print("MAE:", mean_absolute_error(y_train, y_train_pred))
print("MSE:", mean_squared_error(y_train, y_train_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_train, y_train_pred)))
print("R2:", r2_score(y_train, y_train_pred))

print("\n=== Testing Performance ===")
print("MAE:", mean_absolute_error(y_test, y_test_pred))
print("MSE:", mean_squared_error(y_test, y_test_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred)))
print("R2:", r2_score(y_test, y_test_pred))


=== Training Performance ===
MAE: 0.004642964729336224
MSE: 0.0003200780093572581
RMSE: 0.01789072411495013
R2: 0.9999964825953503

=== Testing Performance ===
MAE: 0.012076572840537045
MSE: 0.0018564744822795354
RMSE: 0.04308682492687916
R2: 0.9999798558042174


In [45]:
# Save model using pickle
pickle.dump(model, open('wrf_model.pkl', 'wb'))

# Save model using joblib
joblib.dump(model, 'wrf_model.joblib')

['wrf_model.joblib']

In [46]:
model_pickle = pickle.load(open('wrf_model.pkl', 'rb'))
model_joblib = joblib.load('wrf_model.joblib')

In [None]:
importances = model.feature_importances_
feature_names = X.columns

plt.figure(figsize=(8, 6))
plt.barh(feature_names, importances, color='lightpink')
plt.title("Feature Importances - Gradient Boosting Regressor")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()