In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, r2_score, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier

In [32]:
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")


In [33]:
data = pd.read_csv('weatherHistory.csv')

In [34]:
data.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


In [35]:
# Removing the unneeded rows
data = data.drop(['Summary', 'Daily Summary', 'Formatted Date', 'Apparent Temperature (C)', 'Loud Cover'], axis=1)

In [36]:
# First as there are some null values we are adding the null values with the temperature condition and also we are labeling them for binary classification
data['Precip Type'] = data['Precip Type'].fillna(
    data['Temperature (C)'].apply(lambda x: 'snow' if x < 0 else 'rain')
)
data['Precip Type'] = data['Precip Type'].apply(lambda x: 0 if x == 'snow' else 1)
data.head()

Unnamed: 0,Precip Type,Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars)
0,1,9.472222,0.89,14.1197,251.0,15.8263,1015.13
1,1,9.355556,0.86,14.2646,259.0,15.8263,1015.63
2,1,9.377778,0.89,3.9284,204.0,14.9569,1015.94
3,1,8.288889,0.83,14.1036,269.0,15.8263,1016.41
4,1,8.755556,0.83,11.0446,259.0,15.8263,1016.51


In [None]:
# Select top 3 features using SelectKBest and f_classification
selector = SelectKBest(score_func=f_classif, k=5)
X_new = selector.fit_transform(X, Y)

# Get the top 3 feature columns
selected_features = X.columns[selector.get_support()].tolist()

In [37]:

X = data[selected_features]
Y = data["Precip Type"]

In [38]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [59]:
models = [
    ("knn", KNeighborsClassifier(n_neighbors=3)), 
    ("svc", make_pipeline(StandardScaler(), SVC(kernel="rbf", random_state=42)))
]
stack = StackingClassifier(estimators=models, final_estimator=LogisticRegression())

In [40]:
stack.fit(X_train, Y_train)

StackingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=3)),
                               ('svc',
                                Pipeline(steps=[('standardscaler',
                                                 StandardScaler()),
                                                ('svc', SVC())]))],
                   final_estimator=LogisticRegression())

In [21]:
pred = stack.predict(X_test)
r2_score(pred, Y_test)

0.9318370665552044

In [42]:
models_dict = {name: model for name, model in models}
models_dict["Logistic Regression"] = LogisticRegression(max_iter=1000)
models_dict["Stacking"] = stack
models_dict

{'knn': KNeighborsClassifier(n_neighbors=3),
 'svc': Pipeline(steps=[('standardscaler', StandardScaler()), ('svc', SVC())]),
 'Logistic Regression': LogisticRegression(max_iter=1000),
 'Stacking': StackingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=3)),
                                ('svc',
                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler()),
                                                 ('svc', SVC())]))],
                    final_estimator=LogisticRegression())}

In [48]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [49]:
results = {}
for name, model in models_dict.items():
    scores = cross_val_score(model, X, Y, cv=cv, scoring="accuracy")
    results[name] = {
        "mean_accuracy": np.mean(scores),
        "std_accuracy": np.std(scores)
    }
    print(f"{name}: Mean Accuracy = {np.mean(scores):.3f}, Std = {np.std(scores):.3f}")

# Determine the best model
best_model = max(results, key=lambda x: results[x]["mean_accuracy"])
print(f"\nThe best model is {best_model} with a mean accuracy of {results[best_model]['mean_accuracy']:.3f}")

knn: Mean Accuracy = 0.974, Std = 0.001
svc: Mean Accuracy = 0.993, Std = 0.000
Logistic Regression: Mean Accuracy = 0.999, Std = 0.000
Stacking: Mean Accuracy = 0.993, Std = 0.000

The best model is Logistic Regression with a mean accuracy of 0.999


In [51]:
smote = SMOTE(sampling_strategy="minority", random_state=42)
X_sample, Y_sample = smote.fit_resample(X, Y)

In [58]:
results = {}
for name, model in models_dict.items():
    scores = cross_val_score(model, X_sample, Y_sample, cv=cv, scoring="accuracy")
    results[name] = {
        "mean_accuracy": np.mean(scores),
        "std_accuracy": np.std(scores)
    }
    print(f"{name}: Mean Accuracy = {np.mean(scores):.3f}, Std = {np.std(scores):.3f}")

# Determine the best model
best_model = max(results, key=lambda x: results[x]["mean_accuracy"])
print(f"\nThe best model is {best_model} with a mean accuracy of {results[best_model]['mean_accuracy']:.3f}")

knn: Mean Accuracy = 0.985, Std = 0.001
svc: Mean Accuracy = 0.993, Std = 0.000
Logistic Regression: Mean Accuracy = 0.998, Std = 0.000
Stacking: Mean Accuracy = 0.997, Std = 0.000

The best model is Logistic Regression with a mean accuracy of 0.998
