In [1]:
# BASIC LIBRARIES
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# PREPROCESSING
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from imblearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif

# MODELING
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint, uniform

# LOAD TRAINING DATASET
train_url = "https://www.dropbox.com/scl/fi/4uwpw2pldan0xxs2154fr/train.csv?rlkey=rwcpickd2vrls56abenpr1it4&st=4k2oj5x6&dl=0&raw=1"
df_train = pd.read_csv(train_url, index_col='DateID')

# SET FEATURE MATRIX & TARGET VARIABLE
target_col = 'OutlookTomorrow'
X_train, y_train = df_train.drop(columns=target_col), df_train[target_col]

# Convert Categorical Target Variable into Label (0,1,2,...)
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)

numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer([
    ('std_scal', StandardScaler(), numerical_cols),
], remainder='passthrough')

# FEATURE SELECTION
feature_selector = SelectKBest(score_func=f_classif, k=10)  # Select top 10 features

# SVM PIPELINE
svm_pipeline = make_pipeline(
    preprocessor,
    feature_selector,
    SVC()
)

svm_param_dist = {
    'svc__C': uniform(0.01, 1),
    'svc__gamma': uniform(0.01, 1),
    'svc__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

cv = StratifiedKFold(n_splits=5)
svm_random_search = RandomizedSearchCV(
    estimator=svm_pipeline,
    param_distributions=svm_param_dist,
    n_iter=200,
    cv=cv,
    verbose=2,
    n_jobs=-1,
    scoring='f1_macro'
)
svm_random_search.fit(X_train, y_train)

print("Best SVM Parameters:")
print(pd.Series(svm_random_search.best_params_))
print("\nBest SVM F1-score Accuracy:")
print(f"{svm_random_search.best_score_:.4f}")

# Random Forest PIPELINE
rf_pipeline = make_pipeline(
    preprocessor,
    feature_selector,
    RandomForestClassifier(random_state=42)
)

rf_param_dist = {
    'randomforestclassifier__n_estimators': randint(1, 200),
    'randomforestclassifier__max_features': ['sqrt', 'log2'],
    'randomforestclassifier__max_depth': randint(1, 100),
    'randomforestclassifier__min_samples_split': randint(2, 10),
    'randomforestclassifier__min_samples_leaf': randint(1, 10),
    'randomforestclassifier__bootstrap': [True, False]
}

rf_random_search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=rf_param_dist,
    n_iter=300,
    cv=cv,
    verbose=2,
    n_jobs=-1,
    scoring='f1_macro'
)
rf_random_search.fit(X_train, y_train)

print("Best Random Forest Parameters:")
print(pd.Series(rf_random_search.best_params_))
print("\nBest Random Forest F1-score Accuracy:")
print(f"{rf_random_search.best_score_:.4f}")

# SAVE & FIT BEST MODEL
best_model = svm_random_search.best_estimator_ if svm_random_search.best_score_ > rf_random_search.best_score_ else rf_random_search.best_estimator_
best_model.fit(X_train, y_train)

# LOAD THE TEST DATASET
test_url = "https://www.dropbox.com/scl/fi/yqihhgvrzwmr4p00ygwlg/test.csv?rlkey=9brcpbfnf4fr41q0seah1z3fr&st=moq8i5m7&dl=0&raw=1"
df_test = pd.read_csv(test_url, index_col='DateID')

X_test = df_test

y_pred = best_model.predict(X_test)
y_pred = encoder.inverse_transform(y_pred)

# CONVERT PREDICTION INTO DATAFRAME
submission = pd.DataFrame(y_pred, index=df_test.index, columns=[target_col])

# WRITE A CSV FILE FOR SUBMISSION
submission.to_csv('submission.csv')

submission

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV] END svc__C=0.10624860487496122, svc__gamma=0.1254391848366022, svc__kernel=sigmoid; total time=   0.1s
[CV] END svc__C=0.7167895042213502, svc__gamma=0.9430448122050116, svc__kernel=rbf; total time=   0.1s
[CV] END svc__C=0.7167895042213502, svc__gamma=0.9430448122050116, svc__kernel=rbf; total time=   0.1s
[CV] END svc__C=0.7409700552498348, svc__gamma=0.5183678812809113, svc__kernel=sigmoid; total time=   0.1s
[CV] END svc__C=0.1927965007869753, svc__gamma=0.6924039943409566, svc__kernel=poly; total time=   0.1s
[CV] END svc__C=0.26029893697028283, svc__gamma=0.18986492712379877, svc__kernel=sigmoid; total time=   0.1s
[CV] END svc__C=0.26029893697028283, svc__gamma=0.18986492712379877, svc__kernel=sigmoid; total time=   0.1s
[CV] END svc__C=0.9121026235355211, svc__gamma=0.5510029411476512, svc__kernel=sigmoid; total time=   0.1s
[CV] END svc__C=0.3810133370338141, svc__gamma=0.6870435887948247, svc__kernel=linear;

Unnamed: 0_level_0,OutlookTomorrow
DateID,Unnamed: 1_level_1
fe97c2d8,Partly Cloudy
be33fd88,Partly Cloudy
824bae96,Clear
6589fc6c,Frigid and Windy
5d837fb0,Hot
...,...
c35ee3f9,Mostly Cloudy
3d1d573a,Mostly Cloudy
338db194,Windy
94221b87,Frosty
