In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Domains Column Imputation - Gradient Boosting & KNN
## Loading the dataset

In [None]:
# Read the BGG domains-imputation workbook from its relative path into a DataFrame.
df = pd.read_excel(io='../../../data/BGG_Domains_Imputing_Data_Set.xlsx')

## Data preparation
#### Splitting the data

In [None]:
# Columns used as model inputs when predicting 'Domains'.
features = [
    "Complexity Average", "Rating Average", "Owned Users",
    "Users Rated", "Min Age", "Play Time",
]

# Keep only the rows where the target and every feature are present.
df = df.dropna(subset=['Domains', *features])

# Feature matrix and raw target column.
X, y = df[features], df['Domains']

# Represent each distinct 'Domains' value by its integer category code.
y_encoded = y.astype('category').cat.codes

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

## Model setup
#### Gradient Boosting Classifier

In [None]:
# Gradient boosting model with 100 estimators, seeded so repeated runs match.
gbc = GradientBoostingClassifier(n_estimators=100, random_state=42)

#### K-Nearest Neighbors Classifier

In [None]:
# k-nearest-neighbours model configured with 5 neighbours.
knn = KNeighborsClassifier(
    n_neighbors=5,
)

## Pipeline setup
#### Gradient Boosting Classifier pipeline

In [None]:
# Two-step pipeline: standardise the features, then apply the gradient boosting model.
gbc_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('gbc', gbc),
    ],
)

#### K-Nearest Neighbors Classifier pipeline

In [None]:
# Two-step pipeline: standardise the features, then apply the k-nearest-neighbours model.
knn_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('knn', knn),
    ],
)

## Cross validation
#### Gradient Boosting Classifier cross validation

In [None]:
# 5-fold cross-validation of the gradient boosting pipeline on the training split.
gbc_cv_scores = cross_val_score(
    estimator=gbc_pipeline,
    X=X_train,
    y=y_train,
    cv=5,
)

# Report the per-fold scores followed by their mean, one per line.
print(
    f"Gradient Boosting Classifier cross validation scores: {gbc_cv_scores}",
    f"Gradient Boosting Classifier cross validation mean score: {gbc_cv_scores.mean()}",
    sep="\n",
)

#### K-Nearest Neighbors Classifier cross validation

In [None]:
# 5-fold cross-validation of the k-nearest-neighbours pipeline on the training split.
knn_cv_scores = cross_val_score(
    estimator=knn_pipeline,
    X=X_train,
    y=y_train,
    cv=5,
)

# Report the per-fold scores followed by their mean, one per line.
print(
    f"K-Nearest Neighbors Classifier cross validation scores: {knn_cv_scores}",
    f"K-Nearest Neighbors Classifier cross validation mean score: {knn_cv_scores.mean()}",
    sep="\n",
)

## Ensemble model
#### Voting Classifier

In [None]:
# Combine the two pipelines into a single voting ensemble.
#
# Soft voting averages the class probabilities predicted by each pipeline and
# picks the class with the highest mean probability. Hard (majority) voting is
# not meaningful with exactly two voters: every disagreement is a 1-1 tie, and
# scikit-learn resolves ties by choosing the class that sorts first, so the
# ensemble's answer would depend on label order rather than model confidence.
ensemble_model = VotingClassifier(
    estimators=[
        ('gbc', gbc_pipeline),
        ('knn', knn_pipeline),
    ],
    voting='soft',
)

#### Training the ensemble and the individual models

In [None]:
# Fit the voting ensemble and both standalone pipelines on the training split.
# The standalone pipelines are fitted as well because they are used on their
# own for per-model predictions further down.
ensemble_model.fit(X=X_train, y=y_train)
gbc_pipeline.fit(X=X_train, y=y_train)
knn_pipeline.fit(X=X_train, y=y_train)

## Model evaluation
#### Making predictions

In [None]:
# Predict the held-out test rows with each standalone pipeline...
y_pred_gbc, y_pred_knn = (
    gbc_pipeline.predict(X_test),
    knn_pipeline.predict(X_test),
)
# ...and with the voting ensemble.
y_pred = ensemble_model.predict(X_test)

#### Accuracy scores

In [None]:
# Test-set accuracy of each model, computed first and then reported to two decimals.
gbc_accuracy = accuracy_score(y_test, y_pred_gbc)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
ensemble_accuracy = accuracy_score(y_test, y_pred)

print(f"Gradient Boosting Classifier accuracy: {gbc_accuracy:.2f}")
print(f"K-Nearest Neighbors Classifier accuracy: {knn_accuracy:.2f}")
print(f"Ensemble model accuracy: {ensemble_accuracy:.2f}")