In [51]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import load_dataset, Features, Value
from sklearn.metrics import accuracy_score,mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

In [52]:
# Load the "MissTiny/WikiArt" dataset through `datasets.load_dataset`.
# The cache directory is configurable via the WIKIART_CACHE_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the path this notebook has always used is kept as the default so an
# existing local cache continues to be picked up.
WIKIART_CACHE_DIR = os.environ.get("WIKIART_CACHE_DIR", r"C:\Users\KL\Desktop\ML")
cleaned_dataset = load_dataset("MissTiny/WikiArt", cache_dir=WIKIART_CACHE_DIR)

In [53]:
# Display the loaded DatasetDict: its splits, their column names and row counts.
cleaned_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'artist', 'date', 'genre', 'style', 'description', 'filename', 'image', 'embeddings_pca512', 'image_numpy', 'CLIPVisionModelWithProjection_image_embeds'],
        num_rows: 57529
    })
    test: Dataset({
        features: ['title', 'artist', 'date', 'genre', 'style', 'description', 'filename', 'image', 'embeddings_pca512', 'image_numpy', 'CLIPVisionModelWithProjection_image_embeds'],
        num_rows: 19177
    })
})

# LightGBM

In [54]:
# Work only with the training split, and keep just the two columns this
# section uses: the 'embeddings_pca512' features and the 'date' target.
train_split = cleaned_dataset['train']
small_dataset = train_split.select_columns(['embeddings_pca512', 'date'])
print(small_dataset)

Dataset({
    features: ['embeddings_pca512', 'date'],
    num_rows: 57529
})


### Select Years 1800-2000

In [55]:
def _is_year_in_range(example):
    """Return True when the row's 'date' is an all-digit year within 1800-2000 (inclusive)."""
    date_text = example['date']
    # Dates that are not purely digits are dropped rather than parsed.
    if not date_text.isdigit():
        return False
    return 1800 <= int(date_text) <= 2000


filtered_small_dataset = small_dataset.filter(_is_year_in_range)

print(filtered_small_dataset)

Dataset({
    features: ['embeddings_pca512', 'date'],
    num_rows: 48091
})


### Encode years

In [56]:
# Parse the date strings into integer years.
years = np.array(filtered_small_dataset['date']).astype(int)
# Anchor the bins at the earliest year rounded down to the start of its decade.
min_year_encoded = years.min() // 10 * 10
# Label each year with the first year of its 20-year bin, counted from the anchor.
# NOTE(review): with the inclusive 1800-2000 filter, the year 2000 ends up in a
# bin of its own — confirm that a single-year class is intended.
years_encoded = min_year_encoded + (years - min_year_encoded) // 20 * 20

In [57]:
# Distinct bin labels (sorted by np.unique) and how many of them there are;
# `num_classes` is later handed to LightGBM as the number of target classes.
unique_classes = np.unique(years_encoded)
num_classes = unique_classes.shape[0]

print(f"Unique Classes: {unique_classes}")
print(f"Number of Classes: {num_classes}")

Unique Classes: [1800 1820 1840 1860 1880 1900 1920 1940 1960 1980 2000]
Number of Classes: 11


In [58]:
# Map each bin label (e.g. 1800, 1820, ...) to a consecutive integer class id.
label_encoder = LabelEncoder()
y_class = label_encoder.fit_transform(years_encoded)

# Show a few encoded targets and the bin-label -> class-id mapping for reference.
print(f"Encoded Labels: {y_class[:5]} ...")
print(f"Mapping: {({bin_label: class_id for class_id, bin_label in enumerate(label_encoder.classes_)})}")


Encoded Labels: [5 5 6 3 8] ...
Mapping: {np.int64(1800): 0, np.int64(1820): 1, np.int64(1840): 2, np.int64(1860): 3, np.int64(1880): 4, np.int64(1900): 5, np.int64(1920): 6, np.int64(1940): 7, np.int64(1960): 8, np.int64(1980): 9, np.int64(2000): 10}


### Split data

In [59]:
# Stack the stored embeddings row-wise into a single 2-D feature matrix
# (presumably one 512-dimensional vector per painting — TODO confirm), and use
# the integer class ids as targets.
embeddings = np.array(filtered_small_dataset['embeddings_pca512'])
X, y = np.vstack(embeddings), y_class

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

### Create LightGBM datasets

In [60]:
import lightgbm as lgb

# Wrap both splits in LightGBM's native Dataset container. The validation set
# is created with `reference=train_dataset` so it is tied to the training Dataset.
train_dataset = lgb.Dataset(data=X_train, label=y_train)
val_dataset = lgb.Dataset(
    data=X_val,
    label=y_val,
    reference=train_dataset,
)

### Use Random Search to Find the Best Model Parameters (Optional; Slow to Run)

In [None]:
# Candidate values sampled by the randomized hyperparameter search.
param_grid = dict(
    # Ensemble size and tree shape
    n_estimators=[100, 300, 1000],
    num_leaves=[15, 31, 63],
    max_depth=[8, 12, 16],
    # Step size
    learning_rate=[0.001, 0.002, 0.005],
    # Leaf-size constraint and L1/L2 regularization
    min_data_in_leaf=[100, 200, 500],
    lambda_l1=[0.5, 1.0, 2.0],
    lambda_l2=[0.5, 1.0, 2.0],
    # Feature and row subsampling
    feature_fraction=[0.6, 0.75, 0.9],
    bagging_fraction=[0.6, 0.8, 1.0],
    bagging_freq=[5, 10, 15],
)

In [38]:
# Base estimator for the hyperparameter search: a scikit-learn style LightGBM
# classifier using the one-vs-all multiclass objective, scored with AUC-mu.
classifier_settings = {
    'objective': 'multiclassova',
    'num_class': num_classes,
    'metric': 'auc_mu',
    'is_unbalance': True,
    'boosting_type': 'gbdt',
    'verbose': -1,  # silence LightGBM's own logging
}
lgb_clf = lgb.LGBMClassifier(**classifier_settings)

In [39]:
# Randomized search over `param_grid`: 10 sampled configurations, each
# evaluated with 3-fold cross-validation and ranked by one-vs-rest ROC AUC.
random_search = RandomizedSearchCV(
    lgb_clf,
    param_grid,
    n_iter=10,
    cv=3,
    scoring='roc_auc_ovr',
    random_state=11,  # fixed seed so the sampled configurations are repeatable
    n_jobs=-1,        # use all available cores
    verbose=1,
)

In [None]:
# Run the search. The extra keyword arguments are handed to the classifier's
# fit call: the held-out validation split is the evaluation set, training
# stops early after 100 rounds without improvement, and every round is logged.
search_callbacks = [
    lgb.early_stopping(stopping_rounds=100),
    lgb.log_evaluation(1),
]
random_search.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=search_callbacks,
)

# Report the winning configuration.
print("Best parameters found: ", random_search.best_params_)

### Train the LightGBM Model

In [94]:
# Hyperparameters for the native `lgb.train` run below.
params = dict(
    objective='multiclass',
    num_class=num_classes,
    metric='multi_logloss',
    # NOTE(review): LightGBM documents `is_unbalance` as used only by the
    # binary and multiclassova objectives, so it probably has no effect with
    # 'multiclass' — confirm, and consider per-sample weights if class
    # imbalance needs handling here.
    is_unbalance=True,
    boosting_type='goss',
    learning_rate=0.002,
    num_leaves=15,
    max_depth=-1,
)
# Options left disabled for this run (kept for reference):
#   'lambda_l1': 1.0, 'lambda_l2': 1.0, 'min_gain_to_split': 0.5,
#   'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 5,
#   'scale_pos_weight': 1, 'verbose': -1

In [95]:
# Train for at most 10,000 boosting rounds, reporting metrics on both the
# training and validation sets every 100 rounds and stopping early once the
# validation score has not improved for 500 rounds.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=500),
    lgb.log_evaluation(100),
]
gbm = lgb.train(
    params=params,
    train_set=train_dataset,
    num_boost_round=10000,
    valid_sets=[train_dataset, val_dataset],
    valid_names=['train', 'valid'],
    callbacks=training_callbacks,
)

Training until validation scores don't improve for 500 rounds
[100]	train's multi_logloss: 2.00897	valid's multi_logloss: 2.02973
[200]	train's multi_logloss: 1.9482	valid's multi_logloss: 1.98426
[300]	train's multi_logloss: 1.90123	valid's multi_logloss: 1.9506
[400]	train's multi_logloss: 1.86292	valid's multi_logloss: 1.92501
[500]	train's multi_logloss: 1.83005	valid's multi_logloss: 1.90438
[600]	train's multi_logloss: 1.80311	valid's multi_logloss: 1.8922
[700]	train's multi_logloss: 1.77827	valid's multi_logloss: 1.88203
[800]	train's multi_logloss: 1.75482	valid's multi_logloss: 1.87308
[900]	train's multi_logloss: 1.7329	valid's multi_logloss: 1.8654
[1000]	train's multi_logloss: 1.7122	valid's multi_logloss: 1.85852
[1100]	train's multi_logloss: 1.69251	valid's multi_logloss: 1.85232
[1200]	train's multi_logloss: 1.67372	valid's multi_logloss: 1.84661
[1300]	train's multi_logloss: 1.65553	valid's multi_logloss: 1.84133
[1400]	train's multi_logloss: 1.638	valid's multi_loglos

### Model Evaluation

In [96]:
# Score the trained booster on the held-out validation split.
y_pred = gbm.predict(X_val)
# The prediction has one column per class; take the highest-scoring column
# as the predicted class id.
y_pred_classes = y_pred.argmax(axis=1)

accuracy = accuracy_score(y_true=y_val, y_pred=y_pred_classes)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.3562


In [None]:
# Persist the trained booster to a file in the current working directory.
MODEL_PATH = "lightgbm_model.txt"
gbm.save_model(MODEL_PATH)

<lightgbm.basic.Booster at 0x1ce777d4430>