In [None]:
# Colab-only: mount Google Drive at /content/drive so the CSV stored under
# MyDrive can be read by the data-loading cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import Libraries

In [None]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import resample

# Load Data

In [None]:
# Read the colour table from the mounted Drive into a DataFrame.
csv_path = '/content/drive/MyDrive/colors.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,nama_warna,Warna,Hex,R,G,B
0,air_force_blue_raf,Air Force Blue (Raf),#5d8aa8,93,138,168
1,air_force_blue_usaf,Air Force Blue (Usaf),#00308f,0,48,143
2,air_superiority_blue,Air Superiority Blue,#72a0c1,114,160,193
3,alabama_crimson,Alabama Crimson,#a32638,163,38,56
4,alice_blue,Alice Blue,#f0f8ff,240,248,255


# Preprocessing Data

In [None]:
# Separate the model inputs from the label to predict ('Hex').
#
# 'Unnamed: 0' appears to be the row counter that was written into the CSV
# (0, 1, 2, ... in the preview), not a property of the colour. Kept as a
# numeric feature it merely identifies the row, and the fitted pipeline would
# then demand that column at prediction time, so it is dropped here.
# errors='ignore' keeps the cell working if the CSV is re-exported without it.
#
# NOTE(review): 'nama_warna' and 'Warna' are also per-row names; confirm they
# are really meant to be model inputs.
features = (
    df.drop(columns=['Hex'])
      .drop(columns=['Unnamed: 0'], errors='ignore')
)
target = df['Hex']

In [None]:
# Re-join inputs and label so whole rows can be resampled class by class.
df_balanced = pd.concat([features, target], axis=1)
hex_codes = df_balanced['Hex']
classes = hex_codes.unique()

# The "majority" class is the Hex value that occurs most often.
most_common_hex = hex_codes.value_counts().idxmax()
df_majority = df_balanced[hex_codes == most_common_hex]

In [None]:
# Look these up once instead of on every iteration of the comprehensions.
majority_hex = df_majority['Hex'].iloc[0]
n_majority = len(df_majority)

# One frame per non-majority Hex value, in order of first appearance.
df_minority_list = [
    df_balanced[df_balanced['Hex'] == cls]
    for cls in classes
    if cls != majority_hex
]

# Sample each minority class with replacement up to the majority size.
# NOTE(review): this duplicates rows before the train/test split performed
# in a later cell.
df_minority_upsampled_list = [
    resample(df_minority, replace=True, n_samples=n_majority, random_state=42)
    for df_minority in df_minority_list
]

In [None]:
# Stack the majority rows and every upsampled minority frame into one table.
balanced_parts = [df_majority, *df_minority_upsampled_list]
df_balanced_upsampled = pd.concat(balanced_parts)

In [None]:
# Split the balanced table back into the label column and the model inputs.
# Note: this rebinds `features` / `target` from the earlier cell.
target = df_balanced_upsampled['Hex']
features = df_balanced_upsampled.drop(columns='Hex')

In [None]:
# Partition the input columns by dtype; each group gets its own transformer.
# Columns of any other dtype fall into neither group.
numerical_cols = features.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = features.select_dtypes(include='object').columns

In [None]:
# Standardise the numeric columns and one-hot encode the text columns.
# handle_unknown='ignore' makes categories unseen during fit encode as all
# zeros instead of raising at transform time.
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
# NOTE(review): `features` / `target` come from the upsampled table, so copies
# of the same original row can land in both the training and the test set.
# Scores measured on X_test are therefore optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Train Model with Pipeline

In [None]:
# Chain preprocessing with a default-configured k-NN classifier.
# The step names are reused as parameter prefixes ('knn__...') during tuning.
knn_steps = [
    ('preprocessor', preprocessor),
    ('knn', KNeighborsClassifier()),
]
pipeline = Pipeline(knn_steps)

In [None]:
# Fit the preprocessing steps and the k-NN classifier on the training split.
pipeline.fit(X_train, y_train)

# Evaluate Model

In [None]:
# Score the baseline (untuned) pipeline on the held-out split.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# With this many classes, some are never predicted (or never occur in the
# test split), which leaves their precision/recall undefined.
# zero_division=0 reports those entries as 0.0 explicitly instead of emitting
# a repeated undefined-metric warning.
print('Classification Report:')
print(classification_report(y_test, y_pred, zero_division=0))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8209150326797385
Classification Report:
              precision    recall  f1-score   support

        #000       1.00      1.00      1.00         2
     #000080       0.00      0.00      0.00         4
     #00008b       0.20      1.00      0.33         1
     #00009c       1.00      1.00      1.00         1
     #0000cd       1.00      1.00      1.00         1
     #0014a8       1.00      1.00      1.00         2
     #002366       1.00      1.00      1.00         2
     #002387       1.00      1.00      1.00         1
     #002395       1.00      1.00      1.00         2
     #002e63       1.00      1.00      1.00         1
     #002fa7       1.00      1.00      1.00         1
     #00308f       1.00      1.00      1.00         2
     #00416a       1.00      1.00      1.00         2
     #0047ab       1.00      1.00      1.00         2
     #004953       0.00      0.00      0.00         0
     #004b49       0.00      0.00      0.00         3
     #004f98       1.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Hyperparameter Tuning

In [None]:
# Search space for the 'knn' step of the pipeline (keys use the
# '<step>__<param>' convention).
# 'minkowski' is intentionally not listed: with KNeighborsClassifier's default
# p=2 it is the same distance as 'euclidean', so including it only repeats
# a third of the grid without adding any new candidate model.
param_grid = {
    'knn__n_neighbors': np.arange(1, 31),       # k = 1 .. 30
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan'],
}

In [None]:
# Exhaustive search over param_grid, scored by accuracy with 5-fold
# cross-validation on the training split only.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)



In [None]:
# Report the winning configuration and its mean cross-validated accuracy.
print(
    f'Best Parameters: {grid_search.best_params_}',
    f'Best Score: {grid_search.best_score_}',
    sep='\n',
)

Best Parameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
Best Score: 0.9983660130718954


In [None]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# retrained on all of X_train with the best parameters. Fitting it again
# would only repeat that work and produce the same model.
best_pipeline = grid_search.best_estimator_
best_pipeline  # display the fitted pipeline as the cell output

In [None]:
# Score the tuned pipeline on the held-out split.
best_y_pred = best_pipeline.predict(X_test)
best_accuracy = accuracy_score(y_test, best_y_pred)
print(f'Best Accuracy: {best_accuracy}')

# Header and body are separated by a newline, one print call per section.
print('Best Classification Report:', classification_report(y_test, best_y_pred), sep='\n')
print('Best Confusion Matrix:', confusion_matrix(y_test, best_y_pred), sep='\n')

Best Accuracy: 1.0
Best Classification Report:
              precision    recall  f1-score   support

        #000       1.00      1.00      1.00         2
     #000080       1.00      1.00      1.00         4
     #00008b       1.00      1.00      1.00         1
     #00009c       1.00      1.00      1.00         1
     #0000cd       1.00      1.00      1.00         1
     #0014a8       1.00      1.00      1.00         2
     #002366       1.00      1.00      1.00         2
     #002387       1.00      1.00      1.00         1
     #002395       1.00      1.00      1.00         2
     #002e63       1.00      1.00      1.00         1
     #002fa7       1.00      1.00      1.00         1
     #00308f       1.00      1.00      1.00         2
     #00416a       1.00      1.00      1.00         2
     #0047ab       1.00      1.00      1.00         2
     #004b49       1.00      1.00      1.00         3
     #004f98       1.00      1.00      1.00         1
     #00563f       1.00      1.00 

In [46]:
# Save the tuned pipeline (preprocessing + k-NN) to a file.
# joblib is imported in the notebook's import cell with the other libraries.
# The path is relative, so the file is written to the current working
# directory.
joblib.dump(best_pipeline, 'model_knn_colors.joblib')

['model_knn_colors.joblib']