# **Proyek Rekomendasi Warna Berdasarkan Inputan User**
Tujuan proyek ini adalah membuat model yang dapat merekomendasikan warna berdasarkan masukan teks dari pengguna. Jumlah warna yang tersedia adalah 42 karena dataset ini saya susun sendiri dengan bantuan ChatGPT. Dataset tersebut berisi kumpulan teks yang masing-masing diberi label warna dalam bentuk kode heksadesimal (*hex code*).

In [50]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

## 1. Load Dataset

In [139]:
# Read the colour dataset (CSV stored on the mounted Google Drive) into a DataFrame.
dataset_path = "/content/drive/MyDrive/Dataset/color_dataset2.csv"
dataset = pd.read_csv(dataset_path)

In [140]:
# Preview the first five rows of the loaded dataset.
dataset.head(n=5)

Unnamed: 0,text,hex_1,hex_2,hex_3,hex_4,hex_5
0,aku ingin warna yang menenangkan,#87CEEB,#B0E2FF,#C6E2B3,#9BBF96,#5E99D4
1,pilih warna yang membuatku rileks,#98FB98,#C4FFB2,#D5FFCC,#B2F0C3,#7DFD92
2,aku suka warna-warna yang cerah,#FFFF00,#FFFF80,#FFFF33,#FFEB3B,#FFF44F
3,cari warna yang terlihat elegan,#800080,#A85CA0,#B37CB3,#8B4F8B,#673567
4,warna yang memberikan kesan alami,#008000,#00A600,#00CC00,#008C00,#005C00


In [53]:
# Show column names, non-null counts and dtypes to check for missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224 entries, 0 to 223
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    224 non-null    object
 1   hex_1   224 non-null    object
 2   hex_2   224 non-null    object
 3   hex_3   224 non-null    object
 4   hex_4   224 non-null    object
 5   hex_5   224 non-null    object
dtypes: object(6)
memory usage: 10.6+ KB


Berdasarkan keterangan di atas, diketahui bahwa dataset yang digunakan terdiri dari 224 data dan tidak terdapat *missing value*.

## 2. Preprocessing Data

2.1 Mengubah seluruh teks ke dalam bentuk *lowercase*

In [54]:
# Normalise casing: replace the 'text' column with its lower-cased version.
lowered_text = dataset['text'].str.lower()
dataset = dataset.assign(text=lowered_text)
dataset.head()

Unnamed: 0,text,hex_1,hex_2,hex_3,hex_4,hex_5
0,aku ingin warna yang menenangkan,#87CEEB,#B0E2FF,#C6E2B3,#9BBF96,#5E99D4
1,pilih warna yang membuatku rileks,#98FB98,#C4FFB2,#D5FFCC,#B2F0C3,#7DFD92
2,aku suka warna-warna yang cerah,#FFFF00,#FFFF80,#FFFF33,#FFEB3B,#FFF44F
3,cari warna yang terlihat elegan,#800080,#A85CA0,#B37CB3,#8B4F8B,#673567
4,warna yang memberikan kesan alami,#008000,#00A600,#00CC00,#008C00,#005C00


2.2 Menghilangkan stopwords

In [55]:
# NLTK provides the stopword corpus used in the next step.
import nltk
from nltk.corpus import stopwords
# Fetch the 'stopwords' corpus; the call's return value is shown as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [56]:
# Indonesian stopword list, kept as a set for fast membership checks.
stop_word = set(stopwords.words('indonesian'))


def _drop_stopwords(sentence):
    """Return `sentence` with every whitespace-separated stopword removed."""
    kept_tokens = [token for token in sentence.split() if token not in stop_word]
    return ' '.join(kept_tokens)


dataset['text'] = dataset['text'].apply(_drop_stopwords)
dataset.head()

Unnamed: 0,text,hex_1,hex_2,hex_3,hex_4,hex_5
0,warna menenangkan,#87CEEB,#B0E2FF,#C6E2B3,#9BBF96,#5E99D4
1,pilih warna membuatku rileks,#98FB98,#C4FFB2,#D5FFCC,#B2F0C3,#7DFD92
2,suka warna-warna cerah,#FFFF00,#FFFF80,#FFFF33,#FFEB3B,#FFF44F
3,cari warna elegan,#800080,#A85CA0,#B37CB3,#8B4F8B,#673567
4,warna kesan alami,#008000,#00A600,#00CC00,#008C00,#005C00


In [57]:
# Separate the input text (X) from the five hex-colour label columns (y).
hex_columns = ["hex_1", "hex_2", "hex_3", "hex_4", "hex_5"]
X = dataset["text"]
y = dataset[hex_columns]

In [58]:
# One-hot encode the label columns. Because `y` has five columns, every row of
# `labels` carries one active indicator per hex_* column (five per row).
# NOTE(review): the LSTM model further down ends in a softmax trained with
# categorical_crossentropy, which assumes one active class per row — confirm
# this multi-hot target is what is intended.
labels = pd.get_dummies(data=y)

In [59]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, labels, random_state=123, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [60]:
# Report how many samples ended up in the train and test splits.
for split_texts in (X_train, X_test):
    print(split_texts.shape)

(179,)
(45,)


2.3 Membuat tokenisasi

In [61]:
# Characters stripped from the text before tokenising (symbols and punctuation).
# A raw string is used so the backslash stays a literal character: in a normal
# string literal '\]' is an invalid escape sequence and triggers a warning on
# recent Python versions. The resulting characters are unchanged.
filt = r'!"#$%&()*+.,-/:;=?@[\]^_`{|}~ '
# Keep at most 2000 words; words unseen during fitting map to the "<OOV>" token.
tokenizer = Tokenizer(num_words=2000, oov_token="<OOV>", filters=filt)
# Build the vocabulary from the training texts only.
tokenizer.fit_on_texts(X_train)

In [62]:
# Word -> integer id mapping learned by the tokenizer, and its size.
word_index = tokenizer.word_index
vocabulary_size = len(word_index)
print(vocabulary_size)

43


2.4 Membuat sequences dan melakukan padding

In [63]:
# Turn each text into a list of word ids using the fitted tokenizer.
train_sekuens = tokenizer.texts_to_sequences(X_train)
test_sekuens = tokenizer.texts_to_sequences(X_test)

# Both splits share the same padding settings: fixed length 20, with padding
# and truncation applied at the end of each sequence.
pad_options = dict(maxlen=20, padding='post', truncating='post')

train_padded = pad_sequences(train_sekuens, **pad_options)
test_padded = pad_sequences(test_sekuens, **pad_options)

In [64]:
# Shape of the padded training sequences.
train_padded.shape

(179, 20)

In [65]:
# Shape of the padded test sequences.
test_padded.shape

(45, 20)

## 3. Implementasi Model Dengan Arsitektur LSTM

In [89]:
# Build the network: embedding -> LSTM -> dense -> dropout -> output layer.
# The embedding vocabulary size (2000) and input length (20) mirror the
# tokenizer's num_words and the padding maxlen used earlier in the notebook.
model = Sequential([
    Embedding(2000, 100, input_length=20),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation="relu"),
    Dropout(0.5),
    # One output unit per column of the encoded label frame.
    Dense(labels.shape[1], activation="softmax"),
])

model.summary()



Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 20, 100)           200000    
                                                                 
 lstm_6 (LSTM)               (None, 64)                42240     
                                                                 
 dense_11 (Dense)            (None, 64)                4160      
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                                 
 dense_12 (Dense)            (None, 495)               32175     
                                                                 
Total params: 278,575
Trainable params: 278,575
Non-trainable params: 0
_________________________________________________________________


In [90]:
# Configure training: Adam optimiser, categorical cross-entropy loss, accuracy metric.
# NOTE(review): the targets come from get_dummies over five label columns, so
# each row has several active indicators — confirm that this loss/metric pair
# measures what is intended.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

3.1 Latih model

In [91]:
class myCallback(tf.keras.callbacks.Callback):
  """Stop training once training and validation accuracy both exceed 90%."""

  def on_epoch_end(self, epoch, logs=None):
    """Check the epoch metrics and request a stop when both are above 0.90.

    Args:
      epoch: Index of the epoch that just finished (unused).
      logs: Metric dictionary for the epoch; may be None.
    """
    # Avoid a shared mutable default argument.
    logs = logs or {}
    train_accuracy = logs.get('accuracy')
    val_accuracy = logs.get('val_accuracy')
    # Either metric can be absent (e.g. training without validation data);
    # comparing None with a float would raise TypeError, so skip the check.
    if train_accuracy is None or val_accuracy is None:
      return
    if val_accuracy > 0.90 and train_accuracy > 0.90:
      print("\nPELATIHAN BERHENTI, AKURASI MODEL SUDAH LEBIH DARI 90%!")
      self.model.stop_training = True

callbacks = myCallback()

In [94]:
# Train the model for up to 500 epochs; the callback may stop training earlier.
# The test split is passed as validation data, so the stopping criterion is
# evaluated on the same data that serves as the test set.
model.fit(
    x=train_padded,
    y=y_train,
    validation_data=(test_padded, y_test),
    epochs=500,
    callbacks=[callbacks],
    verbose=2,
)

Epoch 1/500
6/6 - 1s - loss: 4053.9326 - accuracy: 0.0503 - val_loss: 15429.5156 - val_accuracy: 0.1333 - 509ms/epoch - 85ms/step
Epoch 2/500
6/6 - 1s - loss: 3867.4231 - accuracy: 0.0112 - val_loss: 15507.6191 - val_accuracy: 0.0000e+00 - 575ms/epoch - 96ms/step
Epoch 3/500
6/6 - 1s - loss: 3930.9180 - accuracy: 0.0335 - val_loss: 15590.4600 - val_accuracy: 0.1333 - 546ms/epoch - 91ms/step
Epoch 4/500
6/6 - 1s - loss: 3988.8491 - accuracy: 0.0223 - val_loss: 15677.4053 - val_accuracy: 0.1333 - 693ms/epoch - 115ms/step
Epoch 5/500
6/6 - 1s - loss: 3959.8108 - accuracy: 0.0279 - val_loss: 15760.1914 - val_accuracy: 0.0667 - 538ms/epoch - 90ms/step
Epoch 6/500
6/6 - 1s - loss: 3991.8677 - accuracy: 0.0279 - val_loss: 15845.4697 - val_accuracy: 0.1333 - 1s/epoch - 184ms/step
Epoch 7/500
6/6 - 1s - loss: 4109.3013 - accuracy: 0.0168 - val_loss: 15931.1777 - val_accuracy: 0.0889 - 1s/epoch - 198ms/step
Epoch 8/500
6/6 - 1s - loss: 4146.9844 - accuracy: 0.0391 - val_loss: 16014.7393 - val_ac

<keras.callbacks.History at 0x7ff555d4ca60>

In [115]:
# Extract TF-IDF features from the preprocessed text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# Vectoriser settings: min_df=5 drops terms found in fewer than 5 documents,
# max_df=0.8 drops terms found in more than 80% of documents; term frequency
# is sub-linearly scaled and IDF weighting is enabled.
tfidf_options = dict(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True)
vectorizer = TfidfVectorizer(**tfidf_options)

# NOTE(review): both matrices are built from the full 'text' column, so
# `test_vectors` contains the same documents as `train_vectors` rather than a
# held-out split — confirm whether that is intended.
corpus = dataset['text']
train_vectors = vectorizer.fit_transform(corpus)
test_vectors = vectorizer.transform(corpus)

In [119]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC

# Fit one linear-kernel SVM per hex_* label column via MultiOutputClassifier.
hex_targets = dataset[["hex_1","hex_2","hex_3","hex_4","hex_5"]]
base_svm = SVC(kernel='linear')
classifier = MultiOutputClassifier(base_svm)
classifier.fit(train_vectors, hex_targets)

# Predict a five-colour palette for every row of `test_vectors`.
predictions = classifier.predict(test_vectors)

In [124]:
# Example user request; an alternative input to try: "kasih aku warna elegan"
teks = "tolong kasih aku warna yang bikin tenang"

# Vectorise the request with the fitted TF-IDF vectoriser, then predict its palette.
teks_vector = vectorizer.transform([teks])
predicted_palette = classifier.predict(teks_vector)
print(predicted_palette)

[['#FFD700' '#FFDF00' '#FFE166' '#FFE699' '#FFEBCC']]


## Save Model

In [131]:
import joblib

# Persist the fitted multi-output classifier to disk (path relative to the
# current working directory).
model_filename = 'recommendation_color_model.pkl'
joblib.dump(classifier, model_filename)

['recommendation_color_model.pkl']

In [138]:
# Load the classifier back from disk. The relative file name matches the one
# passed to joblib.dump in the save step, so the load succeeds regardless of
# the working directory (the previous absolute '/content/...' path only
# matched when the notebook ran from /content).
loaded_classifier = joblib.load('recommendation_color_model.pkl')

# Run predictions with the reloaded model and display them.
predictions = loaded_classifier.predict(test_vectors)
predictions

array([['#FFD700', '#FFDF00', '#FFE166', '#FFE699', '#FFEBCC'],
       ['#87CEEB', '#8DCFF1', '#93D0F5', '#99D2FA', '#9FD3FE'],
       ['#FFFF00', '#FFFF33', '#FFFF66', '#FFFF99', '#FFFFCC'],
       ...,
       ['#00FF00', '#32CD32', '#3CB371', '#2E8B57', '#228B22'],
       ['#FFFF00', '#FFFF33', '#FFFF66', '#FFFF99', '#FFFFCC'],
       ['#800080', '#960096', '#AC00AC', '#C200C2', '#9370DB']],
      dtype=object)