In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load the recipe dataset.
df = pd.read_csv('dataset-ayam.csv')

# Example preprocessing (adjust as needed).
# Missing text is replaced with an empty string *before* the cast:
# astype(str) on its own would turn NaN into the literal token "nan",
# which would then be treated as a real word by any text vectorizer.
for text_column in ('Title', 'Ingredients', 'Steps', 'URL'):
    df[text_column] = df[text_column].fillna('').astype(str)

# A missing 'Loves' value is treated as zero.
df['Loves'] = df['Loves'].fillna(0).astype(int)

# Split the data into training (80%) and testing (20%) sets; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[['Title', 'Ingredients', 'Steps', 'URL']],
    df['Loves'],
    test_size=0.2,
    random_state=42,
)

# The split frames are derived from a slice of df. Take explicit copies so
# that adding a column below writes to independent frames and does not
# raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Combine the text columns into a single space-separated column for both
# the training and the testing data.
for part in (X_train, X_test):
    part['combined_text'] = (
        part['Title'] + ' ' + part['Ingredients'] + ' ' + part['Steps'] + ' ' + part['URL']
    )

# TF-IDF features: the vocabulary and IDF weights are learned from the
# training text only; the test text is then projected into that same
# feature space with transform().
tfidf_vectorizer = TfidfVectorizer()

train_text = X_train['combined_text']
test_text = X_test['combined_text']

X_train_tfidf = tfidf_vectorizer.fit_transform(train_text)
X_test_tfidf = tfidf_vectorizer.transform(test_text)

# Fit a Multinomial Naive Bayes classifier on the training TF-IDF matrix;
# the values in y_train are used as the class labels.
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Predict a label for every row of the testing data.
predictions = model.predict(X_test_tfidf)

# Evaluate against the held-out labels.
accuracy = accuracy_score(y_test, predictions)

# Labels that are never predicted have an undefined precision. With
# zero_division=0 they are reported as 0 (the same value the default
# produces) but without emitting an UndefinedMetricWarning per metric.
report = classification_report(y_test, predictions, zero_division=0)

# Collect the test rows next to their actual and predicted labels, with the
# columns ordered 'Title', 'Ingredients', 'Steps', 'Actual', 'Predicted'.
# The frame keeps the index of X_test; nothing is printed here.
results_df = X_test[['Title', 'Ingredients', 'Steps']].assign(
    Actual=y_test,
    Predicted=predictions,
)

# Example query search using ingredient keywords. The query is vectorized
# with the already-fitted TF-IDF vectorizer and compared against every row
# of the testing TF-IDF matrix.
query_ingredients = "sambal rawit merah"
query_tfidf = tfidf_vectorizer.transform([query_ingredients])

# Shape (1, n_test_rows): one similarity score per test instance.
similarity_scores = cosine_similarity(query_tfidf, X_test_tfidf)

# argsort is ascending, so reverse it and keep the first three positions to
# get the most similar instances, best match first.
ascending_positions = similarity_scores.argsort()[0]
top_3_indices = ascending_positions[::-1][:3]

# Show the top 3 most similar instances (positional lookup into results_df).
print('\nTop 3 most similar instances:')
top_matches = results_df.iloc[top_3_indices]
print(top_matches)



Top 3 most similar instances:
                        Title  \
354            Sup Ayam Rawit   
1352              Ayam Geprek   
535   Ayam Geprek Super Pedas   

                                            Ingredients  \
354   250 gr ayam potong2--800 ml air--3 genggam cab...   
1352  1/4 kg dada ayam--4 Sdm tepung maizena--6 Sdm ...   
535   1/2 kg dada ayam--10 Cabai orange--2 Cabai ker...   

                                                  Steps  Actual  Predicted  
354   Rebus ayam sampai mendidih. Buang air rebusan ...       9          6  
1352  Pertama potong ayam menjadi tiga bagian,marina...       7          6  
535   Cuci ayam sampai bersih lalu di beri air jeruk...       4          6  


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
