In [2]:
!pip install optuna
import optuna
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

url ="/content/Sample_Dataset_15k 1.csv"
df = pd.read_csv(url, header=None, encoding='latin1')
df.drop(df.columns[-1], axis=1, inplace=True)
column_names = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'Title', 'combination', 'Rating', 'Age Category', 'Language']
df.columns = column_names
df['combined_features'] = df['director_name'] + ' ' +  df['actor_1_name'] + ' ' + df['actor_2_name'] + ' ' + df['actor_3_name'] + ' ' + df['genres'] + ' ' + df['Language'] + ' ' + df['Age Category'].astype(str)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

X = tfidf_matrix
y = df['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

def objective_rf(trial):
    n_estimators = trial.suggest_int("n_estimators", 10, 1000)
    max_depth = trial.suggest_int("max_depth", 2, 32)
    min_samples_split = trial.suggest_float("min_samples_split", 0.1, 1.0)
    min_samples_leaf = trial.suggest_float("min_samples_leaf", 0.1, 0.5)
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    return -accuracy

study_rf = optuna.create_study()
study_rf.optimize(objective_rf, n_trials=15)
best_params_rf = study_rf.best_params
bayesian_rf_model = RandomForestClassifier(**best_params_rf, random_state=42)
bayesian_rf_model.fit(X_train, y_train)
y_pred_rf = bayesian_rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Rating Accuracy using Bayesian Search with Random Forest Classifier:", accuracy_rf)

def recommend(movie_title, model, data, vectorizer, cosine_sim):
    idx = data[data['Title'] == movie_title].index[0]
    movie_features = vectorizer.transform([data.iloc[idx]['combined_features']])
    movie_rating = model.predict(movie_features)[0]
    similar_indices = [i for i, rating in enumerate(data['Rating']) if abs(rating - movie_rating) < 0.5]
    sim_scores = [(i, cosine_sim[idx, i]) for i in similar_indices]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_indices = [i[0] for i in sim_scores[1:11]]
    return data.iloc[sim_indices]['Title'].tolist()
    
#Recomendation
movie_title = 'jumanji'
recommended_movies = recommend(movie_title, bayesian_rf_model, df, tfidf_vectorizer, cosine_sim)
print(recommended_movies)



[I 2024-03-11 15:14:09,644] A new study created in memory with name: no-name-b54940b8-aa54-40ee-8161-674562c07f45
[I 2024-03-11 15:14:13,612] Trial 0 finished with value: -0.6506666666666666 and parameters: {'n_estimators': 838, 'max_depth': 4, 'min_samples_split': 0.37343058114824, 'min_samples_leaf': 0.2897041476692438}. Best is trial 0 with value: -0.6506666666666666.
[I 2024-03-11 15:14:14,888] Trial 1 finished with value: -0.6506666666666666 and parameters: {'n_estimators': 276, 'max_depth': 31, 'min_samples_split': 0.3163372668776899, 'min_samples_leaf': 0.43923853492002796}. Best is trial 0 with value: -0.6506666666666666.
[I 2024-03-11 15:14:18,570] Trial 2 finished with value: -0.6506666666666666 and parameters: {'n_estimators': 831, 'max_depth': 13, 'min_samples_split': 0.38817714264238445, 'min_samples_leaf': 0.2921564167005078}. Best is trial 0 with value: -0.6506666666666666.
[I 2024-03-11 15:14:21,257] Trial 3 finished with value: -0.6506666666666666 and parameters: {'n_e

Rating Accuracy using Bayesian Search with Random Forest Classifier: 0.6506666666666666
['small soldiers', 'spider-man 2', 'spider-man 3', 'wimbledon', 'crazy/beautiful', 'elizabethtown', "the cat's meow", 'eternal sunshine of the spotless mind', 'homicide', 'jurassic park iii']
