In [1]:
!pip install optuna
import optuna
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

url = "/content/Sample_Dataset_15k 1.csv"
df = pd.read_csv(url, header=None, encoding='latin1')
column_names = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'Title', 'combination', 'Rating', 'Age Category', 'Language', 'Subscription']
df.columns = column_names
df['combined_features'] = df['director_name'] + ' ' +  df['actor_1_name'] + ' ' + df['actor_2_name'] + ' ' + df['actor_3_name'] + ' ' + df['genres'] + ' ' +  df['Age Category'].astype(str) + ' ' + df['Language']

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

X = tfidf_matrix
y = df['Subscription']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

def objective_rf(trial):
    n_estimators = trial.suggest_int("n_estimators", 10, 1000)
    max_depth = trial.suggest_int("max_depth", 2, 32)
    min_samples_split = trial.suggest_float("min_samples_split", 0.1, 1.0)
    min_samples_leaf = trial.suggest_float("min_samples_leaf", 0.1, 0.5)
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    return -accuracy

# Create study and run optimization:
study_rf = optuna.create_study()
study_rf.optimize(objective_rf, n_trials=20)
best_params_rf = study_rf.best_params
bayesian_rf_model = RandomForestClassifier(**best_params_rf, random_state=42)
bayesian_rf_model.fit(X_train, y_train)
y_pred_rf = bayesian_rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Subscription Accuracy using Bayesian Search with Random Forest Classifier:", accuracy_rf)

def recommend(movie_title, model, data, vectorizer, cosine_sim):
    idx = data[data['Title'] == movie_title].index[0]
    movie_features = vectorizer.transform([data.iloc[idx]['combined_features']])
    movie_subscription = model.predict(movie_features)[0]
    similar_indices = [i for i, subscription in enumerate(data['Subscription']) if subscription == movie_subscription]
    sim_scores = [(i, cosine_sim[idx, i]) for i in similar_indices]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_indices = [i[0] for i in sim_scores[1:11]]
    return data.iloc[sim_indices]['Title'].tolist()
    
# Recommendation
movie_title = 'jumanji'
recommended_movies = recommend(movie_title, bayesian_rf_model, df, tfidf_vectorizer, cosine_sim)
print(recommended_movies)

Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.2 alembic-1.13.1 colorlog-6.8.2 optuna-3.5.0


[I 2024-03-11 15:26:04,252] A new study created in memory with name: no-name-d337c601-4197-4332-8959-d1cc8589d374
[I 2024-03-11 15:26:05,788] Trial 0 finished with value: -0.332 and parameters: {'n_estimators': 344, 'max_depth': 5, 'min_samples_split': 0.3951617367205691, 'min_samples_leaf': 0.18520089237981585}. Best is trial 0 with value: -0.332.
[I 2024-03-11 15:26:08,808] Trial 1 finished with value: -0.332 and parameters: {'n_estimators': 581, 'max_depth': 2, 'min_samples_split': 0.29971888644704253, 'min_samples_leaf': 0.25643160815235366}. Best is trial 0 with value: -0.332.
[I 2024-03-11 15:26:09,835] Trial 2 finished with value: -0.332 and parameters: {'n_estimators': 145, 'max_depth': 30, 'min_samples_split': 0.1820535894312349, 'min_samples_leaf': 0.1855044449453983}. Best is trial 0 with value: -0.332.
[I 2024-03-11 15:26:10,245] Trial 3 finished with value: -0.332 and parameters: {'n_estimators': 129, 'max_depth': 32, 'min_samples_split': 0.8433342319972341, 'min_samples_l

Subscription Accuracy using Bayesian Search with Random Forest Classifier: 0.332
['spider-man 2', 'october sky', 'wimbledon', 'crazy/beautiful', 'eternal sunshine of the spotless mind', 'honey, i shrunk the kids', 'get over it', 'all i wanna do', 'the night listener', 'the crow: salvation']
