In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity

# ---------------------------------------------------------------------------
# Load the movie metadata and build the text features shared by the
# subscription classifier and the content-based recommender.
# ---------------------------------------------------------------------------
url = "/content/Sample_Dataset_15k 1.csv"
df = pd.read_csv(url, header=None, encoding='latin1')
# The file is read without a header row; its last column is discarded before
# the remaining ones are named.
df = df.drop(columns=df.columns[-1])
column_names = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'Title', 'combination', 'Rating', 'Age Category', 'Language']
df.columns = column_names

# NOTE: the 'Subscription' label is synthetic -- it is drawn uniformly at
# random and is independent of every feature, so any classifier trained on it
# is expected to score about 1/3 accuracy. A seeded generator keeps the labels
# (and therefore every downstream number) identical across kernel restarts.
rng = np.random.default_rng(42)
df['Subscription'] = rng.choice(['Basic', 'Standard', 'Premium'], size=len(df))

# Join the descriptive columns into one whitespace-separated document per row.
# Missing values are replaced by empty strings first: with plain `+`
# concatenation a single NaN would turn the whole document into NaN, which
# TfidfVectorizer refuses to process.
feature_columns = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'Language', 'Age Category']
feature_text = df[feature_columns].fillna('').astype(str)
df['combined_features'] = feature_text[feature_columns[0]].str.cat(feature_text[feature_columns[1:]], sep=' ')

# TF-IDF representation of every movie plus the full pairwise similarity
# matrix (row/column i corresponds to row position i of df).
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Hold out 25% of the rows for evaluating the subscription classifier.
X = df.drop(['Title', 'combination', 'Rating', 'Subscription'], axis=1)
y = df['Subscription']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# The vectorizer was fitted on all rows above (the recommender needs the full
# vocabulary), so the test documents contributed to the vocabulary/IDF weights.
X_train_text = tfidf_vectorizer.transform(X_train['combined_features'])
X_test_text = tfidf_vectorizer.transform(X_test['combined_features'])
# Search space for the random forest. 'auto' was removed from `max_features`
# in scikit-learn 1.3 (for classifiers it was only an alias of 'sqrt'), so the
# space uses the two string options that are still accepted.
param_dist = {
    'n_estimators': [int(x) for x in np.linspace(start=200, stop=2000, num=10)],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [int(x) for x in np.linspace(10, 110, num=11)] + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)
# Randomised search: 10 sampled configurations, each scored with 5-fold CV.
random_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_dist, n_iter=10, cv=5, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(X_train_text, y_train)
best_params = random_search.best_params_
best_score = random_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

# With the default refit=True the search has already retrained the winning
# configuration (same random_state) on the whole training set, so reuse that
# estimator instead of paying for an identical second fit.
best_rf_classifier = random_search.best_estimator_

# Evaluate on the held-out split.
y_pred = best_rf_classifier.predict(X_test_text)
accuracy = accuracy_score(y_test, y_pred)
print("Subscription Accuracy using RandomizedSearchCV:", accuracy)

def recommend(movie_title, model, data, vectorizer, cosine_sim, top_n=10):
    """Return the titles most similar to ``movie_title`` within its predicted tier.

    The subscription tier of the query movie is predicted by ``model`` from the
    movie's ``combined_features`` text. Every other row whose stored
    ``Subscription`` equals that prediction is then ranked by its similarity
    to the query movie, highest first.

    Parameters
    ----------
    movie_title : str
        Title to look up; compared for exact equality against ``data['Title']``.
        If several rows share the title, the first one is used.
    model : object
        Fitted estimator exposing ``predict``.
    data : pandas.DataFrame
        Must contain the columns ``'Title'``, ``'combined_features'`` and
        ``'Subscription'``.
    vectorizer : object
        Fitted transformer exposing ``transform`` that accepts a list of strings.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix indexed as ``cosine_sim[i, j]``; assumed to
        be aligned with the row *positions* of ``data``.
    top_n : int, optional
        Maximum number of titles to return (default 10).

    Returns
    -------
    list
        Up to ``top_n`` titles, most similar first. The query movie itself is
        never part of the result.

    Raises
    ------
    IndexError
        If ``movie_title`` does not occur in ``data['Title']``.
    """
    # Work with row positions throughout: .iloc and cosine_sim are positional,
    # whereas data.index may carry arbitrary labels.
    title_hits = np.flatnonzero((data['Title'] == movie_title).to_numpy())
    if title_hits.size == 0:
        raise IndexError(f"Movie title {movie_title!r} not found in data['Title']")
    pos = int(title_hits[0])

    movie_features = vectorizer.transform([data['combined_features'].iloc[pos]])
    movie_subscription = model.predict(movie_features)[0]

    # Candidates share the predicted tier. The query row is removed explicitly
    # rather than by skipping the top-ranked entry: it is only a candidate when
    # its stored label happens to equal the prediction, and ties in similarity
    # could otherwise rank another row ahead of it.
    same_tier = (data['Subscription'] == movie_subscription).to_numpy()
    candidates = [int(i) for i in np.flatnonzero(same_tier) if i != pos]

    # sorted() is stable, so equally similar movies keep their row order.
    ranked = sorted(candidates, key=lambda i: cosine_sim[pos, i], reverse=True)
    limit = max(int(top_n), 0)
    return data['Title'].iloc[ranked[:limit]].tolist()

# Recommendation demo: list the movies closest to a single seed title.
movie_title = 'jumanji'  # seed title; compared for exact equality with df['Title']
recommended_movies = recommend(
    movie_title=movie_title,
    model=best_rf_classifier,
    data=df,
    vectorizer=tfidf_vectorizer,
    cosine_sim=cosine_sim,
)
print(recommended_movies)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': False}
Best Score: 0.3402960723284423
Subscription Accuracy using RandomizedSearchCV: 0.3317333333333333
['spider-man 2', 'the rocketeer', 'october sky', 'crazy/beautiful', 'eternal sunshine of the spotless mind', 'honey, i shrunk the kids', 'get over it', 'hidalgo', 'the crow: salvation', 'toys']
