In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity

# Load the raw catalogue. The file is read without a header row, so the
# column names are assigned explicitly below.
url = "/content/Sample_Dataset_15k 1.csv"
df = pd.read_csv(url, header=None, encoding='latin1')
df.fillna('', inplace=True)
column_names = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'Title', 'combination', 'Rating', 'Age Category', 'Language', 'Subscription']
df.columns = column_names

# Build one free-text description per movie. The prediction target
# ('Subscription') is deliberately NOT part of this text: the classifier
# below is trained on these features to predict 'Subscription', so including
# it would leak the label into the model's input.
feature_columns = ['director_name', 'actor_1_name', 'actor_2_name',
                   'actor_3_name', 'genres', 'Language', 'Age Category']
# Cast every column to str so a numeric column cannot break concatenation.
feature_text = df[feature_columns].astype(str)
df['combined_features'] = feature_text[feature_columns[0]].str.cat(
    feature_text[feature_columns[1:]], sep=' '
)

# TF-IDF Vectorization
# NOTE: the vectorizer is fitted on every row (train and test alike) because
# the same fitted object also produces the full similarity matrix.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Hold out 25% of the rows to evaluate the subscription classifier.
X = df.drop(['Title','Rating', 'Subscription'], axis=1)
y = df['Subscription']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train_text = tfidf_vectorizer.transform(X_train['combined_features'])
X_test_text = tfidf_vectorizer.transform(X_test['combined_features'])

# Tune the random forest with 5-fold cross-validation on the training split.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20]
}
rf_model = RandomForestClassifier(random_state=42)
# n_jobs=-1 runs the candidate fits in parallel; the fixed random_state keeps
# the selected model the same as a sequential search.
grid_search = GridSearchCV(rf_model, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_text, y_train)
best_rf_model = grid_search.best_estimator_

# Report accuracy of the best model on the held-out rows.
y_pred = best_rf_model.predict(X_test_text)
accuracy = accuracy_score(y_test, y_pred)
print("Subscription Accuracy using Grid Search:", accuracy)

def recommend(movie_title, model, data, vectorizer, cosine_sim, top_n=10):
    """Return titles similar to *movie_title* within its predicted subscription.

    The movie's ``combined_features`` text is vectorized and passed to
    ``model`` to predict a subscription value. Every other row of ``data``
    whose ``Subscription`` equals that prediction is ranked by its similarity
    to the movie, and the best ``top_n`` titles are returned.

    Args:
        movie_title: Exact value to look up in ``data['Title']``. When several
            rows share the title, the first one is used.
        model: Object with ``predict(features)`` returning a sequence whose
            first element is the predicted subscription.
        data: DataFrame with ``Title``, ``combined_features`` and
            ``Subscription`` columns.
        vectorizer: Object with ``transform(list_of_texts)``.
        cosine_sim: Square similarity matrix indexed as ``cosine_sim[i, j]``
            by row position in ``data``.
        top_n: Maximum number of titles to return (default 10).

    Returns:
        List of titles, most similar first. May hold fewer than ``top_n``
        entries, or be empty, when few rows share the predicted subscription.

    Raises:
        IndexError: If ``movie_title`` does not occur in ``data['Title']``.
    """
    # Work with row positions rather than index labels: iloc and cosine_sim
    # are positional, so this stays correct when data has a non-default index.
    title_positions = np.flatnonzero((data['Title'] == movie_title).to_numpy())
    if title_positions.size == 0:
        raise IndexError("Title {!r} not found in data".format(movie_title))
    idx = int(title_positions[0])

    movie_features = vectorizer.transform([data.iloc[idx]['combined_features']])
    movie_subscription = model.predict(movie_features)[0]

    # Exclude the queried movie explicitly. It is only among the candidates
    # when its stored subscription matches the prediction, so dropping
    # "the first result" would sometimes discard the best real match.
    candidates = [
        pos
        for pos, subscription in enumerate(data['Subscription'])
        if subscription == movie_subscription and pos != idx
    ]

    scored = [(pos, cosine_sim[idx, pos]) for pos in candidates]
    # The sort is stable, so ties keep their original row order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    best_positions = [pos for pos, _ in scored[:top_n]]
    return data.iloc[best_positions]['Title'].tolist()

# Recommendation: print the titles suggested for one sample movie.
movie_title = 'jumanji'
recommended_movies = recommend(
    movie_title,
    best_rf_model,
    df,
    tfidf_vectorizer,
    cosine_sim,
)
print(recommended_movies)

Subscription Accuracy using Grid Search: 0.3330666666666667
['spider-man 2', 'october sky', 'wimbledon', 'crazy/beautiful', 'eternal sunshine of the spotless mind', 'honey, i shrunk the kids', 'get over it', 'all i wanna do', 'the night listener', 'the crow: salvation']
