In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
 
url= "C:/Users/40538/Documents/RDK Hackathon/dataset/sample_Dataset_15k.csv"
movies_data = pd.read_csv(url, header=None, encoding='latin1')
column_names = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'Title', 'combination', 'Rating',  'Age Category', 'Language', 'Subscription']
movies_data.columns = column_names
movies_data['combined_features'] = movies_data['director_name'] + ' ' +  movies_data['actor_1_name'] + ' ' + movies_data['actor_2_name'] + ' ' + movies_data['actor_3_name'] + ' ' + movies_data['genres'] + ' ' +  movies_data['Age Category'].astype(str) + ' ' + movies_data['Language']
 
X = movies_data.drop('Title', axis=1)
y = movies_data['Subscription']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
 
# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
movies_data['combined_features'] = movies_data.apply(lambda row: ' '.join([str(row['genres']), str(row['director_name']), str(row['actor_1_name']), str(row['actor_2_name']), str(row['actor_3_name'])]), axis=1)
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_data['combined_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
 
#Golden ratio for parameter optimization
phi = (1 + 5**0.5) / 2
a, b = 1, 100
tolerance = 1e-2 
iterations = 0
while True:
    c = b - (b - a) / phi  # Calculate the midpoint c
    d = a + (b - a) / phi  # Calculate the other point d
    print(iterations, "interval:", abs(b - a), "c: ", c , "d: ", d)
    # Train Random Forest models with parameters c and d
    model_c = RandomForestClassifier(n_estimators=int(c), random_state=42)
    model_d = RandomForestClassifier(n_estimators=int(d), random_state=42)
    model_c.fit(X_train_preprocessed, y_train)
    model_d.fit(X_train_preprocessed, y_train)
    # Evaluate models on the test set and update intervals
    accuracy_c = accuracy_score(y_test, model_c.predict(X_test_preprocessed))
    accuracy_d = accuracy_score(y_test, model_d.predict(X_test_preprocessed))
    iterations += 1
    # Update bounds based on performance
    if accuracy_c >= accuracy_d:
        a = c  # If c performs better, move the left bound
    else:
        b = d  # If d performs better, move the right bound
    # Check if accuracy values have converged
    if abs(b-a) < tolerance:
        break
# Use the best n_estimators parameter found by golden section search for final model
best_n_estimators = int((a + b) / 2)
final_model = RandomForestClassifier(n_estimators=best_n_estimators, random_state=42)
final_model.fit(X_train_preprocessed, y_train)
y_pred = final_model.predict(X_test_preprocessed)
accuracy = accuracy_score(y_test, y_pred)
print("Rating Accuracy using Golden Section Search:", accuracy)
def recommend_movies_by_title(model, X_train_preprocessed, y_train_binary, movies_data, movie_title):
    movie_row = movies_data.loc[movies_data['Title'] == movie_title]
    if movie_row.empty:
        print("Movie not found in the dataset.")
        return None
    movie_features = movie_row.drop('Title', axis=1)
    movie_features_preprocessed = preprocessor.transform(movie_features)
    rating = model.predict(movie_features_preprocessed)[0]
    similarity_scores = cosine_sim[movie_row.index]
    similar_movies_indices = similarity_scores.argsort()[0][-11:-1]
    similar_movies = movies_data.iloc[similar_movies_indices]['Title']
    return rating, similar_movies
# Recommendation:
input_movie_title = "jumanji"
rating, recommended_movies = recommend_movies_by_title(final_model, X_train_preprocessed, y_train, movies_data, input_movie_title)
if recommended_movies is not None:
    print("Rating for {}: {}".format(input_movie_title, rating))
    print("Recommended Movies:")
    print(recommended_movies)

0 interval: 99 c:  38.81463511376041 d:  62.18536488623959
1 interval: 61.18536488623959 c:  24.37072977247918 d:  38.81463511376041
2 interval: 37.81463511376041 c:  38.81463511376041 d:  47.74145954495836
3 interval: 23.37072977247918 c:  33.29755420367713 d:  38.81463511376041
4 interval: 14.44390534128123 c:  38.81463511376041 d:  42.22437863487508
5 interval: 8.92682443119795 c:  36.7072977247918 d:  38.81463511376041
6 interval: 5.51708091008328 c:  38.81463511376041 d:  40.11704124590647
7 interval: 3.4097435211146703 c:  38.00970385693786 d:  38.81463511376041
8 interval: 2.1073373889686096 c:  38.81463511376041 d:  39.31210998908392
9 interval: 1.3024061321460607 c:  39.312109989083915 d:  39.619566370582966
10 interval: 0.804931256822556 c:  39.61956637058296 d:  39.80958486440743
11 interval: 0.49747487532351187 c:  39.80958486440743 d:  39.927022752082
12 interval: 0.3074563814990441 c:  39.92702275208201 d:  39.99960335823189
13 interval: 0.19001849382446068 c:  39.9996033