In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity

url= "C:/Users/40538/Documents/RDK Hackathon/dataset/sample_Dataset_15k.csv"
movies_data = pd.read_csv(url, header=None, encoding='latin1')
movies_data.drop(movies_data.columns[[-1]], axis=1, inplace=True)
column_names = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'Title', 'combination', 'Rating',  'Age Category', 'Language']
movies_data.columns = column_names
movies_data['combined_features'] = movies_data['director_name'] + ' ' +  movies_data['actor_1_name'] + ' ' + movies_data['actor_2_name'] + ' ' + movies_data['actor_3_name'] + ' ' + movies_data['genres'] + ' ' +  movies_data['Age Category'].astype(str) + ' ' + movies_data['Language'] 

X = movies_data.drop('Title', axis=1)
y = movies_data['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X = movies_data.drop('Title', axis=1)
y = movies_data['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Define transformers for numeric and categorical data
numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
movies_data['combined_features'] = movies_data.apply(lambda row: ' '.join([str(row['genres']), str(row['director_name']), str(row['actor_1_name']), str(row['actor_2_name']), str(row['actor_3_name'])]), axis=1)
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_data['combined_features'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

#Golden ratio for parameter optimization
phi = (1 + 5**0.5) / 2
a, b = 0.1, 100
tolerance = 1e-4  # tolerance for convergence
# Golden section search loop to find the optimal n_estimators parameter for the Random Forest model
iterations = 0
while True:
    print(iterations, "interval:", abs(b - a))
    c = b - (b - a) / phi  # Calculate the midpoint c
    d = a + (b - a) / phi  # Calculate the other point d
 
    # Train Random Forest models with parameters c and d
    model_c = RandomForestClassifier(n_estimators=int(c), random_state=42)
    model_d = RandomForestClassifier(n_estimators=int(d), random_state=42)
    model_c.fit(X_train_preprocessed, y_train)
    model_d.fit(X_train_preprocessed, y_train)
    # Evaluate models on the test set and update intervals
    accuracy_c = accuracy_score(y_test, model_c.predict(X_test_preprocessed))
    accuracy_d = accuracy_score(y_test, model_d.predict(X_test_preprocessed))
    iterations += 1
    # Update bounds based on performance
    if accuracy_c >= accuracy_d:
        a = c  # If c performs better, move the left bound
    else:
        b = d  # If d performs better, move the right bound
    # Check if accuracy values have converged
    if abs(b-a) < tolerance:
        break
best_n_estimators = int((a + b) / 2)
final_model = RandomForestClassifier(n_estimators=best_n_estimators, random_state=42)
final_model.fit(X_train_preprocessed, y_train)
y_pred = final_model.predict(X_test_preprocessed)
accuracy = accuracy_score(y_test, y_pred)
print("Rating Accuracy using Golden Section Search:", accuracy)
 
def recommend_movies_by_title(model, X_train_preprocessed, y_train_binary, movies_data, movie_title):
    movie_row = movies_data.loc[movies_data['Title'] == movie_title]
    if movie_row.empty:
        print("Movie not found in the dataset.")
        return None
    movie_features = movie_row.drop('Title', axis=1)
    movie_features_preprocessed = preprocessor.transform(movie_features)
    rating = model.predict(movie_features_preprocessed)[0]
    similarity_scores = cosine_sim[movie_row.index]
    similar_movies_indices = similarity_scores.argsort()[0][-11:-1]  # Excluding the input movie itself
    similar_movies = movies_data.iloc[similar_movies_indices]['Title']
    return rating, similar_movies
 
# Recommendation
input_movie_title = "jumanji"
rating, recommended_movies = recommend_movies_by_title(final_model, X_train_preprocessed, y_train, movies_data, input_movie_title)
if recommended_movies is not None:
    print("Rating for {}: {}".format(input_movie_title, rating))
    print("Recommended Movies:")
    print(recommended_movies)

0 interval: 99.9
1 interval: 61.7415954761145
2 interval: 38.1584045238855
3 interval: 23.58319095222899
4 interval: 14.575213571656512
5 interval: 9.007977380572477
6 interval: 5.567236191084035
7 interval: 3.4407411894884383
8 interval: 2.126495001595597
9 interval: 1.3142461878928415
10 interval: 0.8122488137027553
11 interval: 0.5019973741900827
12 interval: 0.310251439512669
13 interval: 0.1917459346774102
14 interval: 0.11850550483525524
15 interval: 0.0732404298421514
16 interval: 0.045265074993100285
17 interval: 0.027975354849047562
18 interval: 0.01728972014404917
19 interval: 0.01068563470499484
20 interval: 0.0066040854390507775
21 interval: 0.0040815492659405095
22 interval: 0.0025225361731067153
23 interval: 0.0015590130928302415
24 interval: 0.0009635230802764738
25 interval: 0.0005954900125573204
26 interval: 0.00036803306772270616
27 interval: 0.00022745694483816692
28 interval: 0.00014057612288809196
Rating Accuracy using Golden Section Search: 0.9712
Rating for juman