In [11]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import re

# Load Dataset
df = pd.read_csv("indian movies.csv", encoding='utf-8')

# Data Cleaning
def clean_text(text):
    if pd.isna(text):
        return "unknown"
    text = text.lower().strip()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    return text

df["Year"] = pd.to_numeric(df["Year"], errors='coerce')
df["Timing(min)"] = df["Timing(min)"].str.extract('(\\d+)').astype(float)
df["Rating(10)"] = pd.to_numeric(df["Rating(10)"], errors='coerce')
df["Votes"] = df["Votes"].str.replace(',', '', regex=True)
df["Votes"] = pd.to_numeric(df["Votes"], errors='coerce')
df["Genre"] = df["Genre"].replace("-", np.nan).fillna("Unknown").apply(clean_text)
df["Language"] = df["Language"].apply(clean_text)
df["Movie Name"] = df["Movie Name"].apply(clean_text)

# Create a combined feature for NLP processing
df["Features"] = df["Movie Name"] + " " + df["Genre"] + " " + df["Rating(10)"].astype(str) + " " + df["Language"]
df["Features"] = df["Features"].fillna("unknown")

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english')
feature_matrix = tfidf.fit_transform(df["Features"])

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(feature_matrix, df["Movie Name"], test_size=0.2, random_state=42)

# Nearest Neighbors Model
nn_model = NearestNeighbors(n_neighbors=10, metric='cosine', algorithm='brute')
nn_model.fit(feature_matrix)

# Evaluate Model
def evaluate_model():
    similarities = cosine_similarity(X_test, feature_matrix)
    correct_predictions = 0
    total_predictions = len(X_test.toarray())

    for i in range(total_predictions):
        predicted_idx = np.argmax(similarities[i])
        predicted_movie = df.iloc[predicted_idx]["Movie Name"]
        actual_movie = y_test.iloc[i]
        if predicted_movie == actual_movie:
            correct_predictions += 1

    accuracy = correct_predictions / total_predictions
    return accuracy

accuracy = evaluate_model()
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Recommendation Function
def recommend_movies(title, genre, rating, top_n=10):
    title = clean_text(title)
    genre = clean_text(genre)
    input_features = tfidf.transform([title + " " + genre + " " + str(rating)])
    distances, indices = nn_model.kneighbors(input_features, n_neighbors=top_n)
    return df.iloc[indices[0]][["Movie Name", "Genre", "Rating(10)"]]

# User Input
user_title = input("Enter a movie name: ")
user_genre = input("Enter the movie genre: ")
user_rating = float(input("Enter the expected rating (0-10): "))

# Get Recommendations
print("Recommended Movies:")
print(recommend_movies(user_title, user_genre, user_rating))


Model Accuracy: 99.72%
Enter a movie name: dabangg
Enter the movie genre: action drama
Enter the expected rating (0-10): 6.3
Recommended Movies:
             Movie Name                  Genre  Rating(10)
40711           dabangg          action comedy         6.2
46723         dabangg 3          action comedy         3.2
7136          dabangg 2          action comedy         4.8
16146         dabangg 3          action comedy         3.2
38970         dabangg 3          action comedy         3.2
49621         dabangg 3          action comedy         3.2
32995  he  the only one                 action         NaN
29851  enough is enough                 action         NaN
43444          take off  action drama thriller         8.2
12958                 e  action drama thriller         6.3
