In [14]:
"""

@Code author Hoping

This is the Lumaa AI & ML Coding Challenge solution, built and executed with sample output, by Hoping Raising


AI/Machine Learning Intern Challenge: Simple Content-Based Recommendation

"""

import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK data packages, in the same order as before
for _nltk_package in ("punkt", "wordnet", 'punkt_tab'):
    nltk.download(_nltk_package)

# Load dataset: read only the title/overview columns and drop rows with a missing value
df = pd.read_csv("movies_metadataLumma.csv", usecols=['title', 'overview'])
df = df.dropna()

# Discard rows whose overview is empty or whitespace-only, then renumber rows from 0
_has_overview = df['overview'].str.strip() != ''
df = df[_has_overview].reset_index(drop=True)

# Text preprocessing: one shared lemmatizer instance reused across all calls
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Return a normalised, space-joined version of ``text``.

    Steps, in order: lowercase the input, replace each run of non-word
    characters with a single space, tokenize the result, lemmatize every
    token, and join the lemmas back together with single spaces.
    """
    lowered = text.lower()
    cleaned = re.sub(r'\W+', ' ', lowered)  # punctuation/whitespace runs -> one space
    tokens = word_tokenize(cleaned)
    lemmas = map(lemmatizer.lemmatize, tokens)
    return " ".join(lemmas)

# Normalise every overview with preprocess_text and store the result alongside the raw text
df['processed_overview'] = [preprocess_text(overview) for overview in df['overview']]

# Fit a TF-IDF model (English stop words removed) on the processed overviews;
# tfidf_matrix holds one row per movie, in the same order as df
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(df['processed_overview'])

# Function to recommend movies based on user input
def recommend_movies(user_input, top_n=5):
    """Recommend the movies most similar to a free-text description.

    The query is run through ``preprocess_text``, vectorized with the
    module-level fitted ``vectorizer`` and compared against ``tfidf_matrix``
    using cosine similarity.

    Args:
        user_input: Free-text description of the kind of movie wanted.
        top_n: Maximum number of titles to return.

    Returns:
        A list of movie titles ordered from most to least similar.
        If ``user_input`` is ``None`` or blank, a one-element list holding a
        prompt message is returned instead. If ``top_n`` is not positive, an
        empty list is returned.
    """
    if user_input is None or not user_input.strip():
        return ["Please provide a valid movie description."]

    # Guard against non-positive top_n: the slice [-0:] would select *every*
    # index (returning the whole catalogue), and a negative value would slice
    # from the front instead of the back.
    if top_n <= 0:
        return []

    user_tfidf = vectorizer.transform([preprocess_text(user_input)])  # Preprocess and vectorize input
    similarities = cosine_similarity(user_tfidf, tfidf_matrix).flatten()  # One score per movie

    # argsort is ascending, so take the last top_n indices and reverse them
    top_indices = np.argsort(similarities)[-top_n:][::-1]

    return df.iloc[top_indices]['title'].tolist()  # Return only movie titles

# Main execution loop
if __name__ == "__main__":
    # Interactive loop: read a description, print recommendations, repeat until 'exit'
    while True:
        try:
            user_query = input("Describe the type of movie you like (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave cleanly instead of dumping a traceback
            print("\nGoodbye!")
            break

        if user_query.lower() == 'exit':
            print("Goodbye!")
            break

        # A blank query is not a recommendation request; report it on its own
        # rather than listing the validation message as result "1." under the header
        if not user_query:
            print("Please provide a valid movie description.\n")
            continue

        recommendations = recommend_movies(user_query, top_n=5)
        # Report the number of titles actually returned (may be fewer than 5)
        print(f"\nTop {len(recommendations)} Movie Recommendations:")
        for idx, title in enumerate(recommendations, 1):  # Starting from 1
            print(f"{idx}. {title}")
        print()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Describe the type of movie you like (or type 'exit' to quit): I love thrilling action movies set in space, with a comedic twist

Top 5 Movie Recommendations:
1. Dangerous Game
2. Last Action Hero
3. Living in Oblivion
4. Homage
5. Pulp Fiction

Describe the type of movie you like (or type 'exit' to quit): comedy

Top 5 Movie Recommendations:
1. Pie in the Sky
2. Four Weddings and a Funeral
3. To Die For
4. Speechless
5. Martin Lawrence: You So Crazy

Describe the type of movie you like (or type 'exit' to quit): action thriller

Top 5 Movie Recommendations:
1. Sudden Death
2. Fatal Instinct
3. Dangerous Game
4. Cronos
5. Taxi Driver

Describe the type of movie you like (or type 'exit' to quit): I want to watch science fictions

Top 5 Movie Recommendations:
1. Century
2. Man of the Year
3. Race the Sun
4. Condition Red
5. Blue Sky

Describe the type of movie you like (or type 'exit' to quit): movie about secret agent undercover

Top 5 Movie Recommendations:
1. Drop Zone
2. True Lies
3. D