In [None]:
import pandas as pd

# Read the two TMDB tables and the MovieLens ratings from the data folder
tmdb_movies = pd.read_csv("../data/tmdb_5000_movies.csv")
tmdb_credits = pd.read_csv("../data/tmdb_5000_credits.csv")
ratings = pd.read_csv("../data/ratings.csv")

# Report the size of every table that was just loaded
for label, frame in (
    ("TMDB Movies:", tmdb_movies),
    ("TMDB Credits:", tmdb_credits),
    ("Ratings:", ratings),
):
    print(label, frame.shape)

# Attach credits to movies: movies.id is matched against credits.movie_id
movies = pd.merge(tmdb_movies, tmdb_credits, left_on="id", right_on="movie_id")
movies.head(2)

# Keep only the columns that feed the content-based movie profile
profile_source_columns = [
    'id', 'original_title', 'overview', 'genres', 'keywords', 'cast', 'crew',
]
movies = movies[profile_source_columns]
movies.head(2)

# Convert JSON-like text to lists
import ast

def extract_names(obj):
    """Return the normalised ``name`` of every entry in a serialised list of dicts.

    Parameters
    ----------
    obj : str
        Text holding a Python-literal list of dictionaries, each with a
        ``'name'`` key (e.g. ``'[{"id": 28, "name": "Action"}]'``).

    Returns
    -------
    list of str
        Each name lower-cased with spaces removed, in input order. An empty
        list is returned when ``obj`` cannot be parsed or does not have the
        expected structure (missing value, malformed text, missing key, ...).
    """
    try:
        items = ast.literal_eval(obj)
        return [i['name'].lower().replace(" ", "") for i in items]
    # Only data problems are treated as "no names". A bare ``except`` would
    # also swallow KeyboardInterrupt/SystemExit and hide genuine bugs.
    except (ValueError, SyntaxError, TypeError, KeyError, AttributeError):
        return []
# Both columns hold the same serialised list-of-dicts layout, so they share one parser
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(extract_names)

preview_columns = ['original_title', 'genres', 'keywords']
movies[preview_columns].head(2)

#   Extract top 3 cast members
def get_top3_cast(obj):
    """Return the normalised names of the first three cast entries.

    Parameters
    ----------
    obj : str
        Text holding a Python-literal list of dictionaries, each with a
        ``'name'`` key. Entries are taken in the order they appear.

    Returns
    -------
    list of str
        Up to three names, lower-cased with spaces removed. An empty list is
        returned when ``obj`` cannot be parsed or is not shaped as expected.
    """
    try:
        items = ast.literal_eval(obj)
        return [i['name'].lower().replace(" ", "") for i in items[:3]]
    # Restrict the fallback to data errors. A bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit.
    except (ValueError, SyntaxError, TypeError, KeyError, AttributeError):
        return []
    
# Replace the raw cast text with the first three normalised cast names
top_billed = movies['cast'].apply(get_top3_cast)
movies['cast'] = top_billed
movies[['original_title', 'cast']].head(2)

#   Extract director from crew
def get_director(obj):
    """Return the first crew member whose job is ``'Director'``.

    Parameters
    ----------
    obj : str
        Text holding a Python-literal list of dictionaries, each with
        ``'job'`` and ``'name'`` keys.

    Returns
    -------
    list of str
        A one-element list with the director's name (lower-cased, spaces
        removed), or an empty list when no director is listed or ``obj``
        cannot be parsed / is not shaped as expected.
    """
    try:
        items = ast.literal_eval(obj)
        for i in items:
            if i['job'] == 'Director':
                # Only the first matching entry is used
                return [i['name'].lower().replace(" ", "")]
        return []
    # Restrict the fallback to data errors. A bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit.
    except (ValueError, SyntaxError, TypeError, KeyError, AttributeError):
        return []

# Director: pulled out of the crew column into its own list column
director_lists = movies['crew'].apply(get_director)
movies['director'] = director_lists
movies[['original_title', 'director']].head(2)

# Overview: missing text becomes "", then lower-case whitespace-separated tokens
overview_text = movies['overview'].fillna("")
movies['overview'] = overview_text.apply(lambda text: text.lower().split())
movies[['original_title', 'overview']].head(2)

# Movie profile: concatenate the token lists row by row, in a fixed order,
# then flatten each row's tokens into one space-separated string
profile_tokens = movies['overview']
for column in ('genres', 'keywords', 'cast', 'director'):
    profile_tokens = profile_tokens + movies[column]
movies['movie_profile'] = profile_tokens.apply(" ".join)
movies[['original_title', 'movie_profile']].head(2)

# Final table: identifier, title and the text profile
final_columns = ['id', 'original_title', 'movie_profile']
final_movies = movies[final_columns]
final_movies.head()

# Vectorize movie profiles (bag of words, English stop words removed,
# vocabulary capped at the 5000 most frequent terms)
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(final_movies['movie_profile']).toarray()

print(vectors.shape)

# Calculate cosine similarity between every pair of movies
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vectors)
print(similarity.shape)

#   Create a mapping from movie titles to row positions.
#   Positions (0..n-1) are used rather than index labels because the values
#   are later used to index `similarity` and with `.iloc`.
movie_index = pd.Series(range(len(final_movies)),
                        index=final_movies['original_title'])
#   Series.drop_duplicates() compares the *values* (row positions, which are
#   already unique), so it never removed repeated titles. De-duplicate on the
#   index labels instead, keeping the first row for each title, so that a
#   title lookup always yields a single position.
movie_index = movie_index[~movie_index.index.duplicated(keep='first')]
movie_index["Inception"]

#   Recommendation function
def recommend(movie, top_n=10):
    """Return the titles of the movies most similar to ``movie``.

    Uses the module-level ``movie_index`` (title -> row position),
    ``similarity`` (square matrix of pairwise scores) and ``final_movies``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``movie_index``.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series or None
        The ``original_title`` values of the best-scoring other movies, best
        first. ``None`` (after printing a message) when the title is unknown.
    """
    # Get index of the movie
    idx = movie_index.get(movie)

    if idx is None:
        print("Movie not found in dataset")
        return

    # A title that appears more than once in the index yields a Series of
    # positions; use the first occurrence instead of failing further down.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other movie with its score. The queried movie is excluded
    # by position: slicing off the first sorted element is wrong whenever the
    # query does not sort first (ties with identical profiles, or an
    # all-zero row for a movie with an empty profile).
    sim_scores = [
        (position, score)
        for position, score in enumerate(similarity[idx])
        if position != idx
    ]

    # Highest similarity first; sorted() is stable, so ties keep row order
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)[:top_n]

    # Get movie indices
    movie_indices = [position for position, _ in sim_scores]

    # Return the most similar movies
    return final_movies['original_title'].iloc[movie_indices]

# Smoke-test the recommender; the first result is computed but not shown
recommend("Inception")

# Print the remaining results with a blank line between them
for position, query in enumerate(("The Dark Knight", "The Godfather")):
    if position:
        print()
    print(recommend(query))



In [None]:
import pandas as pd

ratings = pd.read_csv("../data/ratings.csv")
ratings.head()

# Headline counts for the ratings table
rating_count = len(ratings)
user_count = ratings['userId'].nunique()
movie_count = ratings['movieId'].nunique()

print("Total ratings:", rating_count)
print("Unique users:", user_count)
print("Unique movies:", movie_count)
print("Average ratings per user:", rating_count / user_count)
print("Average ratings per movie:", rating_count / movie_count)


# One row per user, one column per movie, cells hold the rating
# (pivot_table's default aggregation applies if a pair repeats)
user_movie_matrix = pd.pivot_table(
    ratings,
    index='userId',
    columns='movieId',
    values='rating',
)

print(user_movie_matrix.shape)
user_movie_matrix.head()

# Share of user/movie cells that have no rating
missing = user_movie_matrix.isna().sum().sum()
total = user_movie_matrix.size
sparsity_pct = round((missing/total) * 100, 2)
print(f"Sparsity: {sparsity_pct}%")


In [None]:
# Load the links table from the data folder.
# NOTE(review): presumably the MovieLens movieId -> IMDb/TMDB id mapping — confirm against the file.
# `pd` is not imported in this cell, so an earlier cell must have been run first.
links = pd.read_csv("../data/links.csv")
links.head()

In [None]:
import pandas as pd

# Three Tamil movie sources, each with its own column layout
tamil_2011_2019 = pd.read_csv("../data/tamil_movies_2011-2019.csv")
tamil_2015_2025 = pd.read_csv("../data/tamil_movies_2015-2025.csv")
tamil_imdb_2023 = pd.read_csv("../data/imdb_tamil_2023.csv")

# Row/column counts per source
for label, frame in (
    ("2011-2019 shape:", tamil_2011_2019),
    ("2015-2025 shape:", tamil_2015_2025),
    ("2023 IMDb shape:", tamil_imdb_2023),
):
    print(label, frame.shape)

# Column names per source, needed to map each file onto the common schema
for heading, frame in (
    ("Columns 2011-2019:\n", tamil_2011_2019),
    ("\nColumns 2015-2025:\n", tamil_2015_2025),
    ("\nColumns IMDb 2023:\n", tamil_imdb_2023),
):
    print(heading, frame.columns)

t1 = tamil_2011_2019.copy()

# Rename to our common schema
t1 = t1.rename(columns={
    'MovieName': 'title',
    'Genre': 'genres',
    'Director': 'director',
    'Actor': 'cast',
    'Rating': 'imdb_rating'      # use this as IMDb rating
})

# Create empty overview (this dataset doesn't have one)
t1['overview'] = ""

# Create unique dummy IDs that cannot clash with TMDB.
# The previous base of 100000 is inside the range of real TMDB ids
# (NOTE(review): the TMDB 5000 dump appears to contain six-digit ids —
# confirm with tmdb_movies['id'].max()), so the synthetic block now starts
# far above it.
t1['id'] = range(10_000_000, 10_000_000 + len(t1))

# Keep only the columns we need
t1 = t1[['id','title','overview','genres','cast','director','imdb_rating']]

t1.head(2)


t3 = tamil_2015_2025.copy()

# Rename to our common schema. 'director', 'cast' and 'overview' already use
# the target names in this file, so they need no entry here.
t3 = t3.rename(columns={
    'tittle': 'title',     # note: your file spells it "tittle"
    'genre': 'genres',
})

# No rating in this dataset: fill with a real NaN so the column stays numeric
# (None would create an object column).
t3['imdb_rating'] = float("nan")

# Create unique dummy IDs that cannot clash with TMDB.
# The previous base of 200000 is inside the range of real TMDB ids
# (NOTE(review): confirm with tmdb_movies['id'].max()), so the synthetic
# block now starts far above it.
t3['id'] = range(20_000_000, 20_000_000 + len(t3))

t3 = t3[['id','title','overview','genres','cast','director','imdb_rating']]

t3.head(2)


t4 = tamil_imdb_2023.copy()

# Rename to our common schema
t4 = t4.rename(columns={
    'Movie Name': 'title',
    'IMdb rating': 'imdb_rating',
    'Description': 'overview'
})

# We don't have genres, cast, director here → create empty placeholders
t4['genres'] = ""
t4['cast'] = ""
t4['director'] = ""

# Create unique dummy IDs that cannot clash with TMDB.
# The previous base of 300000 is inside the range of real TMDB ids
# (NOTE(review): confirm with tmdb_movies['id'].max()), so the synthetic
# block now starts far above it.
t4['id'] = range(30_000_000, 30_000_000 + len(t4))

t4 = t4[['id','title','overview','genres','cast','director','imdb_rating']]

t4.head(2)

# Stack the three normalised Tamil tables into one frame with a fresh 0..n-1 index
tamil_sources = [t1, t3, t4]
tamil_movies = pd.concat(tamil_sources, ignore_index=True)

print("Combined Tamil shape:", tamil_movies.shape)
tamil_movies.head(3)


def to_list(x):
    """Normalise a comma-separated string (or a list of strings) into tokens.

    Parameters
    ----------
    x : str, list, or anything else
        A comma-separated string such as ``"Action, Drama"``, or a list of
        strings. Any other value (``None``, NaN, numbers, ...) is treated as
        "no data".

    Returns
    -------
    list of str
        Each entry stripped, lower-cased and with spaces removed. Blank
        entries (e.g. from ``""`` or ``"a,,b"``) and non-string list items
        are dropped, so an empty placeholder yields ``[]`` rather than ``['']``.
    """
    if isinstance(x, str):
        parts = x.split(',')
    elif isinstance(x, list):
        parts = x
    else:
        return []

    tokens = (
        part.strip().lower().replace(" ", "")
        for part in parts
        if isinstance(part, str)
    )
    # Drop empty tokens so they do not end up in the movie profile
    return [token for token in tokens if token]

# Comma-separated text columns → lists of normalised tokens
for list_column in ('genres', 'cast', 'director'):
    tamil_movies[list_column] = tamil_movies[list_column].apply(to_list)

# Overview: missing text becomes "", then lower-case whitespace-separated tokens
overview_text = tamil_movies['overview'].fillna("")
tamil_movies['overview'] = overview_text.apply(lambda text: text.lower().split())

# Concatenate the token lists row by row, then flatten each row into one string
profile_tokens = tamil_movies['overview']
for column in ('genres', 'cast', 'director'):
    profile_tokens = profile_tokens + tamil_movies[column]
tamil_movies['movie_profile'] = profile_tokens.apply(" ".join)

tamil_movies[['title', 'movie_profile']].head(2)

# Same three-column layout as the TMDB table (once its title column is renamed)
tamil_final_columns = ['id', 'title', 'movie_profile']
tamil_final = tamil_movies[tamil_final_columns]
tamil_final.head()

# Rename original_title → title in Hollywood data
final_movies = final_movies.rename(columns={'original_title': 'title'})

final_movies_extended = pd.concat([final_movies, tamil_final], ignore_index=True)

print("Old size:", final_movies.shape)
print("New size:", final_movies_extended.shape)

# Strip list-style numbering such as "206. " from the front of titles.
# Done before the fallback below so the number does not leak into the profile.
final_movies_extended['title'] = final_movies_extended['title'].str.replace(r'^\d+\.\s*', '', regex=True)

# If movie_profile is empty, use the title words instead.
# The fallback is stored as a space-separated *string*, like every other
# profile; the earlier version stored a list, which the vectorizer cannot
# consume. Missing titles are treated as "" instead of raising.
empty_profile = final_movies_extended['movie_profile'].fillna("").str.strip() == ""
title_words = (
    final_movies_extended['title']
    .fillna("")
    .str.lower()
    .str.replace(".", "", regex=False)
    .str.split()
    .str.join(" ")
)
final_movies_extended.loc[empty_profile, 'movie_profile'] = title_words[empty_profile]

# Save only after the clean-up so the CSV that later cells reload contains
# the cleaned titles and the filled-in profiles.
final_movies_extended.to_csv("../data/final_movies_extended.csv", index=False)

final_movies_extended.tail(5)


In [23]:
import pandas as pd

final_movies_extended = pd.read_csv("../data/final_movies_extended.csv")

print(final_movies_extended.shape)
print(final_movies_extended.columns)

# An empty movie_profile is written to CSV as an empty field, which read_csv
# parses back as NaN. The vectorizer below only accepts strings, so restore
# those cells to "".
final_movies_extended['movie_profile'] = final_movies_extended['movie_profile'].fillna("")

# 1) Remove leading numbers like "206. "
final_movies_extended['title'] = final_movies_extended['title'].str.replace(
    r'^\d+\.\s*', '', regex=True
)

# 2) Fix spacing issues (e.g., "DoubleX" -> "Double X")
final_movies_extended['title'] = final_movies_extended['title'].str.replace(
    r'DoubleX', 'Double X', regex=False
)

# 3) Remove leading/trailing spaces. Case is left unchanged: titles are
#    looked up by their exact spelling further down.
final_movies_extended['title'] = (
    final_movies_extended['title']
    .str.strip()
)

# Keep only the first occurrence of each title, then renumber rows 0..n-1 so
# row labels match positions in the similarity matrix built from this frame
final_movies_extended = final_movies_extended.drop_duplicates(
    subset=['title'], keep='first'
).reset_index(drop=True)

print("New shape after removing duplicates:", final_movies_extended.shape)

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model: English stop words removed, vocabulary capped at 5000 terms
cv = CountVectorizer(max_features=5000, stop_words='english')

# Fit on the profiles, then materialise the counts as a dense array
profile_counts = cv.fit_transform(final_movies_extended['movie_profile'])
vectors = profile_counts.toarray()
print("New vector shape:", vectors.shape)

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every two movies
similarity = cosine_similarity(vectors)
print("Similarity matrix shape:", similarity.shape)

# Title → row lookup used by recommend()
title_labels = final_movies_extended['title']
row_labels = final_movies_extended.index
movie_index = pd.Series(row_labels, index=title_labels).drop_duplicates()

def recommend(movie, top_n=10):
    """Return the titles of the movies most similar to ``movie``.

    Uses the module-level ``movie_index`` (title -> row position),
    ``similarity`` (square matrix of pairwise scores) and
    ``final_movies_extended``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``movie_index``.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    list of str
        Titles of the best-scoring other movies, best first. An empty list
        (after printing a message) when the title is unknown.
    """
    idx = movie_index.get(movie)

    if idx is None:
        print("Movie not found in dataset")
        return []

    # A title that appears more than once in the index yields a Series of
    # positions; use the first occurrence instead of failing further down.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # The queried movie is excluded by position. Slicing off the first sorted
    # element is wrong whenever the query does not sort first (ties with
    # identical profiles, or an all-zero row for a movie whose profile has no
    # vocabulary terms).
    sim_scores = [
        (position, score)
        for position, score in enumerate(similarity[idx])
        if position != idx
    ]
    # Highest similarity first; sorted() is stable, so ties keep row order
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)[:top_n]
    movie_indices = [position for position, _ in sim_scores]

    return final_movies_extended['title'].iloc[movie_indices].tolist()


# Show recommendations for one Hollywood and one Tamil title
for heading, query in (
    ("Recommendations for Inception:\n", "Inception"),
    ("\nRecommendations for Jigarthanda Double X:\n", "Jigarthanda Double X"),
):
    print(heading)
    print(recommend(query))


(6131, 3)
Index(['id', 'title', 'movie_profile'], dtype='object')
New shape after removing duplicates: (5671, 3)
New vector shape: (5671, 5000)
Similarity matrix shape: (5671, 5671)
Recommendations for Inception:

['Duplex', 'The Helix... Loaded', 'Star Trek II: The Wrath of Khan', 'Timecop', 'Chicago Overcoat', 'Looper', 'Premium Rush', 'Transformers: Revenge of the Fallen', 'Congo', 'Flatliners']

Recommendations for Jigarthanda Double X:

['Mark Antony', 'Ok Kanmani', 'Kubera', 'Girl 6', 'Inside Deep Throat', 'Harrison Montgomery', 'Iraivi', 'Kadhai', 'Incident at Loch Ness', 'Baby and Baby']
