# Task3

In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [36]:
# Load the movies table; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer='.csv /movies.csv')

In [37]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [38]:
# Concatenate the descriptive text columns into one space-separated
# string per row and store it in 'tagline'; that column is the text the
# TF-IDF vectorizer is fitted on later.
text_columns = [
    'spoken_languages',
    'keywords',
    'genres',
    'cast',
    'crew',
    'overview',
    'production_companies',
]

combined_text = df[text_columns[0]].fillna('')
for column_name in text_columns[1:]:
    # Missing entries contribute an empty string rather than NaN.
    combined_text = combined_text + ' ' + df[column_name].fillna('')
df['tagline'] = combined_text

df.head(n=5)

Unnamed: 0,id,title,genres,overview,rating,spoken_languages,cast,crew,vote_average,vote_count,popularity,budget,keywords,production_companies,revenue,runtime,tagline
0,862,Toy Story,Animation Comedy Family,"Led by Woody, Andy's toys live happily in his ...",3.0,English,TomHanks TimAllen DonRickles,JohnLasseter,7.7,5415.0,21.946943,30000000,jealousy toy boy friendship friends rivalry bo...,Pixar Animation Studios,373554033.0,81.0,English jealousy toy boy friendship friends ri...
1,862,Toy Story,Animation Comedy Family,"Led by Woody, Andy's toys live happily in his ...",4.0,English,TomHanks TimAllen DonRickles,JohnLasseter,7.7,5415.0,21.946943,30000000,jealousy toy boy friendship friends rivalry bo...,Pixar Animation Studios,373554033.0,81.0,English jealousy toy boy friendship friends ri...
2,862,Toy Story,Animation Comedy Family,"Led by Woody, Andy's toys live happily in his ...",5.0,English,TomHanks TimAllen DonRickles,JohnLasseter,7.7,5415.0,21.946943,30000000,jealousy toy boy friendship friends rivalry bo...,Pixar Animation Studios,373554033.0,81.0,English jealousy toy boy friendship friends ri...
3,862,Toy Story,Animation Comedy Family,"Led by Woody, Andy's toys live happily in his ...",2.0,English,TomHanks TimAllen DonRickles,JohnLasseter,7.7,5415.0,21.946943,30000000,jealousy toy boy friendship friends rivalry bo...,Pixar Animation Studios,373554033.0,81.0,English jealousy toy boy friendship friends ri...
4,862,Toy Story,Animation Comedy Family,"Led by Woody, Andy's toys live happily in his ...",3.5,English,TomHanks TimAllen DonRickles,JohnLasseter,7.7,5415.0,21.946943,30000000,jealousy toy boy friendship friends rivalry bo...,Pixar Animation Studios,373554033.0,81.0,English jealousy toy boy friendship friends ri...


In [39]:
# Remove the columns that the remaining cells of this notebook do not
# reference; the recommender only needs id, title, genres and tagline.
unused_columns = [
    'rating',
    'spoken_languages',
    'vote_average',
    'vote_count',
    'popularity',
    'budget',
    'revenue',
    'runtime',
    'cast',
    'keywords',
    'crew',
    'overview',
    'production_companies',
]
df = df.drop(columns=unused_columns)
df.head(n=5)

Unnamed: 0,id,title,genres,tagline
0,862,Toy Story,Animation Comedy Family,English jealousy toy boy friendship friends ri...
1,862,Toy Story,Animation Comedy Family,English jealousy toy boy friendship friends ri...
2,862,Toy Story,Animation Comedy Family,English jealousy toy boy friendship friends ri...
3,862,Toy Story,Animation Comedy Family,English jealousy toy boy friendship friends ri...
4,862,Toy Story,Animation Comedy Family,English jealousy toy boy friendship friends ri...


In [40]:
# Report how many rows are exact repeats of an earlier row, then keep
# only the first occurrence of each and renumber the index from 0 so
# row labels coincide with row positions.
is_repeat = df.duplicated()
print(is_repeat.sum())
df = df[~is_repeat].reset_index(drop=True)

18816


In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Make sure the text column holds only strings before vectorizing.
df['tagline'] = df['tagline'].fillna('')

tfv = TfidfVectorizer(
    analyzer='word',              # build features from word tokens
    token_pattern=r'\w{1,}',      # a token is a run of one or more word characters
    strip_accents='unicode',      # strip accents from characters (text itself is kept)
    stop_words='english',         # drop common English words such as "the", "that"
    ngram_range=(1, 3),           # use single words, word pairs and word triples
    min_df=3,                     # ignore terms found in fewer than 3 documents
    max_features=None,            # do not cap the vocabulary size
)

In [42]:
# Fit the TF-IDF vectorizer on the combined 'tagline' text and transform
# that same text into the document-term matrix in one step.
tagline_text = df['tagline']
tfv_matrix = tfv.fit_transform(tagline_text)
print(tfv_matrix)

  (0, 8921)	0.016462131388803643
  (0, 15333)	0.07948259519445919
  (0, 29134)	0.10056585474217311
  (0, 3323)	0.0638252096853962
  (0, 12091)	0.06523515079120551
  (0, 12071)	0.05663767024093756
  (0, 24076)	0.08852887567673193
  (0, 29136)	0.11096460190113233
  (0, 1467)	0.11934440925490594
  (0, 5263)	0.029845016229510846
  (0, 10491)	0.040797219159699756
  (0, 28982)	0.0930181837053329
  (0, 28833)	0.11096460190113233
  (0, 15665)	0.12256416744666161
  (0, 16677)	0.08081371398998001
  (0, 31172)	0.3182476294356918
  (0, 1408)	0.3053000193862296
  (0, 24711)	0.0844744416224003
  (0, 29138)	0.1088177714256033
  (0, 17119)	0.06915478065444072
  (0, 13326)	0.09892762283569115
  (0, 24546)	0.0901671075832139
  (0, 2991)	0.09075550538805756
  (0, 3485)	0.08177162788119566
  (0, 3842)	0.35474489229052264
  :	:
  (7795, 28925)	0.1297161614994831
  (7795, 5953)	0.14055836598812124
  (7795, 12648)	0.11280598070580682
  (7795, 10083)	0.11280598070580682
  (7795, 29104)	0.14554413873438313
  (

In [43]:
# Pairwise similarity between the TF-IDF vectors of every pair of movies.
from sklearn.metrics.pairwise import sigmoid_kernel
# NOTE: the sigmoid kernel is tanh(gamma * <x, y> + coef0), so the scores
# lie in (-1, 1), not in [0, 1]. Only their relative order matters for
# ranking similar movies.
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

# Map each title to the row label of its movie in df.
# Series.drop_duplicates() compares the *values* (row labels, which are
# already unique) and therefore leaves repeated titles in place, making
# indices[title] return a Series for such titles. De-duplicate on the
# index instead so every title maps to exactly one row (its first one).
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [44]:
def get_recommendations(title, sig=sig, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in the module-level ``indices`` Series.
    sig : array-like, optional
        Square pairwise-similarity matrix whose rows and columns follow the
        row order of ``df``. Defaults to the ``sig`` matrix that existed
        when this function was defined.
    top_n : int, optional
        Maximum number of titles to return (default 10).

    Returns
    -------
    list of str
        Titles ordered from most to least similar; the queried movie itself
        is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # When the same title labels several rows the lookup yields a Series;
    # use the first matching row so ``sig[idx]`` selects a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)
    top_n = max(int(top_n), 0)

    # Exclude the queried movie by position instead of assuming it sorts
    # first: another movie with a tied score could otherwise take its place
    # and the query itself would leak into the results.
    sig_scores = [
        (position, score)
        for position, score in enumerate(sig[idx].flatten())
        if position != idx
    ]
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)[:top_n]

    # Translate row positions of the similarity matrix into df row labels.
    movie_indices = [df.index[i] for i, _ in sig_scores]

    return df.loc[movie_indices, 'title'].tolist()



In [48]:
def recommend_cold_start(fav_genres=None, fav_titles=None, top_n=10):
    """Recommend movie titles for a user without any rating history.

    Exactly one strategy is applied, chosen in this order:

    1. ``fav_genres`` given: movies whose ``genres`` text contains any of
       the given names (case-insensitive substring match), in descending
       title order.
    2. ``fav_titles`` given: neighbours of every favourite found in
       ``indices`` (via ``get_recommendations``), merged round-robin so
       each favourite contributes, without repeats and without the
       favourites themselves.
    3. Neither given: movies in descending title order.

    Parameters
    ----------
    fav_genres : str or iterable of str, optional
        Genre name(s) the user likes.
    fav_titles : str or iterable of str, optional
        Movie title(s) the user likes; only used when ``fav_genres`` is
        empty. Titles missing from ``indices`` are skipped.
    top_n : int, optional
        Maximum number of titles to return (default 10). For the
        title-based strategy the result is also bounded by how many
        neighbours ``get_recommendations`` returns per favourite.

    Returns
    -------
    list of str
        Recommended titles; may be empty (e.g. no favourite title is known).
    """
    # A bare string would otherwise be iterated character by character,
    # matching almost every genre / no title at all.
    if isinstance(fav_genres, str):
        fav_genres = [fav_genres]
    if isinstance(fav_titles, str):
        fav_titles = [fav_titles]
    top_n = max(int(top_n), 0)

    if fav_genres:
        wanted = [genre.lower() for genre in fav_genres]
        matches = df['genres'].apply(
            lambda g: any(name in g.lower() for name in wanted)
        )
        subset = df[matches]
        return subset.sort_values('title', ascending=False).head(top_n)['title'].tolist()

    if fav_titles:
        per_title = [
            get_recommendations(title)
            for title in fav_titles
            if title in indices
        ]

        # Take the best remaining neighbour of each favourite in turn;
        # simply concatenating the lists would let the first favourite's
        # neighbours fill the whole result.
        seen = set(fav_titles)
        unique_recs = []
        longest = max((len(recs) for recs in per_title), default=0)
        for rank in range(longest):
            for recs in per_title:
                if rank < len(recs) and recs[rank] not in seen:
                    seen.add(recs[rank])
                    unique_recs.append(recs[rank])

        return unique_recs[:top_n]

    return df.sort_values('title', ascending=False).head(top_n)['title'].tolist()


In [49]:
# Demonstrate the three cold-start modes in turn: favourite genres,
# favourite titles, and no preferences at all.
demo_preferences = (
    {'fav_genres': ["Action", "Drama"]},
    {'fav_titles': ["City Hall"]},
    {},
)
for preferences in demo_preferences:
    print(recommend_cold_start(**preferences))


['’Round Midnight', 'Æon Flux', 'xXx: State of the Union', 'xXx', 'eXistenZ', 'Zulu', 'Zorro, The Gay Blade', 'Zorba the Greek', 'Zoot Suit', 'Zombeavers']
['My Giant', 'Othello', 'Extreme Measures', 'Dracula: Dead and Loving It', 'Did You Hear About the Morgans?', 'Little Big League', 'Envy', 'Striptease', 'Misery', 'Absolute Power']
['’Round Midnight', 'Æon Flux', '¡Three Amigos!', 'xXx: State of the Union', 'xXx', 'loudQUIETloud: A Film About the Pixies', 'eXistenZ', '[REC]', 'Zulu', 'Zorro, The Gay Blade']
