# Tag Based Cosine Filtering
- This will use a tag that the user may search for, in order to find a list of recommended movies
- This will also have to take the films' ratings into account

In [9]:
# Restore the `movies` variable from IPython's %store storage
# (the preprocessed frame comes from `preprocessing.ipynb`)
%store -r movies

- preprocessed movies are obtained from `preprocessing.ipynb`

In [10]:
# Display the restored DataFrame to confirm it loaded (pandas truncates long frames)
movies

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew,weighted_average
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",http://www.avatarmovie.com/,19995,"[culture clash, future, space war, space colon...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[Ingenious Film Partners, Twentieth Century Fo...",...,"[English, Español]",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,Sam Worthington Zoe Saldana Sigourney Weaver S...,James Cameron,7.049985
1,300000000,"[Adventure, Fantasy, Action]",http://disney.go.com/disneypictures/pirates/,285,"[ocean, drug abuse, exotic island, east india ...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[Walt Disney Pictures, Jerry Bruckheimer Films...",...,[English],Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,Johnny Depp Orlando Bloom Keira Knightley Stel...,Gore Verbinski,6.664988
2,245000000,"[Action, Adventure, Crime]",http://www.sonypictures.com/movies/spectre/,206647,"[spy, based on novel, secret agent, sequel, mi...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[Columbia Pictures, Danjaq, B24]",...,"[Français, English, Español, Italiano]",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,Daniel Craig Christoph Waltz Léa Seydoux Ralph...,Sam Mendes,6.239535
3,250000000,"[Action, Crime, Drama, Thriller]",http://www.thedarkknightrises.com/,49026,"[dc comics, crime fighter, terrorist, secret i...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.312950,"[Legendary Pictures, Warner Bros., DC Entertai...",...,[English],Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,Christian Bale Michael Caine Gary Oldman Anne ...,Christopher Nolan,7.345527
4,260000000,"[Action, Adventure, Science Fiction]",http://movies.disney.com/john-carter,49529,"[based on novel, mars, medallion, space travel...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,[Walt Disney Pictures],...,[English],Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,Taylor Kitsch Lynn Collins Samantha Morton Wil...,Andrew Stanton,6.097042
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4804,220000,"[Action, Crime, Thriller]",-1,9367,"[united states–mexico barrier, legs, arms, pap...",es,El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,[Columbia Pictures],...,[Español],Released,"He didn't come looking for trouble, but troubl...",El Mariachi,6.6,238,9367,Carlos Gallardo Jaime de Hoyos Peter Marquardt...,Robert Rodriguez,6.151336
4805,9000,"[Comedy, Romance]",-1,72766,[],en,Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,[],...,[],Released,A newlywed couple's honeymoon is upended by th...,Newlyweds,5.9,5,72766,Edward Burns Kerry Bishé Marsha Dietlein Caitl...,Edward Burns,6.093125
4806,0,"[Comedy, Drama, Romance, TV Movie]",http://www.hallmarkchannel.com/signedsealeddel...,231617,"[date, love at first sight, narration, investi...",en,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",1.444476,"[Front Street Pictures, Muse Entertainment Ent...",...,[English],Released,-1,"Signed, Sealed, Delivered",7.0,6,231617,Eric Mabius Kristin Booth Crystal Lowe Geoff G...,Scott Smith,6.096576
4807,0,[],http://shanghaicalling.com/,126186,[],en,Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,[],...,[English],Released,A New Yorker in Shanghai,Shanghai Calling,5.7,7,126186,Daniel Henney Eliza Coupe Bill Paxton Alan Ruck,Daniel Hsia,6.092164


- movies dataframe display and shape

In [11]:
import pandas as pd 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import euclidean_distances

# Inputs to the weighted-rating formula:
#   v - each movie's number of votes
#   R - each movie's average vote
#   m - the 0.9 quantile of the vote counts (the formula's vote-count constant)
#   C - the mean average vote across all movies
v = movies['vote_count']
R = movies['vote_average']
m = v.quantile(0.9)
C = R.mean()

# Blend each movie's own rating with the global mean: the more votes a movie
# has relative to m, the closer the result stays to R; otherwise it moves towards C.
movies['weighted_average'] = (R * v + C * m) / (v + m)

- Using feature extraction, the most related items to the movies are extracted and used within our search
- The following equation is then used to find a weighted average
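    - $\text{Weighted Average} = \dfrac{R \cdot v + C \cdot m}{v + m}$, where $R$ is the movie's `vote_average`, $v$ is its `vote_count`, $C$ is the mean `vote_average` over all movies and $m$ is the 0.9 quantile of `vote_count`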

In [12]:
# Rescale both ranking features to the [0, 1] range so they can be combined.
feature_columns = ['popularity', 'weighted_average']
scaler = MinMaxScaler()
scaled = scaler.fit_transform(movies[feature_columns])

# One row per movie, labelled by its original title.
weighted_df = pd.DataFrame(
    scaled,
    columns=feature_columns,
    index=movies['original_title'],
)

# Final score: 40% scaled weighted rating + 60% scaled popularity.
weighted_df['score'] = (
    0.4 * weighted_df['weighted_average']
    + 0.6 * weighted_df['popularity']
)

- scales the dataframe
- calculates a weighted score for each movie by combining the scaled 'weighted_average' (weighted by 0.4) and 'popularity' (weighted by 0.6) features
- these weights are used to strike a balance between both popularity and quality in the final 'score'
- this will reflect the trade-off between different aspects of movie performance 

In [13]:
# Show the movies ranked from highest to lowest combined score
weighted_df.sort_values('score', ascending=False)

Unnamed: 0_level_0,popularity,weighted_average,score
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Interstellar,0.827162,0.914048,0.861917
Minions,1.000000,0.397697,0.759079
Guardians of the Galaxy,0.549462,0.846272,0.668186
Deadpool,0.587690,0.708298,0.635933
Mad Max: Fury Road,0.495989,0.641577,0.554224
...,...,...,...
Independence Day: Resurgence,0.055707,0.086135,0.067878
Dragonball Evolution,0.024758,0.102317,0.055782
Batman & Robin,0.057189,0.038990,0.049910
The Boy Next Door,0.027595,0.077716,0.047643


In [14]:
# Warn if any of the text columns used to build the tags has a missing value
if movies[['title', 'tagline', 'cast', 'crew']].isna().any(axis=None):
    print("Warning: Null values found in the dataset.")

In [15]:
# Convert 'cast' and 'crew' to plain strings (list values are joined with spaces)
movies['cast'] = movies['cast'].apply(lambda x: ' '.join(map(str, x)) if isinstance(x, list) else str(x))
movies['crew'] = movies['crew'].apply(lambda x: ' '.join(map(str, x)) if isinstance(x, list) else str(x))

# Fill NaN values in 'tagline' column
movies['tagline'] = movies['tagline'].fillna('')

# Cast ONLY the text columns used below to str. Casting the whole frame would
# also turn numeric columns such as 'vote_average' and 'vote_count' into
# strings, which makes later sorts lexicographic ('9.0' ranks above '10.0')
# and breaks re-running the numeric cells above.
text_columns = ['title', 'tagline', 'cast', 'crew']
movies[text_columns] = movies[text_columns].astype(str)

# Concatenate text from 'title', 'tagline', 'cast', and 'crew' columns
movies['concatenated_text'] = (movies['title'] + ' ' +
                            movies['tagline'] + ' ' +
                            movies['cast'] + ' ' +
                            movies['crew'])

In [16]:
 # Initialize CountVectorizer with its built-in English stop-word list;
# each remaining token of the concatenated text becomes one "keyword" feature
vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary from the concatenated text and build the per-movie
# count matrix (one row per movie, one column per keyword)
keywords_matrix = vectorizer.fit_transform(movies['concatenated_text'])

# Get feature names (keywords), one per column of keywords_matrix
keywords = vectorizer.get_feature_names_out()

In [17]:
 # Convert the keyword count matrix to a DataFrame.
# Reuse movies' own index: the matrix rows are in the same order as the rows of
# `movies`, but a default 0..n-1 index would make the assignment below align by
# label and misassign (or leave NaN) tags whenever movies' index is not 0..n-1.
keywords_df = pd.DataFrame(keywords_matrix.toarray(), columns=keywords, index=movies.index)

# Add 'tags' column: the keywords that occur in each movie, separated by commas
movies['tags'] = keywords_df.apply(lambda row: ', '.join(keywords[row.to_numpy() > 0]), axis=1)

# Drop the helper 'concatenated_text' column (reassign instead of mutating in place)
movies = movies.drop(columns=['concatenated_text'])

In [22]:
def findRecommendeds(search_term):
    """Return the ten highest-rated movies whose tags contain ``search_term``.

    Reads the notebook-level ``movies`` DataFrame and does not modify it.

    Parameters
    ----------
    search_term : str
        Text to look for in the ``tags`` column. It is matched literally (not
        as a regular expression), case-insensitively, and as a substring, so
        ``'god'`` also matches the tag ``'godfather'``.

    Returns
    -------
    pandas.DataFrame or None
        Up to ten rows with the columns ``title``, ``vote_average`` and
        ``tags``, ordered from highest to lowest rating. ``None`` (after
        printing a message) when no movie matches.
    """
    # na=False treats rows without tags as non-matching, so nothing has to be
    # dropped from the shared frame. regex=False keeps terms such as 'c++'
    # from being parsed as (invalid) regular expressions.
    matches = movies['tags'].str.contains(search_term, case=False, regex=False, na=False)
    keyword_movies = movies[matches]

    if keyword_movies.empty:
        print("No movies found matching the keyword.")
        return None

    # Sort on the numeric value of the rating: the column may hold strings, and
    # a plain string sort would rank '9.0' above '10.0'.
    top_rated_keyword_movies = keyword_movies.sort_values(
        by='vote_average',
        ascending=False,
        key=lambda ratings: pd.to_numeric(ratings, errors='coerce'),
    ).head(10)
    return top_rated_keyword_movies[['title', 'vote_average', 'tags']]
    
# Example query: highest-rated movies whose tags contain 'love'
top_rated_keyword_movies = findRecommendeds('love')
top_rated_keyword_movies

Unnamed: 0,title,vote_average,tags
2975,There Goes My Baby,8.5,"akshay, deepika, deshmukh, dreams, dutta, fell..."
2766,Room,8.1,"abrahamson, allen, boundaries, bridgers, brie,..."
2291,Back to the Future,8.0,"born, christopher, crispin, fox, future, glove..."
3419,Incendies,7.9,"bellucci, bentivoglio, fabrizio, forgotten, ga..."
3576,Dallas Buyers Club,7.9,"david, donald, duplass, effect, evan, evil, ge..."
2003,Her,7.9,"adams, amy, joaquin, johansson, jonze, love, m..."
4223,Mr. Smith Goes to Washington,7.9,"charles, comes, esther, feldshuh, friedman, he..."
2553,The Theory of Everything,7.8,"changed, charlie, cox, eddie, emily, felicity,..."
4527,Beyond the Mat,7.8,"different, falls, gries, hicks, idaho, jon, ki..."
1563,Million Dollar Baby,7.7,"baby, baruchel, clint, dollar, dreams, eastwoo..."


In [24]:
# Example query: highest-rated movies whose tags contain 'god'
top_rated_keyword_movies = findRecommendeds('god')
top_rated_keyword_movies

Unnamed: 0,title,vote_average,tags
3872,Whiplash,8.3,"alexandre, city, dead, douglas, fernando, firm..."
2737,The Godfather: Part II,8.3,"al, coppola, diane, don, duvall, enemies, ever..."
3947,Oldboy,8.0,"bailee, cheatham, david, god, jeffrey, johnson..."
1759,The Blues Brothers,7.5,"aykroyd, belushi, blues, brothers, brown, cab,..."
1708,Die Hard,7.5,"40, alan, alexander, bedelia, bonnie, bruce, c..."
2951,What's Eating Gilbert Grape,7.5,"bible, bunce, christopher, cusick, daniel, god..."
3923,Red River,7.3,"drake, girlz, godfrey, jean, jimmy, joyful, li..."
3692,Pat Garrett & Billy the Kid,7.2,"brown, burke, devil, forgives, god, gordon, go..."
867,The Godfather: Part III,7.1,"al, andy, change, coppola, destiny, diane, ear..."
4345,Dr. No,6.9,"cronk, david, dead, god, harold, hart, hayley,..."
