# Home task 

1) Replicate the simple recommender implementation
2) (optional) Replicate the content-based recommender implementation

[Beginner Tutorial: Recommender Systems in Python](https://www.datacamp.com/community/tutorials/recommender-systems-python?utm_source=adwords_ppc&utm_campaignid=1455363063&utm_adgroupid=65083631748&utm_device=c&utm_keyword=&utm_matchtype=b&utm_network=g&utm_adpostion=&utm_creative=332602034358&utm_targetid=aud-748597547652:dsa-473406569915&utm_loc_interest_ms=&utm_loc_physical_ms=1012865&gclid=Cj0KCQjwsZKJBhC0ARIsAJ96n3XK-0y5uKGhO4w7V-A3nvj7WZlIg9NVQ8aeCLYKiEqhcb44rtw9qDoaAmeLEALw_wcB)

In [5]:
import pandas as pd

### Simple recommender

In [53]:
# Read the movies metadata CSV into a DataFrame
metadata = pd.read_csv(
    './dataset/movies_metadata.csv',
    low_memory=False,
)

# Display the first two rows
metadata.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


#### Weighted rating: $WR = \left(\dfrac{v}{v+m} \cdot R\right) + \left(\dfrac{m}{v+m} \cdot C\right)$
- v is the number of votes for the movie;

- m is the minimum votes required to be listed in the chart;

- R is the average rating of the movie;

- C is the mean vote across the whole report (i.e. over all movies in the dataset).

In [54]:
class SimpleRecommender:
    """Rank items with the IMDB weighted-rating formula.

    WR = v / (v + m) * R + m / (v + m) * C, where ``v`` is an item's vote
    count, ``R`` its average rating, ``m`` the minimum vote count required to
    qualify and ``C`` the mean rating over all items.
    """

    def __init__(self, m_name: str, C_name: str, result_col: str, quantile: float = 0.90):
        """Constructor

        Args:
            m_name (str): column holding the vote counts (``v``); the cutoff
                ``m`` is computed from it in `fit`.
            C_name (str): column holding the average ratings (``R``); the
                mean ``C`` is computed from it in `fit`.
            result_col (str): name of the column the weighted rating is
                written to.
            quantile (float, optional): quantile of the `m_name` column used
                as the minimum vote count ``m``. Defaults to 0.90.
        """
        self.m_name = m_name
        self.C_name = C_name
        self.result_col = result_col
        self.quantile = quantile

    def fit(self, X: pd.DataFrame) -> pd.DataFrame:
        """Compute ``C`` and ``m`` from `X` and return the ranked qualifying rows.

        Sets `self.C` and `self.m` as a side effect. `X` itself is not
        modified.

        Args:
            X (pd.DataFrame): frame containing the `m_name` and `C_name` columns.

        Returns:
            pd.DataFrame: copy of the rows whose vote count is at least
            ``m``, with the weighted rating stored in `result_col`, sorted by
            that column in descending order.
        """
        # Mean of the rating column over every row, qualified or not
        self.C = X[self.C_name].mean()

        # Minimum number of votes required to be in the chart
        self.m = X[self.m_name].quantile(self.quantile)

        # Filter first and copy only the surviving rows, so that the caller's
        # frame is untouched and the full frame is never duplicated
        qualified = X.loc[X[self.m_name] >= self.m].copy()

        # `weighted_rating` only uses column lookups and arithmetic, so it can
        # score the whole frame at once instead of row by row
        qualified[self.result_col] = self.weighted_rating(qualified)

        return qualified.sort_values(self.result_col, ascending=False)

    def weighted_rating(self, X):
        """Compute the weighted rating; requires `fit` to have set ``m`` and ``C``.

        Args:
            X: a single row (Series) or a whole DataFrame exposing the
                `m_name` and `C_name` columns.

        Returns:
            A scalar when `X` is a row, a Series when `X` is a DataFrame.
        """
        v = X[self.m_name]
        R = X[self.C_name]
        # Calculation based on the IMDB formula
        return (v / (v + self.m) * R) + (self.m / (self.m + v) * self.C)

In [55]:
# Score movies by vote count and average rating, storing the result in "score"
model = SimpleRecommender(
    m_name="vote_count",
    C_name="vote_average",
    result_col="score",
)
recommends = model.fit(metadata)

# Show the 20 highest-scoring movies
recommends.loc[:, ['title', 'vote_count', 'vote_average', 'score']].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


#### Model evaluation

In [56]:
from sklearn.metrics import r2_score

In [57]:
# Mean weighted score expressed as a percentage of the mean raw rating.
# NOTE(review): this is a ratio of two means rather than an accuracy in the
# usual sense — confirm the label is intended.
accuracy = recommends["score"].mean() / recommends["vote_average"].mean() * 100
print(f"Model accuracy: {round(accuracy, 2)}%")

# R² of the weighted score against the raw rating, as a percentage
r2_score_ = 100 * r2_score(y_true=recommends['vote_average'], y_pred=recommends['score'])
print(f"Model r2_score - {round(r2_score_, 2)}%")

Model accuracy: 96.89%
Model r2_score - 83.56%


### Content-based recommender

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [65]:
# Load Movies Metadata and keep only the first 20,000 rows
# (presumably to bound the size of the pairwise similarity matrix built
# later — confirm).
metadata = pd.read_csv('./dataset/movies_metadata.csv', low_memory=False)[:20000]

# Replace missing overviews with empty strings so that every entry is text
metadata['overview'] = metadata['overview'].fillna('')

# Show the plot overviews of the first 5 movies. This has to be the last
# expression of the cell: a bare expression in the middle of a cell is
# evaluated and its value discarded, so nothing would be displayed.
metadata['overview'].head()

In [66]:
class ContentBasedRecommender:
    """Recommend rows whose text is most similar to the text of a given row.

    Similarity is the cosine similarity between TF-IDF vectors of the
    `fit_on` column; rows are looked up by their `recommend_for_col` value.
    """

    def __init__(self, fit_on: str, recommend_for_col: str, stop_words: str = "english"):
        """Store the configuration used by `fit`.

        Args:
            fit_on (str): name of the text column the TF-IDF vectors are built from.
            recommend_for_col (str): name of the column used both to look a
                row up and to label the returned recommendations.
            stop_words (str, optional): passed to `TfidfVectorizer` as
                `stop_words`. Defaults to "english".
        """
        self.fit_on = fit_on
        self.stop_words = stop_words
        self.recommend_for_col = recommend_for_col

    def fit(self, X: pd.DataFrame):
        """Build the similarity matrix and the lookup table from `X`.

        Sets `self.cosine_sim`, `self.indices` and `self.data`.

        Args:
            X (pd.DataFrame): frame containing the `fit_on` and
                `recommend_for_col` columns.
        """
        # Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
        tfidf = TfidfVectorizer(stop_words=self.stop_words)

        # Missing text would make the vectorizer fail; treat it as an empty
        # document (a no-op when the caller has already filled the column)
        documents = X[self.fit_on].fillna('')

        # Construct the required TF-IDF matrix by fitting and transforming the data
        tfidf_matrix = tfidf.fit_transform(documents)

        # Compute the cosine similarity matrix. With the default arguments the
        # result is a dense (rows x rows) array, so memory grows
        # quadratically with the number of rows.
        self.cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

        # Construct a reverse map from lookup value to row position
        self.indices = self._build_indices(X)

        # if train succeed
        # define data
        self.data = X

    def _build_indices(self, X: pd.DataFrame) -> pd.Series:
        """Map each distinct `recommend_for_col` value to its first row position."""
        # Positions, not index labels: they are used to address rows of
        # `cosine_sim` and with `.iloc`, which are both positional.
        positions = pd.Series(range(len(X)), index=X[self.recommend_for_col])
        # Duplicates must be dropped on the lookup keys; the values are
        # unique by construction, so deduplicating them would remove nothing.
        return positions[~positions.index.duplicated(keep="first")]

    def get_recommendation(self, title: str, top_n: int = 10):
        """Return the `recommend_for_col` values of the rows most similar to `title`.

        Args:
            title (str): value of `recommend_for_col` identifying the query
                row; when several rows share it, the first one is used.
            top_n (int, optional): maximum number of recommendations. Defaults to 10.

        Returns:
            pd.Series: up to `top_n` entries, most similar first; the query
            row itself is never included.

        Raises:
            KeyError: if `title` was not present in the fitted data.
            ValueError: if `top_n` is negative.
        """
        if top_n < 0:
            raise ValueError("top_n must be non-negative")

        # Get the position of the row that matches the title
        try:
            idx = int(self.indices.loc[title])
        except KeyError:
            raise KeyError(
                f"{title!r} not found in column {self.recommend_for_col!r}"
            ) from None

        # Pairwise similarity scores of all rows with that row
        scores = self.cosine_sim[idx]

        # Exclude the query row explicitly rather than assuming it sorts
        # first: with tied scores (for example an all-zero vector from an
        # empty text) another row can precede it.
        candidates = [pos for pos in range(len(scores)) if pos != idx]

        # Stable descending sort: rows with equal scores keep their original order
        candidates.sort(key=lambda pos: scores[pos], reverse=True)

        # Return the top most similar rows
        return self.data[self.recommend_for_col].iloc[candidates[:top_n]]
        

In [67]:
# Fit the content-based model on plot overviews; rows are looked up by title
model = ContentBasedRecommender(
    fit_on="overview",
    recommend_for_col="title",
)
model.fit(metadata)

In [68]:
# Titles whose overview is most similar to that of 'The Dark Knight Rises'
model.get_recommendation(title='The Dark Knight Rises')

12481                            The Dark Knight
150                               Batman Forever
1328                              Batman Returns
15511                 Batman: Under the Red Hood
585                                       Batman
9230          Batman Beyond: Return of the Joker
18035                           Batman: Year One
19792    Batman: The Dark Knight Returns, Part 1
3095                Batman: Mask of the Phantasm
10122                              Batman Begins
Name: title, dtype: object

In [69]:
# Titles whose overview is most similar to that of 'The Godfather'
model.get_recommendation(title='The Godfather')

1178      The Godfather: Part II
1914     The Godfather: Part III
11297           Household Saints
10821                   Election
17729          Short Sharp Shock
8653                Violent City
13177               I Am the Law
6711                    Mobsters
6977             Queen of Hearts
18224                  Miss Bala
Name: title, dtype: object

In [70]:
# Titles whose overview is most similar to that of "Household Saints"
model.get_recommendation(title="Household Saints")

12062            December Boys
1151          Paris is Burning
5873                 WiseGirls
4806                      Fame
1120             Love In Bloom
16634                Hadewijch
8369                 Show Boat
6186     It Runs in the Family
834              The Godfather
18093            Higher Ground
Name: title, dtype: object