#### Lab 01 - Content Based Recommender (Film Genre)

In [1]:
import pandas, warnings

warnings.filterwarnings("ignore")

In [2]:
origin = pandas.read_table("movies.csv", sep=",")

origin.iloc[:5]

Unnamed: 0,MovieID,Title,Genre
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
origin.tail()

Unnamed: 0,MovieID,Title,Genre
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy


In [4]:
null = origin.isnull().sum()

null.sum()

0

In [5]:
total = origin.shape[0]

total

9742

#### Maintain Genre Column Quality

In [6]:
# Change Genre Column Separator

splita = lambda val : " ".join(val.split("|"))

origin["Genre"] = origin["Genre"].apply(splita)

In [7]:
# Delete Film Without Genre

nolist = "(no genres listed)"

origin = origin[origin["Genre"] != nolist]

origin.iloc[:5]

Unnamed: 0,MovieID,Title,Genre
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


For Our Goal Using `TfidfVectorizer`, Replacing `Sci-Fi` to `SciFi`, `Film-Noir` to `Filnoir` is Mandatory

Others ? Resulting `Sci-Fi` to (`Sci` + `Fi`), `Film-Noir` to (`Film` + `Noir`)

In [8]:
# Replace Sci-Fi + Film-Noir

replacer = {"Sci-Fi":"SciFi", "Film-Noir":"Filnoir"}

for i, t in replacer.items():

  origin["Genre"] = origin["Genre"].str.replace(i, t)

origin.iloc[:5]

Unnamed: 0,MovieID,Title,Genre
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
# Find List of Genres

ListOfGenre = []

for insect in origin["Genre"].values:

  insect = insect.split(" ")

  for val in insect:

    if val not in ListOfGenre: ListOfGenre.append(val)

len(ListOfGenre)

19

There Are 19 Unique Genre on Our Film Table

In [10]:
# Check Our First Five Genres

ListOfGenre[:5]

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']

In [11]:
# Converting Genres to TF-IDF (Term Frequency-Inverse Document Frequency) Matrix

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words="english")

metrics = tfidf.fit_transform(origin["Genre"])

metrics.shape

(9708, 19)

Our TF-IDF Metrics Consist of 9708 Film and 19 Genres

In [12]:
# Understand our TF-IDF Metrics (1)

tfidf.get_feature_names_out().tolist()[:5]

['action', 'adventure', 'animation', 'children', 'comedy']

In [13]:
# Understand our TF-IDF Metrics (2)

len(tfidf.get_feature_names_out().tolist())

19

In [14]:
# Understand our TF-IDF Metrics (3)

assert len(tfidf.get_feature_names_out().tolist()) == len(ListOfGenre), "Fail !"

In [15]:
# Understand our TF-IDF Metrics (4)

assert metrics.todense().shape == metrics.shape, "Fail !"

In [16]:
# Check Genre : Comedy (5), Romance (15)

metrics.todense()[2]

matrix([[0.        , 0.        , 0.        , 0.        , 0.57070525,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.82115499,
         0.        , 0.        , 0.        , 0.        ]])

In [17]:
origin.iloc[2:3]

Unnamed: 0,MovieID,Title,Genre
2,3,Grumpier Old Men (1995),Comedy Romance


In [18]:
# Check Genre : Comedy (5), Drama (8), Romance (15)

metrics.todense()[3]

matrix([[0.        , 0.        , 0.        , 0.        , 0.50488626,
         0.        , 0.        , 0.46621628, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.72645182,
         0.        , 0.        , 0.        , 0.        ]])

In [19]:
origin.iloc[3:4]

Unnamed: 0,MovieID,Title,Genre
3,4,Waiting to Exhale (1995),Comedy Drama Romance


Perform Cosine Similarity to Measure Genres Similarity Between One Film to Another Film

In [20]:
# Use Cosine Similarity to Find Similarity of Our TF-IDF Metrics

from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(metrics)

similarities.shape

(9708, 9708)

In [21]:
# Check Similarities (1)

len(similarities[0])

9708

In [22]:
# Toy Story Similarities Result

similarities[0][:10]

array([1.        , 0.8136036 , 0.15259961, 0.13500041, 0.26738778,
       0.        , 0.15259961, 0.65470981, 0.        , 0.26241348])

In [23]:
# Jumanji Similarities Result

similarities[1][:10]

array([0.8136036 , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.80470368, 0.        , 0.32253235])

Our Movies, Jumanji, Have 81 % Similarity on Genres to Toy Story

In [24]:
# Helper Function

intler = lambda i : origin.iloc[i, 1]

titint = lambda t : origin[origin["Title"] == t].index.values[0]

In [25]:
# Helper Test

title = "Jumanji (1995)"

titint(title)

1

In [26]:
# Helper Test

intler(1)

'Jumanji (1995)'

#### Creating Main Function to Our Model !

In [27]:
similar_result = enumerate(similarities[int(titint(title))])

molist = list(similar_result)

len(molist)

9708

In [28]:
molist[:5]

[(0, 0.8136035975025231), (1, 1.0), (2, 0.0), (3, 0.0), (4, 0.0)]

In [29]:
molist = sorted(molist, key=lambda i:i[1], reverse=True)

molist[:5]

[(1, 1.0), (53, 1.0), (109, 1.0), (767, 1.0), (1514, 1.0)]

#### Main Model

In [30]:
# Create Main Model Output Function

limit = 10

title = "Jumanji (1995)"

def output(title, limit):

  morate = enumerate(similarities[int(titint(title))])
  molist = list(morate)
  morter = sorted(molist, key=lambda i:i[1], reverse=True)

  molist = morter[:limit]
  molter = filter(lambda i : i[0] != titint(title), molist)
  molist = list(molter)

  titles = [intler(molist[i][0]) for i in range(len(molist))]
  result = origin[origin["Title"].isin(titles)]
  result["Similarity"] = [molist[i][1] for i in range(len(molist))]

  return result

output(title, limit)

Unnamed: 0,MovieID,Title,Genre,Similarity
53,60,"Indian in the Cupboard, The (1995)",Adventure Children Fantasy,1.0
109,126,"NeverEnding Story III, The (1994)",Adventure Children Fantasy,1.0
767,1009,Escape to Witch Mountain (1975),Adventure Children Fantasy,1.0
1514,2043,Darby O'Gill and the Little People (1959),Adventure Children Fantasy,1.0
1556,2093,Return to Oz (1985),Adventure Children Fantasy,1.0
1617,2161,"NeverEnding Story, The (1984)",Adventure Children Fantasy,1.0
1618,2162,"NeverEnding Story II: The Next Chapter, The (1...",Adventure Children Fantasy,1.0
1799,2399,Santa Claus: The Movie (1985),Adventure Children Fantasy,1.0
3574,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure Children Fantasy,1.0


In [31]:
# First Harry Potter Film

title = origin.iloc[3574:3575]

title

Unnamed: 0,MovieID,Title,Genre
3574,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure Children Fantasy


In [32]:
# Test : Main Model Output

title = origin.iloc[3574, 1]

output(title, 5)

Unnamed: 0,MovieID,Title,Genre,Similarity
1,2,Jumanji (1995),Adventure Children Fantasy,1.0
53,60,"Indian in the Cupboard, The (1995)",Adventure Children Fantasy,1.0
109,126,"NeverEnding Story III, The (1994)",Adventure Children Fantasy,1.0
767,1009,Escape to Witch Mountain (1975),Adventure Children Fantasy,1.0
1514,2043,Darby O'Gill and the Little People (1959),Adventure Children Fantasy,1.0


#### First Time User

`Cold Start Problem`

Let's Try Tackle Cold Start Problem

In [33]:
# Michael Recently Subscribe to Our Product

# Michael Choose Children and Fantasy Genres

option = ["Children", "Fantasy"]

option

['Children', 'Fantasy']

In [34]:
# Find Film to Recommend to Michael

def starter(option, limit):

  result = []

  ontari = origin["Genre"].unique()

  for item in ontari:
    # Split Genre
    spliter = item.split(" ")

    # Find Overlap Between Genre Input and Available Genre
    overlap = len(set(option) & set(spliter)) / len(spliter)
    if overlap > 0.5:
      result.append((item, overlap))

  # Sort Table Result
  result = sorted(result, key=lambda i:i[1], reverse=True)[:limit]
  result = [i[0] for i in result]
  result = origin[origin["Genre"].isin(result)].iloc[:limit]

  return result

starter(option, 5)

Unnamed: 0,MovieID,Title,Genre
1,2,Jumanji (1995),Adventure Children Fantasy
53,60,"Indian in the Cupboard, The (1995)",Adventure Children Fantasy
109,126,"NeverEnding Story III, The (1994)",Adventure Children Fantasy
209,243,Gordy (1995),Children Comedy Fantasy
301,343,"Baby-Sitters Club, The (1995)",Children


Michael Choose Gordy

Let's Recommend Other Film to Michael

In [35]:
# Recommend Other Film to Michael Based on Gordy

title = origin.iloc[209, 1]

output(title, 5)

Unnamed: 0,MovieID,Title,Genre,Similarity
313,355,"Flintstones, The (1994)",Children Comedy Fantasy,1.0
355,410,Addams Family Values (1993),Children Comedy Fantasy,1.0
637,810,Kazaam (1996),Children Comedy Fantasy,1.0
649,837,Matilda (1996),Children Comedy Fantasy,1.0
