In [3]:
# Content-based
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [68]:
###### helper functions. Use them when needed #######
def get_title_from_index(index):
	return df[df.index == index]["title"].values[0]

def get_index_from_title(title):
	return df[df.title == title]["index"].values[0]
##################################################

# Step 1: Read CSV File
df = pd.read_csv("tv_shows_dataset.csv")
# print(df.columns)
df

Unnamed: 0,index,title,Year Released,Content Rating,IMDB Rating,R Rating,Genre,Description,No of Seasons,Streaming Platform
0,0,Breaking Bad,2008,18+,9.5,100,"Crime,Drama","When Walter White, a New Mexico chemistry teac...",5Seasons,Netflix
1,1,Game of Thrones,2011,18+,9.3,99,"Action & Adventure,Drama",Seven noble families fight for control of the ...,8Seasons,"HBO MAX,HBO"
2,2,Rick and Morty,2013,18+,9.2,97,"Animation,Comedy",Rick is a mentally-unbalanced but scientifical...,4Seasons,"Free Services,HBO MAX,Hulu"
3,3,Stranger Things,2016,16+,8.8,96,"Drama,Fantasy","When a young boy vanishes, a small town uncove...",3Seasons,Netflix
4,4,The Boys,2019,18+,8.7,95,"Action & Adventure,Comedy",A group of vigilantes known informally as “The...,2Seasons,Prime Video
...,...,...,...,...,...,...,...,...,...,...
12348,12348,A Fishing Story with Ronnie Green,2017,,,-1,"2017,Prime Video",A Fishing Story with Ronnie Green has one or m...,2Seasons,"Prime Video,fuboTV"
12349,12349,CMT Most Shocking,2003,,,-1,-1,-1,-1,-1
12350,12350,NHL Road to the Outdoor Classics,2016,,,-1,"2016,Prime Video",Road to the NHL Outdoor Classics takes us deep...,1Season,"Prime Video,Epix"
12351,12351,Addy Media,2018,,,-1,"2018,Prime Video",Addy Media has one or more episodes streaming ...,1Season,Prime Video


In [43]:
print(df.columns)

Index(['index', 'Series Title', 'Year Released', 'Content Rating',
       'IMDB Rating', 'R Rating', 'Genre', 'Description', 'No of Seasons',
       'Streaming Platform'],
      dtype='object')


In [35]:
# Step 2: Select Features
features = ['Genre']
features

['Genre']

In [32]:
# df.isnull().sum()

index                    0
Series Title             0
Year Released            0
Content Rating        5121
IMDB Rating           2146
R Rating                 0
Genre                    0
Description              0
No of Seasons            0
Streaming Platform    1983
dtype: int64

In [36]:
for feature in features:
	df[feature] = df[feature].fillna('')

In [39]:
# Step 3: Create a column in DF which combines all selected features
def combine_features(row):
    try:
        return row['Genre']
    except:
        print("Error: ", row)

df["combined_features"] = df.apply(combine_features,axis=1)
print("Combined Features:\n", df["combined_features"].head())

Combined Features:
 0                  Crime,Drama
1     Action & Adventure,Drama
2             Animation,Comedy
3                Drama,Fantasy
4    Action & Adventure,Comedy
Name: combined_features, dtype: object


In [40]:
# Step 4: Create count matrix from this new combined column
cv = CountVectorizer()
count_matrix = cv.fit_transform(df["combined_features"])
print(count_matrix)

  (0, 98)	1
  (0, 104)	1
  (1, 104)	1
  (1, 81)	1
  (1, 82)	1
  (2, 85)	1
  (2, 97)	1
  (3, 104)	1
  (3, 106)	1
  (4, 81)	1
  (4, 82)	1
  (4, 97)	1
  (5, 98)	1
  (5, 104)	1
  (6, 104)	1
  (6, 118)	1
  (7, 98)	1
  (7, 81)	1
  (7, 82)	1
  (8, 98)	1
  (8, 104)	1
  (9, 81)	1
  (9, 82)	1
  (9, 85)	1
  (10, 97)	1
  :	:
  (12339, 133)	1
  (12339, 152)	1
  (12341, 74)	1
  (12341, 133)	1
  (12341, 152)	1
  (12342, 67)	1
  (12342, 110)	1
  (12342, 137)	1
  (12343, 75)	1
  (12343, 133)	1
  (12343, 152)	1
  (12344, 75)	1
  (12344, 122)	1
  (12348, 75)	1
  (12348, 133)	1
  (12348, 152)	1
  (12350, 74)	1
  (12350, 133)	1
  (12350, 152)	1
  (12351, 76)	1
  (12351, 133)	1
  (12351, 152)	1
  (12352, 72)	1
  (12352, 110)	1
  (12352, 137)	1


In [41]:
# Step 5: Compute the Cosine Similarity based on the count_matrix 
# gives similarity scores
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

[[1.         0.40824829 0.         ... 0.         0.         0.        ]
 [0.40824829 1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.66666667 0.        ]
 [0.         0.         0.         ... 0.66666667 1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [65]:
# i want to list the tv shows in descending order of their similarity scores
# 1.00 to 0.00
# we want convert list of elements into a list of ttuples
tvshow_user_likes = "Breaking Bad"
shows_index = get_index_from_title(tvshow_user_likes)
similar_shows = list(enumerate(cosine_sim[shows_index]))

In [66]:
# Step 7: Get a list of similar movies in descending order of similarity score
sorted_similar_shows = sorted(similar_shows, key=lambda x:x[1], reverse=True)[0:10]

In [67]:
# Step 8: Print titles of first 50 shows based on similarity score via genres
i = 0
print("People who watched "+tvshow_user_likes+" also liked:\n")
for element in sorted_similar_shows:
    print(get_title_from_index(element[0]))
    i = i + 1
    if i > 10:
        break

People who watched Breaking Bad also liked:

Breaking Bad
Dark
Fargo
The Wire
Peaky Blinders
The Sopranos
Mindhunter
Dexter
Narcos
True Detective


by IMDB

In [69]:
df["IMDB Rating"].unique()

array([9.5, 9.3, 9.2, 8.8, 8.7, 9.4, 9.1, 8.9, 8.4, 8.2, 8.5, 8. , 8.3,
       8.6, 9. , 8.1, 7.7, 7.6, 7.5, 7.9, 7.8, 7.4, 7.3, 7. , 6.8, 7.1,
       7.2, 6.4, 6.9, 6.3, 6.7, 6.6, 6.5, 5.8, 6.1, 5.5, 5.3, 6.2, 6. ,
       nan, 9.7, 5.7, 4.9, 5.4, 5.6, 3.6, 5.1, 5.9, 5. , 5.2, 4.6, 4.5,
       4.2, 2.5, 4.1, 4.3, 2.8, 4.8, 3.5, 3.4, 4. , 4.4, 3.2, 4.7, 3.1,
       3.8, 3.3, 2.6, 3.7, 2.9, 3.9, 3. , 1.6, 2. , 2.3, 1.8, 2.1, 2.4,
       1.7, 2.7, 1. , 2.2, 1.3, 1.9, 1.2])

In [70]:
# step 7 repeated but this time with 'sorted_similar_shows' = genre and IMDB
sort_by_IMDB = sorted(sorted_similar_shows,key=lambda x:df["IMDB Rating"][x[0]],reverse=True)
print(sort_by_IMDB)

[(0, 0.9999999999999998), (13, 0.9999999999999998), (26, 0.9999999999999998), (37, 0.9999999999999998), (8, 0.9999999999999998), (5, 0.9999999999999998), (22, 0.9999999999999998), (35, 0.9999999999999998), (33, 0.9999999999999998), (34, 0.9999999999999998)]


In [73]:
# Step 8: Print titles of first n shows based on similarity score via genres
# and again by IMDB
i=0
print("Suggesting top 5 TV Shows in order of Genres and IMDB rating:\n")
for element in sort_by_IMDB:
    print(get_title_from_index(element[0]))
    i=i+1
    if i>10:
        break



Suggesting top 5 movies in order of genres and IMDB rating:

Breaking Bad
The Wire
The Sopranos
True Detective
Fargo
Dark
Peaky Blinders
Narcos
Mindhunter
Dexter
