# Data mining Project1 
### Αντώνιος Καρβέλας sdi1600060 Μηνάς Μαρίος Σωτηρίου sdi1700156

# Part 1

Necessary imports.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances

Read datasets.

In [None]:
# Load the three CSV files the notebook works with:
# IMDb ratings, IMDb movies and the Netflix catalogue.
ratings = pd.read_csv(filepath_or_buffer='../input/mydata/IMDb ratings.csv')
movies = pd.read_csv(filepath_or_buffer='../input/mydata/IMDb movies.csv')
titles = pd.read_csv(filepath_or_buffer='../input/mydata/netflix_titles.csv')

Preview head of datasets.

In [None]:
# Display the first five rows of the Netflix catalogue.
titles.head(n=5)

In [None]:
# Display the first five rows of the IMDb movies table.
movies.head(n=5)

In [None]:
# Display the first five rows of the IMDb ratings table.
ratings.head(n=5)

### Question 1
Number of movies and number of tv shows.

In [None]:
# Count how many catalogue entries there are of each "type",
# draw the counts as a bar chart and print the same numbers.
type_counts = titles["type"].value_counts()
type_counts.plot.bar()
print(type_counts)

### Question 2
Both movies and TV shows show an increase in numbers over the years, but movies are still somewhat more numerous.

In [None]:
# Parse "date_added" into datetimes; `dates` is kept as a notebook-level name.
dates = pd.to_datetime(titles["date_added"])

# Work on a copy whose "date_added" column holds only the year.
q2 = titles.copy()
q2['date_added'] = dates.dt.year
q2 = q2.groupby(["date_added", "type"])

# Count show ids per (year, type) and pivot the types into columns,
# which yields one line per content type.
added_per_year = q2['show_id'].count().unstack()
fig, ax = plt.subplots()
added_per_year.plot(ax=ax)
plt.show()

### Question 3
Top 10 countries with most content.

In [None]:
# Question 3: count titles per producing country and chart the ten largest.
q3 = titles.copy()
# Split the comma-separated "country" string into a list of names.
# A raw string keeps the regex escapes from being read as string escapes.
q3['country'] = q3.country.str.strip().str.split(r"\s*[\,]\s*").tolist()
original_columns = [str(i) for i in q3.columns]

# Add one indicator column per country: 1 if the title lists that country.
q3 = q3.join(q3.country.str.join('|').str.get_dummies())

# Sum every indicator column (anything that is not an original column).
countries_count = {
    column: q3[column].sum()
    for column in q3.columns
    if column not in original_columns
}

# Rank by count, largest first. sorted() is stable, so ties keep column
# order, and slicing copes with fewer than ten countries. Unlike a
# max()/del loop, countries_count itself is left intact.
top_countries = sorted(
    countries_count.items(), key=lambda item: item[1], reverse=True
)[:10]
countries_labels = [name for name, _ in top_countries]
countries_large = [total for _, total in top_countries]

plt.figure(figsize=(15, 5))
plt.title("Top 10 countries with most content:")
plt.bar(countries_labels, countries_large)

### Question 4
Genres per country.

In [None]:
# Question 4: for every country, chart its (up to) ten most frequent genres.
# The country indicator columns are the ones added to q3 in Question 3.
unique_countries = [str(i) for i in q3.columns if i not in original_columns]
q4 = q3.copy()
# Split the comma-separated "listed_in" string into a list of genres.
# A raw string keeps the regex escapes from being read as string escapes.
q4['listed_in'] = q4.listed_in.str.strip().str.split(r"\s*[\,]\s*").tolist()
# Guarded so that re-running the cell does not keep growing the list.
if 'listed_in' not in original_columns:
    original_columns.append('listed_in')
# Add one indicator column per genre: 1 if the title is listed in that genre.
q4 = q4.join(q4.listed_in.str.join('|').str.get_dummies())

# Genre columns are whatever is neither an original nor a country column.
country_names = set(unique_countries)
genre_columns = [
    column for column in q4.columns
    if column not in original_columns and column not in country_names
]

for a in unique_countries:
    # Per-genre totals over the titles that list country `a`.
    genre_totals = q4.loc[q4[a] == 1, genre_columns].sum()
    # Keep genres that occur at least once, largest first; the stable sort
    # leaves ties in column order.
    ranked = sorted(
        ((genre, total) for genre, total in genre_totals.items() if total > 0),
        key=lambda item: item[1],
        reverse=True,
    )[:10]
    genres_labels = [genre for genre, _ in ranked]
    genres_large = [total for _, total in ranked]

    plt.figure(figsize=(30, 5))
    plt.title(f"{a} genres:")
    plt.bar(genres_labels, genres_large)

### Question 5
Top cast per country for movies and tv shows.
Some graphs are of course empty.

In [None]:
# Question 5: for every country, chart the (up to) ten most credited cast
# members, separately for TV shows and for movies.

# Not used below; kept because it is a notebook-level name.
unique_cast = [str(i) for i in q3.columns if i not in original_columns]
q5 = q3.copy()
# Turn the comma-separated "cast" string into a list so it can be iterated.
# A raw string keeps the regex escapes from being read as string escapes.
q5['cast'] = q5.cast.str.strip().str.split(r"\s*[\,]\s*").tolist()
print(q5['cast'])
# A missing cast becomes "", which iterates as "no cast members".
q5.fillna("", inplace=True)


def _top_names(name_lists, limit=10):
    """Return the `limit` most frequent names as (name, count) pairs.

    `name_lists` is an iterable whose items are themselves iterables of
    names. Ties keep first-seen order because sorted() is stable.
    """
    tally = {}
    for names in name_lists:
        for name in names:
            tally[name] = tally.get(name, 0) + 1
    return sorted(tally.items(), key=lambda item: item[1], reverse=True)[:limit]


for a in unique_countries:
    in_country = q5[a] == 1
    # Same chart twice: first TV shows, then movies.
    for content_type, label in (("TV Show", "shows"), ("Movie", "movies")):
        casts = q5.loc[(q5['type'] == content_type) & in_country, 'cast']
        ranked = _top_names(casts)
        plot_labels = [name for name, _ in ranked]
        plot_large = [count for _, count in ranked]

        plt.figure(figsize=(30, 5))
        plt.title(f"{a} top actors in {label}:")
        plt.bar(plot_labels, plot_large)
    
        
                


### Question 6
Content per age target group.

In [None]:
# Split the catalogue into four audience groups according to "rating".
rating_column = titles['rating']
Little_Kids = titles[rating_column.isin(['G', 'TV-Y', 'TV-G'])]
Older_Kids = titles[rating_column.isin(['PG', 'TV-Y7', 'TV-Y7-FV', 'TV-PG'])]
Teens = titles[rating_column.isin(['PG-13', 'TV-14'])]
Mature = titles[rating_column.isin(['R', 'NC-17', 'TV-MA'])]

plt.title("type of content in Netflix:")

categories = ['Little Kids', 'Older Kids', 'Teens', 'Mature']
# len() of a dataframe is its number of rows
count_in_categories = [len(group) for group in (Little_Kids, Older_Kids, Teens, Mature)]

plt.bar(categories, count_in_categories)


### Question 7
Content added per month.

In [None]:
# Question 7: how much content was added in each month of a given year.
q7 = titles.copy()
q7['date_added'] = dates  # `dates` holds the parsed "date_added" values

# One chart per year for 2015 through 2020; change the range to look at
# other years.
for year in range(2015, 2021):
    content_year = q7[q7['date_added'].dt.year == year]

    # Count the rows per month in one vectorised step, then fill in every
    # month from 1 to 12 so that months without content show up as 0 and
    # all charts share the same x axis.
    month_counts = content_year['date_added'].dt.month.value_counts()
    content_per_month = {
        month: int(month_counts.get(month, 0)) for month in range(1, 13)
    }

    plt.title(f"content per month in {year}")
    plt.bar(list(content_per_month.keys()), list(content_per_month.values()))
    plt.show()

### Question 8
Content genres in Netflix.

In [None]:
# Question 8: total number of titles per genre across the whole catalogue.
# Reuses the genre indicator columns that Question 4 added to q4.
q8 = q4

# Genre columns are whatever is neither an original nor a country column.
genre_columns = [
    column for column in q8.columns
    if column not in original_columns and column not in unique_countries
]
genre_totals = q8[genre_columns].sum()
# Keep only genres that occur at least once.
shows = {genre: total for genre, total in genre_totals.items() if total > 0}

plt.figure(figsize=(30, 25))
# No plt.legend() here: nothing in this chart carries a label, so a legend
# call would have nothing to display.
plt.title("genres of content in Netflix:")
plt.barh(list(shows.keys()), list(shows.values()))


### Question 9
Top directors per country.

In [None]:
# Question 9: for every country, chart its (up to) ten most credited directors.
# Reuses the country indicator columns that Question 3 added to q3.
q9 = q3.copy()
# Split the comma-separated "director" string into a list of names.
# A raw string keeps the regex escapes from being read as string escapes.
q9['director'] = q9.director.str.strip().str.split(r"\s*[\,]\s*").tolist()
# A missing director becomes "", which iterates as "no directors".
q9.fillna("", inplace=True)

for a in unique_countries:
    directors = q9[q9[a] == 1]
    # Tally how many titles of this country each director is credited on.
    directors_number = {}
    for credited in directors['director']:
        for director in credited:
            directors_number[director] = directors_number.get(director, 0) + 1

    # Largest first; the stable sort keeps ties in first-seen order and the
    # slice copes with fewer than ten directors.
    ranked = sorted(
        directors_number.items(), key=lambda item: item[1], reverse=True
    )[:10]
    plot_labels = [name for name, _ in ranked]
    plot_large = [count for _, count in ranked]

    plt.figure(figsize=(30, 5))
    plt.title(f" top directors in {a}")
    plt.bar(plot_labels, plot_large)

### Question 10
Shows with the same number of seasons. Most have only one or two seasons, of course.

In [None]:
# Question 10: how many TV shows have each number of seasons.
tv_shows = titles[titles['type'] == 'TV Show']


def _season_count(duration):
    """Return the number of seasons described by a "duration" value.

    A string containing "Season" (which also covers "Seasons") is expected
    to start with the number. Anything else, including a missing value,
    counts as a single season.
    """
    if isinstance(duration, str) and "Season" in duration:
        return int(duration.split()[0])
    return 1


# Map: number of seasons -> number of shows with that many seasons.
tv_shows_season = {}
for duration in tv_shows['duration']:
    number_of_seasons = _season_count(duration)
    tv_shows_season[number_of_seasons] = tv_shows_season.get(number_of_seasons, 0) + 1

# Plot in ascending order of season count.
output_x = sorted(tv_shows_season)
output_y = [tv_shows_season[seasons] for seasons in output_x]
plt.title("Shows with the same number of seasons")
plt.bar(output_x, output_y)

### Question 11
Top 20 Netflix titles by IMDb weighted average vote.

In [None]:
# Question 11 (part 1): attach an IMDb rating to every Netflix title that has
# an exact title match in the IMDb movies table.
netflix_titles = titles['title']
imdb_movies = movies[['title', 'imdb_title_id']]
imdb_ratings = ratings[['weighted_average_vote', 'imdb_title_id']]

# Build two lookup tables once instead of scanning the IMDb tables again for
# every Netflix title. setdefault() keeps the FIRST row seen for a key, which
# is the row a filter followed by .iloc[0] would pick.
first_id_by_title = {}
for imdb_title, imdb_id in zip(imdb_movies['title'], imdb_movies['imdb_title_id']):
    if isinstance(imdb_title, str):  # skip missing titles
        first_id_by_title.setdefault(imdb_title, imdb_id)

first_vote_by_id = {}
for imdb_id, vote in zip(imdb_ratings['imdb_title_id'], imdb_ratings['weighted_average_vote']):
    first_vote_by_id.setdefault(imdb_id, vote)

# Netflix title -> IMDb weighted average vote.
netflix_rankings = {}
for movie in netflix_titles:
    movie_id = first_id_by_title.get(movie)
    if movie_id is None:
        continue  # no IMDb movie with exactly this title
    vote = first_vote_by_id.get(str(movie_id))
    if vote is None:
        continue  # matched movie has no rating row
    netflix_rankings[movie] = float(vote)

# With the lookup tables this cell needs a single pass over each table, so no
# progress bar is shown.

In [None]:
# Question 11 (part 2): plot the (up to) 20 best-rated matched titles.
# The ranking is read from netflix_rankings without modifying it, so this
# cell can be re-run and always shows the same top 20.
top_rated = sorted(
    netflix_rankings.items(), key=lambda item: item[1], reverse=True
)[:20]
plot_labels = [title for title, _ in top_rated]
plot_large = [rating for _, rating in top_rated]

plt.figure(figsize=(30, 5))
plt.title("top movies in netflix")
plt.barh(plot_labels, plot_large)

# Part 2

In [None]:
# Load the Netflix catalogue again as the working table for Part 2.
content = pd.read_csv(filepath_or_buffer='../input/mydata/netflix_titles.csv')

Fill NaNs and join string columns for later vectorization.

In [None]:
# Replace missing values with empty strings so the columns can be joined.
content = content.fillna('')

# "text" is the show id, title and description of a row separated by spaces.
text_parts = content[['show_id', 'title', 'description']]
content['text'] = [
    ' '.join(parts) for parts in text_parts.itertuples(index=False, name=None)
]
content.head()

Initialize bag of words.

In [None]:
# Binary bag-of-words over unigrams and bigrams; terms found in fewer than
# 1% or more than 99% of the documents are dropped.
bowVectorizer = CountVectorizer(
    binary=True,
    ngram_range=(1, 2),
    min_df=0.01,
    max_df=0.99,
)
# Fit on the "text" column and keep the result as a dense array.
bow_sparse = bowVectorizer.fit_transform(content['text'])
bow = bow_sparse.toarray()

Initialize TF-IDF.

In [None]:
# TF-IDF over unigrams and bigrams; terms found in more than 99% of the
# documents are dropped. The result stays in the form fit_transform returns.
tfidfVectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_df=0.99,
)
tfidf = tfidfVectorizer.fit_transform(content['text'])

Find the most similar movies using the bag of words. The Jaccard pairwise distance does not work with sparse matrices, so the bag of words has to be converted to a dense array, which makes the process slow and inaccurate.

In [None]:
# Jaccard similarity between every pair of rows of the bag-of-words array
# (similarity = 1 - distance).
similarities = 1 - pairwise_distances(bow, metric="jaccard")

# title -> titles of the 100 highest-similarity rows, best first
bow_most_similar = {}
for index, title in content['title'].items():
    # argsort is ascending, so reverse it and keep the first 100 positions.
    top100Ind = similarities[index].argsort()[::-1][:100]
    bow_most_similar[title] = content['title'].iloc[top100Ind].tolist()

Find most similar movies using TF-IDF.

In [None]:
# Cosine similarity between every pair of rows of the TF-IDF matrix.
similarities = cosine_similarity(tfidf)

# title -> titles of the 100 highest-similarity rows, best first
tfidf_most_similar = {}
for index, title in content['title'].items():
    # argsort is ascending, so reverse it and keep the first 100 positions.
    top100Ind = similarities[index].argsort()[::-1][:100]
    tfidf_most_similar[title] = content['title'].iloc[top100Ind].tolist()

In [None]:
def get_similar_movies1(title, N, method):
    """Return up to N titles most similar to ``title``.

    Parameters:
        title: a key of the precomputed similarity tables.
        N: maximum number of titles to return.
        method: 'boolean' reads ``bow_most_similar``, 'tf-idf' reads
            ``tfidf_most_similar``.

    Returns:
        A list of titles, most similar first, or None for an unknown method.

    Raises:
        KeyError: if ``title`` is not in the chosen table.
    """
    if method == 'boolean':
        ranked = bow_most_similar[title]
    elif method == 'tf-idf':
        ranked = tfidf_most_similar[title]
    else:
        return None
    # Drop the query title by name rather than assuming it sits at rank 0:
    # when several entries tie on similarity it can land further down.
    # Other entries that carry exactly the same title are dropped as well.
    others = [candidate for candidate in ranked if candidate != title]
    return others[:max(N, 0)]

Test get similar movies with movie title.

In [None]:
# Ten titles closest to '#FriendButMarried' according to TF-IDF.
get_similar_movies1(title='#FriendButMarried', N=10, method='tf-idf')

In [None]:
def get_similar_movies2(sentence, N, method):
    """Return the N titles whose text is most similar to ``sentence``.

    Parameters:
        sentence: free text, vectorised with the already fitted vectorizer.
        N: number of titles to return.
        method: 'boolean' uses Jaccard similarity on the bag of words,
            'tf-idf' uses cosine similarity on the TF-IDF matrix.

    Returns:
        A list of titles, most similar first, or None for an unknown method.
    """
    if method == 'boolean':
        query = bowVectorizer.transform([sentence]).toarray()[0].astype(bool)
        documents = np.asarray(bow).astype(bool)
        # Jaccard similarity = |intersection| / |union|, computed for all
        # rows at once with NumPy.
        shared = (documents & query).sum(axis=1)
        combined = (documents | query).sum(axis=1)
        # Rows with an empty union (no terms on either side) score 0.
        simils = np.divide(
            shared, combined, out=np.zeros(len(documents)), where=combined > 0
        )
    elif method == 'tf-idf':
        sentenceVect = tfidfVectorizer.transform([sentence])
        # One similarity per document, flattened to a 1-D array.
        simils = np.asarray(cosine_similarity(tfidf, sentenceVect)).ravel()
    else:
        return None

    # argsort is ascending: reverse it and keep exactly N positions.
    best = simils.argsort()[::-1][:max(N, 0)]
    return content['title'].iloc[best].tolist()

Test get movies based on a sentence.

In [None]:
# Titles whose text is closest to the phrase 'married life' according to TF-IDF.
get_similar_movies2(sentence='married life', N=10, method='tf-idf')