In [16]:
import pandas as pd
import numpy as np

## Innlesing og opprydding i data

In [17]:
# Load the raw IMDb 5000 movie metadata (path is relative to the notebook).
data = pd.read_csv('../../datasets/imdb-5000-movie-dataset/movie_metadata.csv')

# Columns needed for clustering and for ranking the recommendations.
to_use = ['genres', 'plot_keywords', 'movie_title', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name', 'imdb_score']
data_use = data[to_use].copy()
# Remove the non-breaking spaces found in the titles. The vectorised .str
# accessor leaves missing titles as NaN instead of raising on them, so they
# can be dropped together with the other incomplete rows afterwards.
data_use['movie_title'] = data_use['movie_title'].str.replace('\xa0', '', regex=False)

In [18]:
# Keep only complete rows, keep the first occurrence of each title, and
# renumber the rows 0..n-1 so positional and label indexing agree.
clean_data = (
    data_use
    .dropna(axis=0)
    .drop_duplicates(subset=['movie_title'])
    .reset_index(drop=True)
)

In [19]:
merge_columns = ['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']
# Turn each full name into a single token ("Tom Hardy" -> "Tom_Hardy"),
# one whole column at a time rather than row by row.
underscored_names = [
    clean_data[col].str.replace(' ', '_', regex=False) for col in merge_columns
]
# Join the four name columns of every movie into one '|'-separated string.
clean_data['people'] = underscored_names[0].str.cat(underscored_names[1:], sep='|')

clean_data.head()

Unnamed: 0,genres,plot_keywords,movie_title,actor_1_name,actor_2_name,actor_3_name,director_name,imdb_score,people
0,Action|Adventure|Fantasy|Sci-Fi,avatar|future|marine|native|paraplegic,Avatar,CCH Pounder,Joel David Moore,Wes Studi,James Cameron,7.9,CCH_Pounder|Joel_David_Moore|Wes_Studi|James_C...
1,Action|Adventure|Fantasy,goddess|marriage ceremony|marriage proposal|pi...,Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski,7.1,Johnny_Depp|Orlando_Bloom|Jack_Davenport|Gore_...
2,Action|Adventure|Thriller,bomb|espionage|sequel|spy|terrorist,Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes,6.8,Christoph_Waltz|Rory_Kinnear|Stephanie_Sigman|...
3,Action|Thriller,deception|imprisonment|lawlessness|police offi...,The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan,8.5,Tom_Hardy|Christian_Bale|Joseph_Gordon-Levitt|...
4,Action|Adventure|Sci-Fi,alien|american civil war|male nipple|mars|prin...,John Carter,Daryl Sabara,Samantha Morton,Polly Walker,Andrew Stanton,6.6,Daryl_Sabara|Samantha_Morton|Polly_Walker|Andr...


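Som variabler bruker vi indikatorer for om hver sjanger, hvert nøkkelord (keyword) og hver person er knyttet til filmen: 1 hvis den er det, og 0 ellers. For nøkkelord og personer tar vi bare med de 100 vanligste, og i koden under vektes personene dobbelt (multiplisert med 2).
Avstanden mellom to filmer blir dermed liten hvis de har mange av disse felles.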

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

def token(text):
    """Split a '|'-separated string into a list of its parts."""
    parts = text.split('|')
    return parts

def _indicator_features(column, prefix, max_features=None):
    """Encode a '|'-separated text column as 0/1 indicator features.

    Parameters
    ----------
    column : iterable of str
        One '|'-separated string per movie.
    prefix : str
        Text prepended to every feature name.
    max_features : int or None
        Forwarded to ``CountVectorizer``; ``None`` keeps every token.

    Returns
    -------
    tuple
        ``(vectorizer, matrix, names)``: the fitted vectorizer, the sparse
        indicator matrix and the list of prefixed feature names.
    """
    # binary=True gives 1 if the token is present and 0 otherwise, so a person
    # who both acts in and directs a movie is not counted twice.
    vectorizer = CountVectorizer(max_features=max_features, tokenizer=token, binary=True)
    matrix = vectorizer.fit_transform(column)
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when it exists and fall back to the old method on older installations.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    names = [prefix + name for name in get_names()]
    return vectorizer, matrix, names


cv_kw, keywords, keywords_list = _indicator_features(clean_data['plot_keywords'], 'kw_', max_features=100)
cv_ge, genres, genres_list = _indicator_features(clean_data['genres'], 'genres_')
cv_pp, people, people_list = _indicator_features(clean_data['people'], 'pp_', max_features=100)

# toarray() yields plain ndarrays; todense() yields np.matrix, which newer
# scikit-learn estimators refuse. The people block is multiplied by 2 so it
# weighs double in the distance.
cluster_data = np.hstack([keywords.toarray(), genres.toarray(), people.toarray() * 2])
criterion_list = keywords_list + genres_list + people_list

## Clustering med K-Means

In [21]:
from sklearn.cluster import KMeans

N_CLUSTERS = 100
KMEANS_SEED = 0  # fixed seed so a fresh Run All produces the same clusters

# n_init is pinned to 10 so the number of restarts does not depend on the
# installed scikit-learn version.
mod = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=KMEANS_SEED)
# np.asarray turns an np.matrix input into a plain ndarray before fitting.
category = mod.fit_predict(np.asarray(cluster_data))
# Cluster label per movie, indexed by title; rows are in the same order as clean_data.
category_dataframe = pd.DataFrame({'category': category}, index=clean_data['movie_title'])

In [22]:
# Show the movies that were assigned to cluster 0.
in_cluster_zero = (category_dataframe['category'] == 0).values
clean_data.loc[in_cluster_zero, ['genres', 'movie_title', 'people']]

Unnamed: 0,genres,movie_title,people
288,Animation|Drama|Family|Musical|Romance,The Hunchback of Notre Dame,Demi_Moore|Jason_Alexander|Bill_Fagerbakke|Gar...
481,Drama|Musical|Romance,Nine,Fergie|Elio_Germano|Andrea_Di_Stefano|Rob_Mars...
514,Comedy|Drama|Family|Music|Musical|Romance,Hairspray,Jerry_Stiller|Elijah_Kelley|Paul_Dooley|Adam_S...
649,Comedy|Drama|Family|Musical,Annie,Quvenzhané_Wallis|Dorian_Missick|David_Zayas|W...
850,Drama|Musical|Romance|Thriller,The Phantom of the Opera,Gerard_Butler|Minnie_Driver|Miranda_Richardson...
858,Drama|Music|Musical|Romance,Burlesque,Eric_Dane|Peter_Gallagher|David_Walton|Steve_A...
1232,Drama|Musical|Romance,Rent,Rosario_Dawson|David_Fine|Jesse_L._Martin|Chri...
1770,Drama|Musical|Romance,The Magic Flute,Kim-Marie_Woodhouse|Amy_Carson|Joseph_Kaiser|K...
2040,Comedy|Drama|Musical|Romance|War,Darling Lili,Rock_Hudson|Vernon_Dobtcheff|Jeremy_Kemp|Blake...
2111,Animation|Comedy|Drama|Family|Musical,The Tigger Movie,Kath_Soucie|John_Fiedler|Ken_Sansom|Jun_Falken...


## Anbefaling av filmer

Vi kan bruke `recommend`-funksjonen til å få filmer som ligner på den vi sender inn. Filmtitlene kommer ut sortert etter IMDb-score.

In [23]:
def recommend(movie_name, recommend_number=5, movies=None, clusters=None):
    """Return titles from the same cluster as ``movie_name``, best rated first.

    Parameters
    ----------
    movie_name : str
        Title to look up; it must match a value in ``movies['movie_title']``.
    recommend_number : int, default 5
        Maximum number of titles to return.
    movies : pandas.DataFrame, optional
        Frame with ``movie_title`` and ``imdb_score`` columns. Defaults to the
        notebook-level ``clean_data``.
    clusters : pandas.DataFrame, optional
        Frame indexed by movie title with a ``category`` column, with rows in
        the same order as ``movies``. Defaults to the notebook-level
        ``category_dataframe``.

    Returns
    -------
    list of str or None
        Up to ``recommend_number`` titles sorted by descending IMDb score,
        excluding ``movie_name`` itself. ``None`` (after printing a message)
        when the title is unknown.
    """
    if movies is None:
        movies = clean_data
    if clusters is None:
        clusters = category_dataframe

    if not movies['movie_title'].eq(movie_name).any():
        print('Can\'t find this movie!')
        return None

    movie_cluster = clusters.loc[movie_name, 'category']
    # `clusters` and `movies` share row order, so the mask is applied by position.
    same_cluster = (clusters['category'] == movie_cluster).values
    candidates = movies.loc[same_cluster, ['imdb_score', 'movie_title']]
    candidates = candidates[candidates['movie_title'] != movie_name]
    # A stable sort keeps equally rated movies in their original order.
    ranked = candidates.sort_values('imdb_score', ascending=False, kind='mergesort')
    # max(..., 0) keeps a negative count meaning "no recommendations";
    # head() alone would read it as "all but the last n".
    return ranked['movie_title'].head(max(recommend_number, 0)).tolist()

In [24]:
# Ask for up to ten movies from the same cluster as this title.
recommend(
    movie_name='Harry Potter and the Prisoner of Azkaban',
    recommend_number=10,
)

['The Lord of the Rings: The Return of the King',
 'The Lord of the Rings: The Fellowship of the Ring',
 'The Lord of the Rings: The Two Towers',
 'Princess Mononoke',
 'Monty Python and the Holy Grail',
 'Pirates of the Caribbean: The Curse of the Black Pearl',
 'The Princess Bride',
 'The Wizard of Oz',
 'The Hobbit: The Desolation of Smaug',
 'The Hobbit: An Unexpected Journey']

**Oppgave:** Synes du sortering på IMDb-rating er den beste måten å sortere output? Kommer du på et alternativ?

**Løsning:** f.eks. sortere etter avstand til filmen i clusteret.