In [2]:
# import libraries

import re
import nltk
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from ast import literal_eval
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.font_manager import FontProperties
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import plotly
import plotly.io as pio
from os import path
from PIL import Image
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

%matplotlib inline
warnings.simplefilter('ignore')
pd.set_option('display.max_columns', 50)

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

ModuleNotFoundError: No module named 'plotly'

## Google Drive mount -> for reading the files in Colab

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
path_metadata = '/content/drive/MyDrive/Colab Notebooks/archive/movies_metadata.csv'
path_credits = '/content/drive/MyDrive/Colab Notebooks/archive/credits.csv'
path_keyword = '/content/drive/MyDrive/Colab Notebooks/archive/keywords.csv'

Jupyter Notebook

In [None]:
path_metadata = '/Users/jules/Projects/Projects/lib/archive_movie_dataset/movies_metadata.csv'
path_credits = '/Users/jules/Projects/Projects/lib/archive_movie_dataset/credits.csv'
path_keyword = '/Users/jules/Projects/Projects/lib/archive_movie_dataset/keywords.csv'

In [None]:
#Loading the datasets
#metadata of the movies
md = pd.read_csv(path_metadata)
#movie credits
credits = pd.read_csv(path_credits) 
#movie keywords
keywords = pd.read_csv(path_keyword) 

### credits.csv

In [None]:
# Check the head of the data
# credits
credits.head()

In [None]:
#Converting the string into list of dictionaries
credits.cast = credits.cast.apply(literal_eval)
credits.crew = credits.crew.apply(literal_eval)

In [None]:
# Extracting the Casts into a list from Dictionaries
credits['cast'] = credits['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [None]:
# Extract the director's name from a movie's crew list
def extract_director(x):
    '''
    Returns the name of the first crew member whose job is 'Director'

    x : iterable of dicts, each read through its 'job' and 'name' keys

    Returns np.nan when no entry has the job 'Director', which includes
    an empty crew list.
    '''
    for crew_mem in x:
        # Keep scanning on a mismatch: the director is not necessarily the
        # first entry of the crew list.
        if crew_mem['job'] == 'Director':
            return crew_mem['name']
    return np.nan

# Derive the director column; movies without a director get an empty string.
# The filled Series is assigned back to the frame instead of calling an
# in-place method on a column selection, which pandas does not guarantee
# will update the parent frame.
credits['director'] = credits['crew'].apply(extract_director).fillna('')

In [None]:
credits.drop(['crew'],axis = 1,inplace = True)
credits.head()

### keywords.csv

In [None]:
keywords.head()

In [None]:
#Converting the string into list of dictionaries
keywords.keywords = keywords.keywords.apply(literal_eval)

In [None]:
# Extracting the Keywords into a list from Dictionaries
keywords['keywords'] = keywords['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
keywords.head()

### movies_metadata.csv

In [None]:
md.head()

In [None]:
md.describe(include = 'all')

In [None]:
md[(md.adult != "True") & (md.adult != "False")]

위 세 편의 영화는 데이터가 여러 열에 걸쳐 뒤섞여 있다. 
예를 들어 adult 열에 개요(overview)가 들어 있고 overview 열에는 상태(status) 값이 들어 있는 식이다. 실제로는 한 영화의 데이터가 연속된 두 개의 행(인덱스)으로 나뉘어 저장되어 있으므로, 두 행의 데이터를 하나의 행(인덱스)으로 합친 뒤 남는 다음 행(인덱스 + 1)을 삭제한다.

In [None]:
# Row labels whose record continues on the following row (label + 1).
idx = [19729,29502,35586]
# Destination columns in row i ...
lst_1 = ['popularity', 'poster_path', 'production_companies','production_countries', 'release_date', 'revenue',
         'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count']
# ... and, position by position, the columns of row i+1 that hold their values.
lst_2 = ['belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 
         'overview','popularity', 'poster_path', 'production_companies','production_countries', 'release_date']
for i in idx:
    for dst_col, src_col in zip(lst_1, lst_2):
        # One .loc call per cell: chained indexing (md[col][i] = ...) may
        # write to a temporary object and leave md unchanged.
        md.loc[i, dst_col] = md.loc[i + 1, src_col]

In [None]:
idx = [x+1 for x in idx]
md.drop(index = idx,inplace = True)

In [None]:
md.adult = md.adult.apply(lambda x : True if (x == 'True') else False)

장르 - 변환

In [None]:
# Extracting the Genres into a list from Dictionaris
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [None]:
# Dropping Duplicates 
credits.drop_duplicates('id',inplace = True)
keywords.drop_duplicates('id',inplace = True)
md.drop_duplicates('id',inplace = True)

In [None]:
#Converting IDs into same data type
md.id = md.id.astype(int)

### Merge movie metadata + credits + keyword

In [None]:
#Merging DataFrames into one
md = md.merge(credits, on = 'id', how = 'left')
md = md.merge(keywords, on = 'id', how = 'left')
md.head()

필요한 열 선택하기

In [None]:
# Selecting required columns from the master dataframe
movies = md[['id','original_title','title','cast', 'director', 'keywords', 'genres', 'release_date', 'overview', 
             'original_language', 'adult', 'runtime', 'tagline', 'vote_average', 'vote_count','popularity']]
movies.head(30)

In [None]:
# Missing Value
movies.isna().sum()

null value 제거

In [None]:
# Work on an independent copy so the assignments below change `movies`
# itself and not a selection taken from another frame.
movies = movies.copy()

# Text columns: missing values become empty strings.
# Results are assigned back because in-place methods called on a column
# selection are not guaranteed to update the frame.
movies['original_language'] = movies['original_language'].fillna('')
movies['tagline'] = movies['tagline'].fillna('')
movies['overview'] = movies['overview'].fillna('')
# The placeholder text counts as a missing overview
movies.loc[movies['overview'] == 'No overview found.', 'overview'] = ''
# Fill NA of runtime with 0
movies['runtime'] = movies['runtime'].fillna(0)

# Anything that is not a list (e.g. NaN) becomes an empty list
movies['cast'] = movies['cast'].apply(lambda x: x if isinstance(x, list) else [])
movies['director'] = movies['director'].fillna('')
movies['keywords'] = movies['keywords'].apply(lambda x: x if isinstance(x, list) else [])

# A missing release_date gets the placeholder 2050-01-01 so that the whole
# column can be converted to datetime objects
movies.loc[movies['release_date'].isna(), 'release_date'] = '2050-01-01'
movies['release_date'] = pd.to_datetime(movies['release_date'], format = '%Y-%m-%d')

In [None]:
movies.head()

## Data 처리

In [None]:
# Convert popularity to a numeric dtype, then keep the 20000 rows with the
# highest popularity and renumber them 0..n-1 (the old index is discarded).
movies["popularity"] = pd.to_numeric(movies["popularity"], downcast="float")
movies = (
    movies
    .sort_values(by='popularity', axis=0, ascending=False)[0:20000]
    .reset_index(drop=True)
)
movies.head(5)

In [None]:
# Combine overview and tagline into one plot text.
# A space separates the two parts so the last word of the overview and the
# first word of the tagline remain distinct tokens; strip() removes the
# stray space when either part is empty.
movies['plot_corpus'] = (movies['overview'] + ' ' + movies['tagline']).str.strip()

def listtostr(txt):
    '''
    Converts every element of the iterable to str and joins them with
    single spaces
    '''
    return ' '.join(map(str, txt))

# Flatten the keyword / genre lists into space-separated strings
movies['keywords'] = movies['keywords'].apply(listtostr)
movies['genres'] = movies['genres'].apply(listtostr)

#movies['plot_corpus_1'] = movies['overview'] + movies['tagline'] + movies['keywords']
# Keywords followed by genres. The explicit space keeps the last keyword and
# the first genre from being fused into one token.
movies['genre_corpus'] = (movies['keywords'] + ' ' + movies['genres']).str.strip()

Text 정리

점 제거,소문자 변환 등 

Lemmatization(표제어 추출): 단어를 기본 형태(표제어)로 변환하는 과정

In [None]:
def get_wordnet_pos(word):
    '''
    Tags a single word with nltk.pos_tag and maps the first letter of that
    tag to a WordNet part-of-speech constant (NOUN when there is no match)
    '''
    tagged = nltk.pos_tag([word])
    first_letter = tagged[0][1][0].upper()
    pos_by_letter = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return pos_by_letter.get(first_letter, wordnet.NOUN)

lemmatizer=WordNetLemmatizer()

# Punctuation stripped from the text. The triple-quoted literal keeps the
# double quote inside the character class; written as r"[...""-]" the
# pattern is split into two adjacent string literals and the double quote
# is silently lost.
_PLOT_PUNCTUATION = re.compile(r"""[!@%&;?',."-]""")

# Filled on first use so that the stop-word list is built once instead of
# once per token.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    '''
    Returns the NLTK English stop words as a frozenset, built on first use
    '''
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def clean_plot(txt):
    '''
    Returns the cleaned plot text

    Steps: strip punctuation, lower-case, drop English stop words, tokenize
    with nltk.word_tokenize and lemmatize every token using the POS reported
    by get_wordnet_pos. The tokens are returned joined by single spaces.
    '''
    stop_words = _english_stopwords()
    txt_clean = _PLOT_PUNCTUATION.sub('', txt).lower()
    kept_words = [word for word in txt_clean.split(' ') if word not in stop_words]
    word_list = nltk.word_tokenize(' '.join(kept_words))
    return ' '.join(lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in word_list)

def clean_cast(txt):
    '''
    Returns a new list with every cast name turned into a single token

    For each name: '.', ',' and "'" are removed, '-' becomes a space, spaces
    become '_' and the result is lower-cased. The input list is left
    untouched, so list objects shared with other frames are not modified.
    '''
    cleaned = []
    for name in txt:
        name = re.sub(r"[.,']", "", name)
        name = re.sub(r"[-]", " ", name)
        name = re.sub(" ", "_", name)
        cleaned.append(name.lower())
    return cleaned

def clean_director(txt):
    '''
    Normalises a director name into one lower-case token: '.', ',' and "'"
    are dropped, '-' is turned into a space and spaces into '_'
    '''
    substitutions = (
        (r"[.,']", ""),
        (r"[-]", " "),
        (" ", "_"),
    )
    result = txt
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.lower()

In [None]:
movies['plot_corpus'] = movies['plot_corpus'].apply(clean_plot)
movies['genre_corpus'] = movies['genre_corpus'].apply(clean_plot)
movies['genre_pure'] = movies['genres'].apply(clean_plot)

In [None]:
movies['genre_pure']

In [None]:
movies['cast'] = movies['cast'].apply(clean_cast)
movies['cast'] = movies['cast'].apply(listtostr)
movies['director'] = movies['director'].apply(clean_director)

In [None]:
# Append the cast tokens to the genre corpus, then the plot text to build
# the mixed corpus. The explicit space keeps the last token of one part from
# being fused with the first token of the next; strip() removes the stray
# space when a part is empty.
movies['genre_corpus'] = (movies['genre_corpus'] + ' ' + movies['cast']).str.strip()
movies['mixed_corpus'] = (movies['genre_corpus'] + ' ' + movies['plot_corpus']).str.strip()

## 벡터화

텍스트에 대한 추가 작업을 수행하기 위해서는 수치 기계 학습을 적용할 수 있도록 문서를 벡터 표현으로 변환해야 한다.

CountVectorizer: 단어 빈도를 카운트.

TfidfVectorizer: 값은 카운트에 비례하여 증가하지만 말뭉치 내의 단어 빈도에 반비례.

In [None]:
# min_df = 1 keeps every term that occurs in at least one document, i.e. the
# full vocabulary. An integer min_df of 0 selects the same vocabulary but is
# rejected by the parameter validation of recent scikit-learn releases
# (an int must be >= 1; only a float may be 0.0).
tf = TfidfVectorizer(analyzer = 'word', ngram_range = (1,2), min_df = 1, stop_words = 'english')
cv = CountVectorizer(analyzer = 'word', ngram_range = (1,2), min_df = 1, stop_words = 'english')

# fit_transform refits the vectorizer each time; only the returned
# document-term matrices are kept.
plot_vector = tf.fit_transform(movies['plot_corpus'])
genre_vector = cv.fit_transform(movies['genre_corpus'])
cast_vector = cv.fit_transform(movies['cast'])
director_vector = cv.fit_transform(movies['director'])
genre_only_vector = cv.fit_transform(movies['genre_pure'])

벡터화된 문서의 유사도 점수를 계산

cosine_similarity 이용

: 벡터 입력에 대한 쌍별(pairwise) 코사인 유사도 행렬을 반환한다. 이 행렬을 데이터프레임에 저장하면 각 행은 특정 영화와 다른 모든 영화 사이의 유사도 점수를 나타낸다.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity of every document with every other document,
# wrapped in a DataFrame with the default integer row / column labels.
plot_score = pd.DataFrame(cosine_similarity(plot_vector, plot_vector))
genre_score = pd.DataFrame(cosine_similarity(genre_vector, genre_vector))
cast_score = pd.DataFrame(cosine_similarity(cast_vector, cast_vector))
director_score = pd.DataFrame(cosine_similarity(director_vector, director_vector))
genre_only_score = pd.DataFrame(cosine_similarity(genre_only_vector, genre_only_vector))

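여러 이용자가 매긴 평점의 단순 평균은 영화마다 투표 수(vote_count)가 크게 다르기 때문에 편향된 결과를 줄 수 있다. 따라서 가중 평점(weighted rating) 공식을 사용한다: $WR = \frac{v}{v+m} R + \frac{m}{v+m} C$ (v: 해당 영화의 투표 수, m: 투표 수의 95% 분위수, R: 해당 영화의 평균 평점, C: 전체 영화의 평균 평점).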

In [None]:
# Vote counts as integers; rows without a vote count are left out.
vote_counts = movies['vote_count'].dropna().astype('int')
# Vote averages stay floats: casting them to int would truncate every
# rating (7.9 -> 7) and bias the global mean C downwards.
vote_averages = movies['vote_average'].dropna()
# C: mean vote average over all movies
C = vote_averages.mean()
# m: 95th percentile of the vote counts
m = vote_counts.quantile(0.95)

print(C,m)

def weighted_rating(x):
    '''
    Returns the weighted rating of one movie row: its own vote average and
    the global mean C, weighted by its vote count relative to m
    (C and m are read from the notebook namespace)
    '''
    votes = x['vote_count']
    average = x['vote_average']
    own_part = votes/(votes+m) * average
    global_part = m/(m+votes) * C
    return own_part + global_part

movies['wr'] = movies.apply(weighted_rating, axis=1)

이렇게 계산한 가중 평점(wr)은 이후 장르가 비슷한 영화들 사이에서 추천 순위를 정할 때 유사도 점수와 함께 정렬 기준으로 사용한다.

In [None]:
movies['genres'] = movies['genres'].apply(lambda x : x.split())
movies['release_year'] = movies.release_date.apply(lambda x: x.year)

In [None]:
def score(value,index_list,feature):
    '''
    Returns list of scores for the passed feature

    value      : column label of the target movie in the similarity matrix
    index_list : row labels whose scores are wanted
    feature    : one of 'genre', 'plot', 'plot_1', 'cast', 'director'

    The scores are returned in the row order of the similarity matrix, not
    in the order of index_list.
    Raises ValueError for an unknown feature name.
    '''
    if feature == 'genre':
        matrix = genre_only_score
    elif feature == 'plot':
        matrix = plot_score
    elif feature == 'plot_1':
        # NOTE(review): plot_score_1 is not defined in the visible cells, so
        # this branch raises NameError unless it is created elsewhere.
        matrix = plot_score_1
    elif feature == 'cast':
        matrix = cast_score
    elif feature == 'director':
        matrix = director_score
    else:
        raise ValueError(
            "Unknown feature %r; expected 'genre', 'plot', 'plot_1', 'cast' or 'director'" % (feature,)
        )
    column = matrix[value]
    return column[column.index.isin(index_list)].tolist()

대상 영화와의 장르 유사도 점수를 기준으로 영화를 내림차순 정렬한다.
이전 단계에서 뽑은 영화들을 plot 유사도 점수를 기준으로 클러스터링한다.

앞서 계산한 모든 유사도 점수와 가중 평점(wr)을 기준으로 각 클러스터 안에서 영화를 다시 내림차순으로 정렬한다.

각 클러스터의 상위 50개 추천 영화를 반환


영화 추천

In [None]:
def get_feature_set(df1,df2,df3,title,top_n=500):
    
    '''
    Selects the candidate movies for a target title

    df1   : similarity matrix used to pick the candidates
    df2   : matrix whose rows for the candidates are returned as features
    df3   : unused, kept so existing calls keep working
    title : title of the target movie (the first match in `movies` is used)
    top_n : number of candidates to keep (default 500)

    idx : index value of the target movie
    top : index values of the top_n movies (sorted descending by their df1
          similarity to the target movie)
    feature_set : rows of df2 for the movies whose index is in "top"
    movies_set : titles of the movies whose index is in "top"

    Raises IndexError when no movie has the given title.
    '''   
    
    matches = movies.index[movies.title == title]
    if len(matches) == 0:
        raise IndexError("No movie titled %r was found in `movies`" % (title,))
    idx = int(matches[0])
    top = df1[idx].sort_values(ascending = False)[0:top_n].index.values.tolist()
    # get_recommendations looks up the cluster of the target movie, so the
    # target has to be part of the candidate set; ties or an all-zero
    # similarity column could otherwise push it out of the top_n.
    if idx not in top:
        top = top[:-1] + [idx]
    feature_set = df2[df2.index.isin(top)]
    movies_set = pd.DataFrame(movies.loc[movies.index.isin(top),'title'])
    return feature_set,movies_set

def get_recommendations(title,cluster_num,df1=genre_score,df2=plot_score,df3=cast_score,top_n=50):
    
    '''
    Recommends movies for `title` from one of three K-Means clusters

    title       : title of the target movie
    cluster_num : 1, 2 or 3; cluster 1 is always the cluster that contains
                  the target movie
    df1,df2,df3 : matrices handed to get_feature_set
    top_n       : maximum number of recommendations returned (default 50)

    movies_set = dataframe to store the cluster labels assigned to movies
                 along with their similarity scores and ratings
    df_recommend = dataframe (title, genres, director, cast, ratings, adult)
                   with the top recommendations of the requested cluster,
                   sorted descending by genre, plot, cast and director score
                   and finally by weighted rating

    The target movie itself is never part of the result.
    Raises ValueError for a cluster_num other than 1, 2 or 3 and IndexError
    when the target movie is not among the candidates.
    '''
    
    if cluster_num not in (1, 2, 3):
        raise ValueError("cluster_num must be 1, 2 or 3, got %r" % (cluster_num,))

    feature_set,movies_set = get_feature_set(df1,df2,df3,title)
    cluster_algo = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=42)
    movies_set['cluster'] = cluster_algo.fit(feature_set).labels_

    index = movies_set.index.values.tolist()
    idx = movies.index[movies.title == title].values.astype(int)[0]
    if idx not in movies_set.index:
        raise IndexError("The target movie %r is not part of the candidate set" % (title,))

    # Ratings align on the index; score() returns plain lists in the same
    # (ascending index) row order as movies_set.
    movies_set['wr'] = movies.loc[index,'wr']
    for column, feature in (('p_score','plot'),('g_score','genre'),('c_score','cast'),('d_score','director')):
        movies_set[column] = score(idx,index,feature)

    # Swap labels so that the cluster containing the target movie is 0.
    # Both masks are taken before any label is rewritten.
    target_cluster = movies_set.at[idx,'cluster']
    if target_cluster != 0:
        in_target = movies_set['cluster'] == target_cluster
        in_zero = movies_set['cluster'] == 0
        movies_set.loc[in_target,'cluster'] = 0
        movies_set.loc[in_zero,'cluster'] = target_cluster

    # Remove the target movie by its index instead of skipping the first
    # sorted row: in clusters that do not contain the target, the first row
    # is the best candidate and must be kept.
    ranking = ['g_score','p_score','c_score','d_score','wr']
    in_cluster = movies_set[movies_set['cluster'] == cluster_num - 1]
    ranked = in_cluster.drop(index=idx, errors='ignore').sort_values(ranking, ascending=False)
    df_recommend = ranked[['title']].head(top_n).copy()

    details = movies.loc[df_recommend.index]
    df_recommend['genres'] = details['genre_pure']
    df_recommend['director'] = details['director']
    df_recommend['cast'] = details['cast']
    df_recommend['ratings'] = details['wr'].round(decimals=2)
    df_recommend['adult'] = details['adult']
    return df_recommend

시각화를 위한 부분

In [None]:
def cluster_class(title,cl_num):
    '''
    Fetches the recommendations of one cluster and returns them together
    with the cast, director and genres columns joined into single
    space-separated strings, the ratings as a list and the cluster number
    '''
    recommendations = get_recommendations(title,cluster_num=cl_num)
    joined = {
        column: ' '.join(recommendations[column].tolist())
        for column in ('cast', 'genres', 'director')
    }
    rating_values = recommendations.ratings.tolist()
    return recommendations,joined['cast'],joined['director'],joined['genres'],rating_values,cl_num

In [None]:
class recommended_cluster:
    '''
    Container for the recommendations of one cluster, filled from a tuple
    in the order produced by cluster_class:

    movies   : element 0, dataframe of the recommended movies
    cast     : element 1, cast of the movies within the cluster
    director : element 2, directors of the movies within the cluster
    genre    : element 3, genres of the movies within the cluster
    ratings  : element 4, ratings of the movies within the cluster
    cl_num   : element 5, number of the cluster
    '''

    def __init__(self,cluster_tuple):
        # Attribute names listed in the positional order of cluster_tuple
        attribute_names = ('movies', 'cast', 'director', 'genre', 'ratings', 'cl_num')
        for position, attribute in enumerate(attribute_names):
            setattr(self, attribute, cluster_tuple[position])

    def recommended_movie(self):
        '''
        Shows a plotly table with the titles and ratings of the first 10
        rows of self.movies
        '''
        first_rows = self.movies[0:10]
        header = dict(values = ['Title','Rating'],
                      font = dict(size=15),
                      align = "center")
        cells = dict(values = [first_rows.title,first_rows.ratings],
                     align = "center")
        table = go.Table(header = header, cells = cells)
        fig = go.Figure(data=[table])
        fig.show()

In [None]:
def Recommendation_Board(title):
    '''
    Shows the recommendation table of clusters 1, 2 and 3 for the given title
    '''
    for cluster_number in (1, 2, 3):
        recommended_cluster(cluster_class(title,cluster_number)).recommended_movie()


In [None]:
Recommendation_Board("The Avengers")