## Extract Appropriate Features for TfidfVectorizer

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
# Read the movie metadata CSV and show its first rows.
movies_csv = '../data/MoviesMetadata.csv'
movies = pd.read_csv(movies_csv)
movies.head()

Unnamed: 0,budget,id,imdb_id,original_language,original_title,overview,popularity,poster_path,release_date,revenue,...,title,vote_average,vote_count,name_genres,id_genres,name_production_countries,iso_3166_1_production_countries,name_production_companies,id_production_companies,year
0,30.0,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,1995-10-30,373.554033,...,Toy Story,7.7,5415.0,"Animation, Comedy, Family","16, 35, 10751",United States of America,US,Pixar Animation Studios,3,1995
1,65.0,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,1995-12-15,262.797249,...,Jumanji,6.9,2413.0,"Adventure, Fantasy, Family","12, 14, 10751",United States of America,US,"TriStar Pictures, Teitler Film, Interscope Com...","559, 2550, 10201",1995
2,0.0,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,1995-12-22,0.0,...,Grumpier Old Men,6.5,92.0,"Romance, Comedy","10749, 35",United States of America,US,"Warner Bros., Lancaster Gate","6194, 19464",1995
3,16.0,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,1995-12-22,81.452156,...,Waiting to Exhale,6.1,34.0,"Comedy, Drama, Romance","35, 18, 10749",United States of America,US,Twentieth Century Fox Film Corporation,306,1995
4,0.0,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,1995-02-10,76.578911,...,Father of the Bride Part II,5.7,173.0,Comedy,35,United States of America,US,"Sandollar Productions, Touchstone Pictures","5842, 9195",1995


In [9]:
# Remove the ISO-code / id columns; only their name_* counterparts are used later.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError because the
# columns are already gone.
movies = movies.drop(
    columns=['iso_3166_1_production_countries', 'id_production_companies', 'id_genres'],
    errors='ignore',
)
movies.shape

(45443, 20)

In [10]:
# Read the credits and keywords tables that get joined onto `movies` by id.
credits, keywords = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/NewCredits.csv', '../data/NewKeywords.csv')
)

In [11]:
# Cast the join key of both lookup tables to integer before merging on it.
for frame in (keywords, credits):
    frame['id'] = frame['id'].astype('int')

In [12]:
# Inner-join credits and then keywords onto the movie table by `id`.
# NOTE(review): `movies` had 45443 rows before these merges while the filtered
# `movies_meta` has 45453 afterwards, so the joins add rows - presumably because
# of repeated ids in credits/keywords; confirm whether that is intended.
movies = movies.merge(credits, on='id').merge(keywords, on='id')

In [13]:
# Read the links table and reduce it to the Series of non-null TMDB ids (as ints).
links = pd.read_csv('../data/links.csv')
links = links['tmdbId'].dropna().astype('int')

In [14]:
# Keep only the movies whose id appears among the linked TMDB ids; work on a
# copy so later column assignments do not touch `movies`.
has_link = movies['id'].isin(links)
movies_meta = movies.loc[has_link].copy()
movies_meta.shape

(45453, 35)

In [15]:
def is_float(string):
    """Report whether ``float(string)`` succeeds.

    Returns True when the argument can be converted by ``float`` (this
    includes real floats such as NaN and text like ``"nan"`` or ``"3.14"``)
    and False when the conversion raises ``ValueError``.

    Only ``ValueError`` is handled, so arguments that ``float`` rejects with a
    different exception (for example ``TypeError`` for ``None``) still raise.
    """
    try:
        float(string)
    except ValueError:
        # float() could not parse the value.
        return False
    return True

In [16]:
def find_director(job_crew, index, name_crew=None):
    """Return the crew member whose job is 'Director', or NaN if there is none.

    Parameters
    ----------
    job_crew : str or float
        Comma-and-space separated job titles of one movie's crew. Anything
        that is not a string (e.g. NaN for a missing value) yields NaN.
    index : hashable
        Row label of the movie in ``movies_meta``; used to look up the
        matching ``name_crew`` cell when ``name_crew`` is not supplied.
    name_crew : str or float, optional
        Comma-and-space separated crew names, positionally aligned with
        ``job_crew``. When left as ``None`` the value is read from
        ``movies_meta.loc[index, 'name_crew']``.

    Returns
    -------
    str or float
        The name at the position of the first 'Director' job, or ``np.nan``
        when there is no director, the names are missing, or the name list is
        shorter than the job list.
    """
    # Missing values arrive as float NaN; testing for str (rather than
    # "not float-convertible") also keeps numeric-looking text usable.
    if not isinstance(job_crew, str):
        return np.nan

    jobs = job_crew.split(', ')
    if 'Director' not in jobs:
        return np.nan

    if name_crew is None:
        name_crew = movies_meta.loc[index, 'name_crew']
    if not isinstance(name_crew, str):
        return np.nan

    names = name_crew.split(', ')
    position = jobs.index('Director')  # first director, as before
    # Guard against job/name lists that are not aligned instead of raising
    # IndexError halfway through the dataset.
    if position >= len(names):
        return np.nan
    return names[position]

In [17]:
# Build the whole 'director' column in one assignment instead of writing one
# cell at a time through .loc, which is very slow on tens of thousands of rows.
# Each row label is still passed along because find_director uses it to look up
# the matching name_crew value.
movies_meta['director'] = [
    find_director(job_crew, row_label)
    for row_label, job_crew in movies_meta['job_crew'].items()
]
movies_meta['director']

0           John Lasseter
1            Joe Johnston
2           Howard Deutch
3         Forest Whitaker
4           Charles Shyer
               ...       
45448    Hamid Nematollah
45449            Lav Diaz
45450      Mark L. Lester
45451    Yakov Protazanov
45452       Daisy Asquith
Name: director, Length: 45453, dtype: object

In [18]:
# Number of movies for which no director was found.
movies_meta['director'].isnull().sum()

887

In [19]:
def _director_token(name):
    """Wrap a director name in a one-element list of its lower-cased, space-free form.

    A missing director (anything that is not a string, e.g. NaN) becomes an
    empty list so that no literal 'nan' token ends up in the combined feature
    text. A value that is already a list is returned unchanged, which keeps
    this cell safe to re-run.
    """
    if isinstance(name, list):
        return name
    if not isinstance(name, str):
        return []
    return [name.replace(" ", "").lower()]


movies_meta['director'] = movies_meta['director'].apply(_director_token)
movies_meta['director']

0           [johnlasseter]
1            [joejohnston]
2           [howarddeutch]
3         [forestwhitaker]
4           [charlesshyer]
               ...        
45448    [hamidnematollah]
45449            [lavdiaz]
45450       [markl.lester]
45451    [yakovprotazanov]
45452       [daisyasquith]
Name: director, Length: 45453, dtype: object

In [20]:
# Every distinct keyword seen across all movies; filled by find_unique.
keywords_set = set()


def find_unique(keywords_str):
    """Add each keyword of one movie to the module-level ``keywords_set``.

    ``keywords_str`` is a comma-and-space separated keyword string. Values
    that are not strings (NaN for a missing cell, None, ...) are ignored.
    Checking for ``str`` instead of "not float-convertible" means a keyword
    string that happens to parse as a number (e.g. "1984" or "infinity") is
    no longer mistaken for a missing value.

    Returns None; the function is used only for its side effect.
    """
    if isinstance(keywords_str, str):
        keywords_set.update(keywords_str.split(', '))

In [21]:
# Fill keywords_set from every movie. find_unique returns None, so the trailing
# semicolon suppresses the meaningless all-None Series this would display.
movies_meta['name_keywords'].apply(find_unique);

0        None
1        None
2        None
3        None
4        None
         ... 
45448    None
45449    None
45450    None
45451    None
45452    None
Name: name_keywords, Length: 45453, dtype: object

In [22]:
# Occurrence counter for every known keyword, starting at zero.
keywords_dict = dict.fromkeys(keywords_set, 0)


def count(keywords_str):
    """Increment ``keywords_dict`` once for each keyword of one movie.

    ``keywords_str`` is a comma-and-space separated keyword string; values
    that are not strings (NaN for a missing cell, None, ...) are ignored.
    Checking for ``str`` instead of "not float-convertible" keeps keyword
    strings that parse as numbers (e.g. "1984") from being skipped, and
    ``dict.get`` avoids a KeyError for a keyword that was never registered in
    ``keywords_set``.

    Returns None; the function is used only for its side effect.
    """
    if isinstance(keywords_str, str):
        for keyword in keywords_str.split(', '):
            keywords_dict[keyword] = keywords_dict.get(keyword, 0) + 1

In [23]:
# Tally keyword occurrences into keywords_dict. count returns None, so the
# trailing semicolon suppresses the meaningless all-None Series output.
movies_meta['name_keywords'].apply(count);

0        None
1        None
2        None
3        None
4        None
         ... 
45448    None
45449    None
45450    None
45451    None
45452    None
Name: name_keywords, Length: 45453, dtype: object

In [24]:
# Keyword -> occurrence count, as a Series indexed by keyword.
keyword_unique = pd.Series(data=keywords_dict)

In [25]:
# Keep only the keywords that occur more than once.
keyword_unique = keyword_unique.loc[keyword_unique.gt(1)]
keyword_unique

reference to longfellow     2
man eating plant            5
charles manson              4
light                      14
langley virginia            4
                           ..
romulans                    3
exchange student            3
poem or rhyme               4
east germany                8
schlock                     3
Length: 11278, dtype: int64

In [26]:
def keywords_filtering(keywords, allowed=None):
    """Return the keywords of one movie that belong to the allowed vocabulary.

    Parameters
    ----------
    keywords : str or float
        Comma-and-space separated keyword string. Missing values (NaN, None)
        produce an empty list.
    allowed : container, optional
        Anything supporting ``in``. Defaults to the module-level
        ``keyword_unique`` Series, whose membership test checks its index
        (the keyword labels).

    Returns
    -------
    list of str
        The kept keywords, in their original order.

    Raises
    ------
    TypeError
        If ``keywords`` is already a list/tuple, i.e. the column has been
        processed before. The original implementation also failed with
        TypeError in that case; failing loudly avoids silently wiping the
        column when the cell is re-run.
    """
    if allowed is None:
        allowed = keyword_unique

    if isinstance(keywords, (list, tuple)):
        raise TypeError(
            "keywords_filtering expects the raw comma-separated string, "
            "not an already-split list"
        )
    # Testing for str (rather than "not float-convertible") keeps numeric-looking
    # keyword strings such as "1984" from being treated as missing.
    if not isinstance(keywords, str):
        return []
    return [key for key in keywords.split(', ') if key in allowed]

In [27]:
from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer used to reduce keywords to their stems.
stemmer = SnowballStemmer(language='english')

In [28]:
# Keyword pipeline, applied per movie:
#   1. keep only keywords present in keyword_unique,
#   2. stem each kept keyword,
#   3. remove spaces and lower-case so every keyword becomes a single token.
movies_meta['name_keywords'] = (
    movies_meta['name_keywords']
    .apply(keywords_filtering)
    .apply(lambda kept: [stemmer.stem(word) for word in kept])
    .apply(lambda stems: [stem.replace(" ", "").lower() for stem in stems])
)

In [29]:
def str_to_list(col):
    """Split a comma-and-space separated string into a list of its parts.

    Missing values (NaN, None, or any other non-string) give an empty list.
    A value that is already a list is returned unchanged, so applying the
    function to an already-converted column is harmless. Checking for ``str``
    instead of "not float-convertible" keeps numeric-looking text such as
    "1984" from being discarded as if it were missing.
    """
    if isinstance(col, list):
        return col
    if isinstance(col, str):
        return col.split(', ')
    return []

In [30]:
# Genres stored as the literal text '[]' mean "no genres": mark them as missing.
# A single boolean-mask assignment replaces the per-row Python loop.
movies_meta.loc[movies_meta['name_genres'] == '[]', 'name_genres'] = np.nan

In [31]:
# Turn the comma-separated cast and genre strings into Python lists.
for column in ('name_cast', 'name_genres'):
    movies_meta[column] = movies_meta[column].apply(str_to_list)

In [32]:
# Remove spaces and lower-case every cast member and genre so each one becomes
# a single token.
for column in ('name_cast', 'name_genres'):
    movies_meta[column] = movies_meta[column].apply(
        lambda entries: [entry.replace(" ", "").lower() for entry in entries]
    )

In [33]:
# Concatenate the four token lists of each movie (keywords, cast, director,
# genres - in that order) and join them into one space-separated string.
feature_tokens = (
    movies_meta['name_keywords']
    + movies_meta['name_cast']
    + movies_meta['director']
    + movies_meta['name_genres']
)
movies_meta['model_feature'] = feature_tokens.apply(' '.join)

In [34]:
# Inspect the combined feature text.
movies_meta.loc[:, 'model_feature']

0        jealousi toy boy friendship friend rivalri boy...
1        boardgam disappear basedonchildren'sbook newho...
2        fish bestfriend duringcreditssting oldmen walt...
3        basedonnovel interracialrelationship singlemot...
4        babi midlifecrisi confid age daughter motherda...
                               ...                        
45448    tragiclov leilahatami kouroshtahami elhamkorda...
45449    artist play pinoy angelaquino perrydizon hazel...
45450    erikaeleniak adambaldwin juliedupage jamesrema...
45451    iwanmosschuchin nathalielissenko pavelpavlov a...
45452                                         daisyasquith
Name: model_feature, Length: 45453, dtype: object

In [35]:
# Write the feature text (without the index) to CSV.
output_csv = '../data/MovieBasedRecommenderData.csv'
movies_meta['model_feature'].to_csv(output_csv, index=False)