In [1]:
# re-import modules without restarting kernel
%load_ext autoreload
%autoreload 2
%reload_ext autoreload


from io_helpers import *
from matching_helpers import *

#essential imports
import pandas as pd
import numpy as np

#date
import datetime as dt

from IPython.display import clear_output



nb_controls_per_subject = 7



# Matching movies and dead actors/directors

### 1) Load and clean the meta_movies data generated with Amazon API

In [2]:
## open datafiles
##

features_meta = ['asin', 'actors', 'directors', 'creators', 'authors']

MMTV = pd.read_csv('DATA/metadata_processed/meta_Movies_and_TV_processed(v1).csv', low_memory=False)[features_meta]
AIV = pd.read_csv('DATA/metadata_processed/meta_Amazon_Instant_Video_processed(v1).csv', low_memory=False)[features_meta]

video_df = pd.concat((MMTV, AIV))
video_df = df_rm_punctuation(video_df)

In [3]:
assert(MMTV.shape[0]+AIV.shape[0] == video_df.shape[0])
video_df.head()

Unnamed: 0,asin,actors,directors,creators,authors
0,0000143561,,,,
1,0000589012,,,,Joe Herzanek
2,0000695009,,,,Epilepsy Foundation
3,000107461X,Douglas Miller,,"(Joe Thomas,Producer),(Joseph Williams,Producer)",
4,0000143529,Alton Brown,,,


## 2) Clean and filter dead actors data

In [4]:
deaths = pd.read_csv('DATA/deaths.csv', low_memory=False, encoding="ISO-8859-1")
dead_actors = deaths[deaths['Actor']==True].drop('Actor', axis=1).drop('Author', axis=1).drop('Musician', axis=1).drop('Description', axis=1)
# for some reason the index skips some numbers
dead_actors = dead_actors.reset_index().drop('index', axis=1)
dead_actors['Name'] = std_actors_names(dead_actors['Name'])
dead_actors.head()

Unnamed: 0,Name,Birth Date,Death Date
0,jack weston,1924,1996-05-03
1,john beradino,1917,1996-05-19
2,jon pertwee,1919,1996-05-20
3,lash larue,1917,1996-05-21
4,enrique álvarez félix,1934,1996-05-24


## 3) Build (metadata<->actors) controls list

In [5]:
# filter out blanks, products_actors will be re-used
amazon_actor_products = video_df[(video_df["actors"].isnull()==False)]
amazon_actor_products = amazon_actor_products[amazon_actor_products['actors']!='']
print(amazon_actor_products.shape)
amazon_actor_products.head()

(144720, 5)


Unnamed: 0,asin,actors,directors,creators,authors
3,000107461X,Douglas Miller,,"(Joe Thomas,Producer),(Joseph Williams,Producer)",
4,0000143529,Alton Brown,,,
5,0000143502,Alton Brown,,,
6,0000143588,Ina Garten,,,
11,0001485423,Sr Faustina,J Paddy Nolan,,


In [6]:
amazon_actor_names = [troup.split(',') for troup in list(amazon_actor_products['actors'])]
amazon_actor_names = [lm.lower() for lm in np.concatenate(amazon_actor_names)]
amazon_actor_names = list(set(amazon_actor_names))

# weird... 
print(amazon_actor_names[:5])
#amazon_actor_names = amazon_actor_names[:1]+ amazon_actor_names[2:] 
amazon_actor_names = amazon_actor_names[1:]

amazon_actors = pd.DataFrame(amazon_actor_names, columns=['Name'])
amazon_actors['Name'] = std_actors_names(amazon_actors['Name'])
amazon_actors['Birth Date'] = pd.Series()
amazon_actors['Death Date'] = pd.Series()
amazon_actors.head()

['', 'warren cook', 'herbert flight time lang', 'jonna järnefelt', 'yan athena chu']


Unnamed: 0,Name,Birth Date,Death Date
0,warren cook,,
1,herbert flight time lang,,
2,jonna järnefelt,,
3,yan athena chu,,
4,lee reyes,,


In [7]:
## filter out deads
##

dead_actors['clname'] = dead_actors['Name'].map(cleanstr)
dead_actors = dead_actors.drop('Name', axis=1)
nondead_actors = pd.DataFrame(list(map(lambda s : cleanstr(s), amazon_actor_names)),columns=['clname'])
nondead_actors= nondead_actors[nondead_actors['clname'].map(lambda e : len(e.split())>1)]

for idx, name in enumerate(dead_actors['clname'].tolist()):
    if idx % 100 == 0 : 
        print(idx)
    mapp = nondead_actors['clname'].str.contains(name)
    if mapp.any():
        nondead_actors = nondead_actors[mapp==False] 

nondead_actors['Birth Date'] = pd.Series()
nondead_actors['Death Date'] = pd.Series()
nondead_actors.head()

controls = nondead_actors

all_actors = pd.concat((dead_actors, nondead_actors))

0
100
200
300
400
500
600


In [8]:
controls.shape, dead_actors.shape, amazon_actors.shape, amazon_actors.shape[0]-controls.shape[0]

((149443, 3), (639, 3), (155561, 3), 6118)

## 4) Match dead actors and meta data

In [9]:
pd.options.mode.chained_assignment = None 
meta_dead_actors = pd.DataFrame()

amazon_actor_products['clname'] = amazon_actor_products['actors'].map(cleanstr)

for idx, actor in enumerate(list(dead_actors['clname'])):
    
    match = amazon_actor_products[amazon_actor_products['clname'].str.contains(actor)==True].drop('actors',axis=1)
    match['clname'] = actor
    match['Death Date'] = dead_actors.reset_index()['Death Date'].get_value(idx)
    meta_dead_actors = meta_dead_actors.append(match)
    
    if idx%100==0 and idx!=0: 
        print('actor {} in {} - Done'.format(idx, len(list(dead_actors['clname']))))
        
print(meta_dead_actors.shape)        
meta_dead_actors.head()

actor 100 in 639 - Done
actor 200 in 639 - Done
actor 300 in 639 - Done
actor 400 in 639 - Done
actor 500 in 639 - Done
actor 600 in 639 - Done
(14282, 6)


Unnamed: 0,asin,directors,creators,authors,clname,Death Date
3487,792840461,Norman Jewison,"(Haskell Wexler,Cinematographer),(Norman Jewis...",,jack weston,1996-05-03
10823,6300216217,,,,jack weston,1996-05-03
11212,6300270297,Richard Lester,"(Paul Wilson,Cinematographer),(John Bloom,Edit...",,jack weston,1996-05-03
11233,6300270025,Brian G Hutton,"(Andre Morgan,Producer),(Daniel Grodnik,Produc...",,jack weston,1996-05-03
12506,6301514017,Stuart Rosenberg,,,jack weston,1996-05-03


In [10]:
len(set(meta_dead_actors['clname']))

481

## 5) Subsample control actors and match meta data

In [11]:
from numpy.random import randint

def subsample_df (df, nbWanted) : 
    assert nbWanted < len(df)
    randomIndex = randint(0, len(df)-1, nbWanted)
    return df.loc[randomIndex]

sub_controls = subsample_df(controls, nb_controls_per_subject*len(dead_actors))
print(sub_controls.shape)
sub_controls.head(3)

(2556, 3)


Unnamed: 0,clname,Birth Date,Death Date
133115,maria riccarda wesseling,,
115722,donna quashnick,,
22607,charlie lynch,,


In [12]:
pd.options.mode.chained_assignment = None 
meta_control_actors = pd.DataFrame()

d=''
for idx, actor in enumerate(list(sub_controls.dropna(how='all')['clname'])):  
    match = pd.DataFrame()
    match = amazon_actor_products[amazon_actor_products['clname'].str.contains(actor)==True].drop('actors',axis=1)
    match['clname'] = actor
    match['Death Date'] = sub_controls.reset_index()['Death Date'].get_value(idx)
    meta_control_actors = meta_control_actors.append(match)
    
    if idx%100==0 and idx!=0: 
        print('actor {} in {} - Done'.format(idx, len(list(sub_controls['clname']))))


print(meta_control_actors.shape)
print(len(set(meta_dead_actors['clname'])))
meta_control_actors.head()

actor 100 in 2556 - Done
actor 200 in 2556 - Done
actor 300 in 2556 - Done
actor 400 in 2556 - Done
actor 500 in 2556 - Done
actor 600 in 2556 - Done
actor 700 in 2556 - Done
actor 800 in 2556 - Done
actor 900 in 2556 - Done
actor 1000 in 2556 - Done
actor 1100 in 2556 - Done
actor 1200 in 2556 - Done
actor 1300 in 2556 - Done
actor 1400 in 2556 - Done
actor 1500 in 2556 - Done
actor 1600 in 2556 - Done
actor 1700 in 2556 - Done
actor 1800 in 2556 - Done
actor 1900 in 2556 - Done
actor 2000 in 2556 - Done
actor 2100 in 2556 - Done
actor 2200 in 2556 - Done
actor 2300 in 2556 - Done
actor 2400 in 2556 - Done
(7882, 6)
481


Unnamed: 0,asin,directors,creators,authors,clname,Death Date
84148,B000AMMSCQ,,,,maria riccarda wesseling,
138222,B001EZ79UY,Laurent Pelly,,,maria riccarda wesseling,
148622,B0027YUKKM,"Olivier Py,Philippe Beziat",,,maria riccarda wesseling,
154953,B002QXI2L8,,,,maria riccarda wesseling,
117170,B000TEUSL8,,"(Brian Lovett,Producer),(Brian Lovett,Writer),...",,donna quashnick,


## 6) Concatenate (metadatas<->artist) for dead and control artists

In [13]:
meta_all_actors = pd.concat((meta_dead_actors, meta_control_actors))

## 7) Loading Reviews data for Movies and TV and cleaning

In [14]:
f_MTV = 'DATA/review/reviews_Movies_and_TV.json.gz'
f_AIV = 'DATA/review/reviews_Amazon_Instant_Video.json.gz'
features_reviews = ['asin', 'reviewTime']
#features_reviews = ['asin', 'reviewTime', 'overall', 'helpful', 'summary', 'reviewText']

In [18]:
b=gz_to_dataframe(f_AIV)[features_reviews]
a=gz_to_dataframe(f_MTV)[features_reviews]
safe_reviews_df = pd.concat((a, b))

In [19]:
reviews_df = safe_reviews_df
print(reviews_df.shape)
reviews_df.head()

(5190980, 2)


Unnamed: 0,asin,reviewTime
0,143502,"01 17, 2013"
1,143529,"10 2, 2013"
2,143561,"07 17, 2008"
3,143588,"03 13, 2009"
4,143588,"01 18, 2009"


In [20]:
def prefilter_reviews(review_df):
    filtered_reviews = review_df[review_df['asin'].isin(meta_all_actors['asin'].tolist())]
    filtered_reviews = filtered_reviews[features_reviews]
    filtered_reviews['reviewTime'] = list(pd.to_datetime(filtered_reviews['reviewTime'].str.replace(',','').str.replace(' ','-')))
    
    return filtered_reviews

In [21]:
filtered_reviews = prefilter_reviews(reviews_df)
filtered_reviews = filtered_reviews.reset_index()
print(filtered_reviews.shape)
filtered_reviews.head()

(702414, 3)


Unnamed: 0,index,asin,reviewTime
0,741,030714142X,2006-03-30
1,746,0307141985,2010-11-08
2,747,0307141985,2008-11-25
3,748,0307141985,2008-11-17
4,749,0307141985,2004-11-13


## 8) Matching meta/dead data with review data

In [22]:
print(meta_dead_actors.shape)
meta_dead_actors.head()

(14282, 6)


Unnamed: 0,asin,directors,creators,authors,clname,Death Date
3487,792840461,Norman Jewison,"(Haskell Wexler,Cinematographer),(Norman Jewis...",,jack weston,1996-05-03
10823,6300216217,,,,jack weston,1996-05-03
11212,6300270297,Richard Lester,"(Paul Wilson,Cinematographer),(John Bloom,Edit...",,jack weston,1996-05-03
11233,6300270025,Brian G Hutton,"(Andre Morgan,Producer),(Daniel Grodnik,Produc...",,jack weston,1996-05-03
12506,6301514017,Stuart Rosenberg,,,jack weston,1996-05-03


In [23]:
pd.options.mode.chained_assignment = None  # default='warn'

def matching_meta_reviews(filtered_reviews, meta):
    
    processed_reviews = pd.DataFrame()
    
    for idx, asin in enumerate(meta['asin']):
        match_reviews = pd.DataFrame()
        match_reviews = filtered_reviews[filtered_reviews['asin']==asin]

        match_reviews['actor name'] = meta.reset_index()['clname'].get_value(idx)
        match_reviews['death date'] = meta.reset_index()['Death Date'].get_value(idx)

        processed_reviews = processed_reviews.append(match_reviews)

        if idx %1000 == 0 and idx!=0: 
            print('{} in {}'.format(idx, len(meta['asin'])))
        
    return processed_reviews

In [24]:
deads_reviews = matching_meta_reviews(filtered_reviews, meta_dead_actors)
print(deads_reviews.shape)
deads_reviews.head()

1000 in 14282
2000 in 14282
3000 in 14282
4000 in 14282
5000 in 14282
6000 in 14282
7000 in 14282
8000 in 14282
9000 in 14282
10000 in 14282
11000 in 14282
12000 in 14282
13000 in 14282
14000 in 14282
(576041, 5)


Unnamed: 0,index,asin,reviewTime,actor name,death date
92660,308659,792840461,2006-04-07,jack weston,1996-05-03
92661,308660,792840461,2013-03-24,jack weston,1996-05-03
92662,308661,792840461,2014-04-05,jack weston,1996-05-03
92663,308662,792840461,2012-12-21,jack weston,1996-05-03
92664,308663,792840461,2013-05-12,jack weston,1996-05-03


## 8) Matching controls and reviews

In [25]:
print(meta_control_actors.shape)
meta_control_actors.sort_values(by='asin').head()

(7882, 6)


Unnamed: 0,asin,directors,creators,authors,clname,Death Date
76,030714142X,,,,alan merrill,
81,0307142353,,,,alan merrill,
264,0394897609,,,,jon stone,
261,0394898842,,,,jon stone,
354,0615336132,Dori Berinstein,,,the netsational senior dancers,


In [26]:
def match_control_meta_reviews(filtered_reviews, meta):
    
    processed_reviews = pd.DataFrame()
    
    for idx, asin in enumerate(meta['asin']):
        match_reviews = pd.DataFrame()
        match_reviews = filtered_reviews[filtered_reviews['asin']==asin]

        match_reviews['actor name'] = meta.reset_index()['clname'].get_value(idx)

        processed_reviews = processed_reviews.append(match_reviews)

        if idx %1000 == 0 and idx!=0: 
            print('{} in {}'.format(idx, len(meta['asin'])))
        
    return processed_reviews

In [27]:
ctrls_reviews = match_control_meta_reviews(filtered_reviews, meta_control_actors)

1000 in 7882
2000 in 7882
3000 in 7882
4000 in 7882
5000 in 7882
6000 in 7882
7000 in 7882


## Save files

In [29]:
deads_reviews.to_csv('DATA/review_filtered/movie_reviews_matched.csv')

In [30]:
ctrls_reviews.to_csv('DATA/review_filtered/movie_reviews_matched_ctrls.csv')

## problems ?

In [41]:
deads_reviews.groupby('actor name').count().sort_values(by='index', ascending=False)[:10]

Unnamed: 0_level_0,index,asin,reviewTime,death date
actor name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
james stewart,10103,10103,10103,10103
charlton heston,9428,9428,9428,9428
james garner,7547,7547,7547,7547
marlon brando,7466,7466,7466,7466
paul newman,7285,7285,7285,7285
philip seymour hoffman,6470,6470,6470,6470
deforest kelley,6466,6466,6466,6466
mickey rooney,6295,6295,6295,6295
gregory peck,6187,6187,6187,6187
james gandolfini,6113,6113,6113,6113


In [42]:
ctrls_reviews.groupby('actor name').count().sort_values(by='index', ascending=False)

Unnamed: 0_level_0,index,asin,reviewTime
actor name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
michael cain,19024,19024,19024
amy aquino,12269,12269,12269
laura linney,8212,8212,8212
joshua jackson,7669,7669,7669
liev schreiber,6259,6259,6259
forest whitaker,6078,6078,6078
jared padalecki,5636,5636,5636
ch b,5286,5286,5286
robin wright,5158,5158,5158
annette bening,4974,4974,4974


In [49]:
### PROBLEM
###

#ctrls_reviews[ctrls_reviews['actor name']=='johnny depp'].groupby('asin').count()['asins']
idx = meta_control_actors[meta_control_actors['clname']=='chris fisher'].groupby('asin').count().reset_index()['asin']
reviews_df[reviews_df['asin'].isin(idx)].count()

asin          1
reviewTime    1
dtype: int64

In [50]:
match_reviews = filtered_reviews[filtered_reviews['asin'].str.contains('B00G3NK8NW')]
match_reviews.shape

(0, 3)

In [51]:
meta_all_actors[meta_all_actors['asin'].isin(['B00G3NK8NW'])]

Unnamed: 0,asin,directors,creators,authors,clname,Death Date


In [52]:
reviews_df[reviews_df['asin'].isin(['B00G3NK8NW'])].shape

(12, 2)

In [53]:
filtered_reviews[filtered_reviews['asin'].isin(['B00G3NK8NW'])].shape

(0, 3)

In [54]:
len(list(meta_all_actors['asin']))

22164

In [55]:
len(meta_all_actors['asin'].unique()), len(meta_all_actors['asin'])

(19161, 22164)

In [56]:
ctrls_reviews[ctrls_reviews['asin'].isin(['B00G3NK8NW'])]

Unnamed: 0,index,asin,reviewTime,actor name
