In [13]:
#essential imports
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize

#scraping imports
import requests
from bs4 import BeautifulSoup

#plotting imports
%matplotlib inline
import matplotlib.pyplot as plt

#String matching
import re

# Date
import datetime as dt
# Sleep
import time

# Strict JSON conversion
import json 
import gzip 

# Progress display
from IPython.display import clear_output

# Amazon API querying
from amazon.api import AmazonAPI
from amazon.api import AsinNotFound

#garbage collector
import gc

# Directory (relative to the notebook's working directory) that holds the matched review CSVs.
datapath = 'DATA/review_filtered/'
# Number interpolated into the CSV file names read below ('movie_reviews_matched_<cps>cps*.csv').
# NOTE(review): what "cps" stands for is not stated in this notebook — confirm and document.
cps = 4

# Load data

In [31]:
# Subjects' review table; the CSV's leading 'Unnamed: 0' column (a saved index) is discarded.
subjects_csv = datapath + 'movie_reviews_matched_' + str(cps) + 'cps.csv'
subjects_actors = pd.read_csv(subjects_csv).drop(labels=['Unnamed: 0'], axis='columns')
subjects_actors.head(n=3)

Unnamed: 0,index,asin,reviewTime,actor name,death date
0,308659,792840461,2006-04-07,jack weston,1996-05-03
1,308660,792840461,2013-03-24,jack weston,1996-05-03
2,308661,792840461,2014-04-05,jack weston,1996-05-03


In [32]:
# Controls' review table; the CSV's leading 'Unnamed: 0' column (a saved index) is discarded.
controls_csv = datapath + 'movie_reviews_matched_' + str(cps) + 'cps_ctrls.csv'
controls_actors = pd.read_csv(controls_csv).drop(labels=['Unnamed: 0'], axis='columns')
controls_actors.head(n=3)

Unnamed: 0,index,asin,reviewTime,actor name
0,2215101,B000AMMSCQ,2005-09-29,maria riccarda wesseling
1,2215102,B000AMMSCQ,2010-05-20,maria riccarda wesseling
2,2215103,B000AMMSCQ,2006-08-19,maria riccarda wesseling


# Extract pre-features

Parse the subjects' and controls' dates as datetimes and keep only the year.

In [33]:
def _dates_to_years(frame, date_columns):
    """Return a copy of ``frame`` whose ``date_columns`` hold calendar years.

    Each listed column is parsed with ``pd.to_datetime`` and replaced by its
    ``.dt.year``. Columns that are already numeric are left untouched so the
    cell can be re-run safely: feeding integer years back into
    ``pd.to_datetime`` would read them as nanoseconds since the epoch and
    turn every value into 1970.
    """
    converted = frame.copy()
    for column in date_columns:
        if not pd.api.types.is_numeric_dtype(converted[column]):
            converted[column] = pd.to_datetime(converted[column]).dt.year
    return converted


# SUBJECTS: review year and death year
subjects_actors = _dates_to_years(subjects_actors, ['reviewTime', 'death date'])

# CONTROLS: review year only (the controls table has no 'death date' column)
controls_actors = _dates_to_years(controls_actors, ['reviewTime'])

Extract each subject's name and death year

In [34]:
# One row per distinct (actor name, death date) pair found in the subjects' reviews,
# ordered by the group keys; only the two key columns are kept.
death_pairs = subjects_actors.groupby(['actor name', 'death date']).size().reset_index()
deathdates = death_pairs[['death date', 'actor name']]

# Subject names, in the same order as the rows of `deathdates`.
sbj_names = deathdates['actor name'].tolist()
deathdates.head()

Unnamed: 0,death date,actor name
0,1997,adriana caselotti
1,2006,adrienne shelly
2,2009,al martino
3,2009,alain bashung
4,2003,alan bates


Extract controls names

In [35]:
# Distinct control names in order of first appearance. `Series.unique()` is used
# instead of `list(set(...))` because the iteration order of a set of strings
# changes between kernel sessions, which made every downstream table that is
# aligned on `ctrl_names` come out in a different row order on each run.
ctrl_names = controls_actors['actor name'].unique().tolist()

# Pre-filtering

Filter actors based on their popularity: they should be mentioned at least once in the reviews. It is a way to be sure the reviewers know the actor.

In [20]:
def mentions(df, names):
    """Count, for every entry of ``names``, the rows of ``df`` whose review text matches it.

    Each name is passed unchanged to ``Series.str.contains`` on the
    ``'reviewText'`` column, so it is interpreted as a regular expression.
    Rows for which the match result is missing (e.g. a null review text)
    are not counted.

    Returns a list of integer counts in the same order as ``names``.
    """
    def _matching_rows(pattern):
        # `== True` turns missing match results into False before counting.
        hits = df['reviewText'].str.contains(pattern) == True
        return int(hits.sum())

    return [_matching_rows(name) for name in names]


In [21]:
# SUBJECTS
subjs_mentions = pd.DataFrame(mentions(subjects_actors, sbj_names), columns=['mentions'])
subjs_mentions['names'] = sbj_names

# CONTROLS
ctrls_mentions = pd.DataFrame(mentions(subjects_actors, ctrl_names), columns=['mentions'])
ctrls_mentions['names'] = ctrl_names

KeyError: 'reviewText'

In [122]:
# Spot check: read one review that matches a given subject's name pattern.
pd.get_option('display.max_colwidth')  # only reads the option; nothing is changed or shown

# The same string is used both as the exact 'actor name' value and as the regex
# handed to str.contains (its parentheses form a capture group in that regex).
hickey = 'William Hickey (actor)|William Hickey'
subdata = subjects_actors.loc[subjects_actors['actor name'] == hickey]
filt = subdata.loc[subdata['reviewText'].str.contains(hickey)]
filt.loc[13227, 'reviewText']

  This is separate from the ipykernel package so we can avoid doing imports until


'Al Pacino, Ellen Barkin and John Goodman are all so entertaining to watch that I\'ve always enjoyed this film no matter how many times I\'ve seen it.The suspense is what this film is all about, but I am fascinated with these actors, including supporting performances by Michael Rooker and William Hickey. Although her looks are a little hard and her mouth way too profane, Barkin never looked better and sexier. That Pacino\'s character, "Detective Frank Keller" would fall for her is understandable.However, adultery, once again is excused in this film along with other not-so-moral acts so this isn\'t a film you want to show to your church group. All three of the leads are very weak, ethically-speaking.The fact that this film can go on for almost two hours with very little action and still keep you riveted to your seat speaks highly of its entertainment value. Why national critics knocked it so much, I don\'t know. Hey, it\'s good entertainment. What more do you want?'

In [123]:
# Controls that are named in at least one review.
ctrls_mentions.loc[ctrls_mentions['mentions'].gt(0)].head()

Unnamed: 0,mentions,names
1,3,Mae West
3,35,Terri
6,5,Bruck
10,1,Jimmy Lloyd
11,3,Anders Randolf


# Get matching features

Extract features for subjects

In [36]:
# Subjects' review tallies: one row per (actor name, reviewTime) pair, with every
# remaining column holding the non-null count for that pair.
subjects_revs = subjects_actors.groupby(by=['actor name', 'reviewTime']).count()


def getreview(actor, year):
    """Return the number of reviews recorded for ``actor`` in ``year``.

    Looks up the ``'asin'`` count in the module-level ``subjects_revs`` frame,
    which is indexed by (actor name, reviewTime). Returns 0 when that pair is
    not present in the index.
    """
    try:
        # A single tuple key addresses the two index levels unambiguously.
        return subjects_revs.loc[(actor, year), 'asin']
    except (KeyError, TypeError):
        # Absent (actor, year) pairs raise KeyError; TypeError is still caught
        # because the earlier version of this lookup relied on it.
        return 0
    

# Matching feature: each subject's review count in the year before the death year.
# NOTE(review): `deathdates[1:]` leaves the first subject out of the feature table;
# no reason is given in this notebook — confirm whether that is intended.
kept_subjects = deathdates[1:]
prior_year_counts = kept_subjects.apply(
    lambda row: getreview(row['actor name'], int(row['death date']) - 1),
    axis=1,
)

# Name and death year are attached by index alignment with `deathdates`.
subjects_feats = pd.DataFrame(prior_year_counts, columns=['reviews']).assign(**{
    'actor name': deathdates['actor name'],
    'death date': deathdates['death date'],
})

subjects_feats.head()

Unnamed: 0,reviews,actor name,death date
1,11,adrienne shelly,2006
2,0,al martino,2009
3,0,alain bashung,2009
4,44,alan bates,2003
5,6,alan north,2000


Extract features for controls

In [41]:
# Controls' review tallies in long form: columns 'actor name', 'reviewTime' and
# 'asin', the last one holding the non-null 'asin' count for that pair.
asin_counts = controls_actors.groupby(['actor name', 'reviewTime'])['asin'].count()
controls_revs = asin_counts.reset_index()

In [42]:
# Preview the first rows of the per-control, per-year counts.
controls_revs.head(n=5)

Unnamed: 0,actor name,reviewTime,asin
0,xa0caroline ducey,2008,2
1,xa0caroline ducey,2009,1
2,xa0caroline ducey,2011,1
3,xa0caroline ducey,2012,1
4,xa0caroline ducey,2013,1


# Matching

Construct a matching function per subject

In [49]:
def _without_control(ctrl_revs, control_name):
    """Return ``ctrl_revs`` minus every row that belongs to ``control_name``."""
    return ctrl_revs[~ctrl_revs['actor name'].isin([control_name])]


def match(actorname, threshold, ctrl_revs):
    """Pick a control for subject ``actorname`` from the pool ``ctrl_revs``.

    The subject's death year and review count are read from the module-level
    ``subjects_feats`` frame. Controls are compared on their review count
    (``'asin'`` column) in the year preceding the subject's death year.

    Parameters
    ----------
    actorname : value of ``'actor name'`` identifying the subject in ``subjects_feats``.
    threshold : a candidate is accepted when the absolute difference between
        its count and the subject's count is strictly below this value.
    ctrl_revs : DataFrame with columns ``'actor name'``, ``'reviewTime'`` and
        ``'asin'``; rows are scanned in their current order.

    Returns
    -------
    (matched_name, matched_count, remaining_pool)
        ``remaining_pool`` is ``ctrl_revs`` without any row of the matched
        control. When candidates exist for that year but none is within
        ``threshold``, ``(0, 0, ctrl_revs)`` is returned so the caller can
        retry with a larger threshold.

    Raises
    ------
    ValueError
        If ``actorname`` is not in ``subjects_feats`` or ``ctrl_revs`` is empty.
    """
    sbj_actor = subjects_feats[subjects_feats['actor name'] == actorname]
    if sbj_actor.empty:
        raise ValueError('no features found for subject %r' % (actorname,))
    if ctrl_revs.empty:
        raise ValueError('the control pool is empty; cannot match %r' % (actorname,))

    # .iloc[0] instead of int(Series): converting a one-element Series directly is deprecated.
    deathyear = int(sbj_actor['death date'].iloc[0])
    sbj_revs = int(sbj_actor['reviews'].iloc[0])

    # Control rows recorded for the year preceding the subject's death.
    in_prior_year = ctrl_revs['reviewTime'] == (deathyear - 1)
    candidates = ctrl_revs[in_prior_year]

    if sbj_revs == 0:
        # Zero-count subjects are matched with zero-count controls: a control
        # qualifies only if it has NO row for that year. (Taking any row from
        # another year is not enough, since the same control may also have
        # reviews in the target year.)
        idle = ctrl_revs[~ctrl_revs['actor name'].isin(candidates['actor name'])]
        if not idle.empty:
            matched_name = idle['actor name'].iloc[0]
            return matched_name, 0, _without_control(ctrl_revs, matched_name)

    if candidates.empty:
        # No control has any review in that year, so every control has a
        # zero count there: take the first one regardless of the threshold.
        matched_name = ctrl_revs['actor name'].iloc[0]
        return matched_name, 0, _without_control(ctrl_revs, matched_name)

    for cand_name, cand_count in zip(candidates['actor name'], candidates['asin']):
        if abs(sbj_revs - cand_count) < threshold:
            return cand_name, cand_count, _without_control(ctrl_revs, cand_name)

    return 0, 0, ctrl_revs


In [54]:
# Work on an independent copy of the control counts.
ctrl_revs = controls_revs.copy(deep=True)

# Pool sizes: number of control names, then number of subject names.
print(*(len(names) for names in (ctrl_names, sbj_names)))

2383 481


In [62]:
thresh = 1

# Seeded generator shared by every shuffle below, so that re-running the cell
# (or the whole notebook) reproduces the same matches.
MATCH_SEED = 0
shuffle_rng = np.random.RandomState(MATCH_SEED)

ctrl_revs = controls_revs.sample(frac=1, random_state=shuffle_rng)  # shuffled copy of the pool
ctrl_list = []   # stores the name of matched controls
count_list = []  # stores the features of matched controls

# Only the first 19 subjects of subjects_feats are processed here.
for subject_name in subjects_feats['actor name'][:19]:

    thr = thresh
    name = 0

    # match() returns 0 as the name while no candidate is within `thr`;
    # the threshold is widened by 20% of `thresh` before every attempt.
    while name == 0:
        thr += 0.2*thresh
        name, count, ctrl_revs_ = match(subject_name, thr, ctrl_revs)
        ctrl_revs = ctrl_revs_.sample(frac=1, random_state=shuffle_rng)

    print(subject_name+' matched with '+name)

    ctrl_list.append(name)
    count_list.append(count)
    

adrienne shelly matched with kyung-hyun min
al martino matched with douglas chamberlain
alain bashung matched with ritchie coster
alan bates matched with chuck norris
alan north matched with sammo kam-bo hung
alec guinness matched with laura linney
alida valli matched with vincent margera
aliki vougiouklaki matched with panward hemmanee
alvy moore matched with ayoub ahmadi
amrish puri matched with tôru furusawa
amália rodrigues matched with michael bertolini
andreas katsulas matched with karena lam




andy griffith matched with peter davison
andy whitfield matched with joseph fiennes
anita page matched with ye-jin son
ann doran matched with tenzin yeshi paichang
ann miller matched with billy boyd
ann rutherford matched with candice accola
ann sothern matched with alan curtis


In [None]:
# NOTE(review): `features_subjects_dat` is not defined in any cell visible in this notebook;
# the per-subject feature table built above is named `subjects_feats` — confirm which is meant.
features_subjects_dat.head(6)