In [19]:
#essential imports
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize

#scraping imports
import requests
from bs4 import BeautifulSoup

#plotting imports
%matplotlib inline
import matplotlib.pyplot as plt

#String matching
import re

#date
import datetime as dt

# Matching CDs/Vynils and dead musicians

## 1) Clean the meta data generated with Amazon API

In [2]:
meta_CDs_Vin = pd.read_csv(
        'DATA/processed/meta_CDs_and_Vinyl_processed(v2).csv',
        encoding = 'ISO-8859-1',
        low_memory=False)

In [20]:
meta_CDs_Vin

Unnamed: 0.1,Unnamed: 0,asin,categories,description,title,salesRank,actors,artist,authors,creators,directors
0,0,0001501348,"[['CDs & Vinyl', 'Christian', 'Pop & Contempor...","Lenny LeBlanc, Alex Acuna, Justo Almario, Tom ...",Lift Him Up With Ron Kenoly [VHS],{'Movies & TV': 359265},[],['Ron Kenoly'],[],[],[]
1,1,0001393774,"[['CDs & Vinyl', 'Christian']]",Audio CD,Songs for the Shepherd,{'Music': 41017},[],[],[],"[('Keith Green', 'Performer')]",[]
2,2,0005123909,"[['CDs & Vinyl', ""Children's Music""], ['Movies...",18 Music Videos for Kids: Do Your Ears Hang Lo...,Silly Songs: 18 Wholesome Fun Songs for Kids [...,{'Movies & TV': 451209},[],['Cedarmont Kids'],[],[],[]
3,3,0005072298,"[['CDs & Vinyl', ""Children's Music""], ['CDs & ...",,Hymns: 16 Classic Hymns for Children,{'Music': 350804},[],[],[],"[('Sue Gay', 'Performer'), ('Mike Gay', 'Perfo...",[]
4,4,0005224896,"[['CDs & Vinyl', 'Christian', 'Praise & Worshi...",,"Voice of the Wind: Personal Worship, Vol. 1",{'Music': 347825},[],[],[],[],[]
5,5,0005134188,"[['CDs & Vinyl', ""Children's Music""], ['Movies...",VHS Tape,Toddler Tunes [VHS],{'Movies & TV': 363858},[],['Cedarmont Kids'],[],[],[]
6,6,0005441382,"[['CDs & Vinyl', ""Children's Music""], ['CDs & ...",Book by,Bless My Little Girl,{'Music': 54044},[],[],[],"[('Integrity Music', 'Performer'), ('Various',...",[]
7,7,0026197898,"[['CDs & Vinyl', 'Alternative Rock', 'Indie & ...","When you first hear The Sudden Passion, you d ...",Southern Fashion,,[],['The Sudden Passion'],[],[],[]
8,8,0152222227,"[['CDs & Vinyl', 'World Music', 'Middle East',...",,Got Jewish,{'Music': 1185025},[],[],['Brad Schachter'],[],[]
9,9,030714142X,"[['CDs & Vinyl', ""Children's Music""], ['Movies...",,Encyclopedia Brown: Ghostly Rider [VHS],{'Movies & TV': 427298},"['Alan Merrill', 'Lance C. Williams', 'Dion Za...",[],[],[],[]


In [4]:
meta_CDs_Vin.columns

Index(['Unnamed: 0', 'asin', 'categories', 'description', 'title', 'salesRank',
       'actors', 'artist', 'authors', 'creators', 'directors'],
      dtype='object')

In [5]:
clean_CD_V = meta_CDs_Vin.drop(['Unnamed: 0','actors','creators','categories','artist','directors','authors'],axis=1)

def clean_serie(series_):
    return (series_.str.replace("[", "")
            .str.replace("]","")
            .str.replace("'","")
            .str.replace(".","")
            .str.replace(", ",",")
            .str.replace('"','')
            .str.replace(' & ',',')
            .str.replace(' , ',',')
           .str.replace(':',',')
           .str.replace(' / ',',')
           .str.replace('/',',')
           .str.lower())

    #return list(series_)

clean_CD_V['artist'] = clean_serie(meta_CDs_Vin['artist'])
clean_CD_V['categories'] = clean_serie(meta_CDs_Vin['categories'])

## 2) Cleaning and filtering the dead names data

In [6]:
deaths = pd.read_csv('DATA/deaths.csv', low_memory=False, encoding="ISO-8859-1")

In [7]:
deaths.head()

Unnamed: 0,Name,Birth Date,Death Date,Description,Actor,Author,Musician
0,Jack Weston,1924,1996-05-03,", american actor",True,False,False
1,John Beradino,1917,1996-05-19,", american baseball player and actor",True,False,False
2,Jon Pertwee,1919,1996-05-20,", british actor",True,False,False
3,Paul Delph,1957,1996-05-21,", american musician and producer",False,False,True
4,Lash LaRue,1917,1996-05-21,", american actor",True,False,False


In [8]:
deaths[deaths['Name']=='Heath Ledger']

Unnamed: 0,Name,Birth Date,Death Date,Description,Actor,Author,Musician
897,Heath Ledger,1979,2008-01-22,", australian actor",True,False,False


In [9]:
dead_musicians = deaths[deaths['Musician']==True].reset_index()
dead_musicians['name'] = dead_musicians['Name'].str.lower()
dead_musicians = dead_musicians.drop('Name',axis=1)
dead_musicians.head()

Unnamed: 0,index,Birth Date,Death Date,Description,Actor,Author,Musician,name
0,3,1957,1996-05-21,", american musician and producer",False,False,True,paul delph
1,7,1928,1996-05-24,", american composer",False,False,True,jacob druckman
2,8,1968,1996-05-25,", american musician",False,False,True,bradley nowell
3,13,1928,1996-06-02,", spanish soprano",False,False,True,pilar lorengar
4,19,1917,1996-06-15,", american singer",False,False,True,ella fitzgerald


## 3) Build controls list

In [10]:
musics_products = clean_CD_V[clean_CD_V['artist'].isnull()==False]
musics_products = musics_products[musics_products["artist"].isin([""])==False]
musics_products.head()

Unnamed: 0,asin,description,title,salesRank,artist,categories
0,1501348,"Lenny LeBlanc, Alex Acuna, Justo Almario, Tom ...",Lift Him Up With Ron Kenoly [VHS],{'Movies & TV': 359265},ron kenoly,"cds,vinyl,christian,pop,contemporary,cds,vinyl..."
2,5123909,18 Music Videos for Kids: Do Your Ears Hang Lo...,Silly Songs: 18 Wholesome Fun Songs for Kids [...,{'Movies & TV': 451209},cedarmont kids,"cds,vinyl,childrens music,movies,tv,movies"
5,5134188,VHS Tape,Toddler Tunes [VHS],{'Movies & TV': 363858},cedarmont kids,"cds,vinyl,childrens music,movies,tv,movies"
7,26197898,"When you first hear The Sudden Passion, you d ...",Southern Fashion,,the sudden passion,"cds,vinyl,alternative rock,indie,lo-fi,indie rock"
18,307141950,"Spring has come, so Little Critter and Dad are...",Just Me &amp; My Dad [VHS],{'Movies & TV': 375462},golden books,"cds,vinyl,childrens music,movies,tv,movies"


In [11]:
def clean_comas(list_):
    clean_ = []
    for word in list_:
        clean_.append(word.split('(')[0].replace("(","").replace(")",""))
    return clean_
        

In [12]:
musicians_names = [troup.split(',') for troup in list(musics_products['artist'])]
all_musicians = clean_comas(list(set(np.concatenate(musicians_names))))

In [13]:
all_controls = pd.DataFrame(all_musicians, columns = ['name'])
all_controls = all_controls[all_controls['name'].isin(list(dead_musicians['name']))==False].reset_index()

In [14]:
len(all_controls), len(dead_musicians)

(163791, 424)

In [15]:
def is_name(list_):
    only_names = []
    for word in list_:
        decompose = word.split()
        if len(decompose) == 2:
            firstg = decompose[0]
            if firstg not in ['the','The','le','les','los','la']:
                only_names.append(word)
    
    return only_names

In [16]:
N = int(0.009*len(all_controls))

controls = pd.DataFrame([all_controls['name'].get_value(idx) for idx
                        in np.arange(0,N)],
                        columns =['name'])

controls = controls.drop(0,axis=0)
single_interprets = is_name(controls['name'])
controls = controls[controls['name'].isin(single_interprets)]

In [17]:
len(single_interprets)

746

## 4) Matching dead musicians and meta data

In [42]:
pd.options.mode.chained_assignment = None 
meta_dead_musicians = pd.DataFrame()

for idx, musician in enumerate(list(dead_musicians['name'])):
    
    match = pd.DataFrame()
    match = (musics_products[(musics_products['artist'].str.contains(musician)==True) | 
             (musics_products['description'].str.contains(musician)==True)].drop('artist',axis=1))
    
    match['name'] = musician
    match['death date'] = dead_musicians['Death Date'].get_value(idx)
    match['birth date'] = dead_musicians['Birth Date'].get_value(idx)
    meta_dead_musicians = meta_dead_musicians.append(match)
    
    if idx %100 == 0: print('passing the {}th musician'.format(idx))
        
    #  | (musics_products['description'].str.contains(musician)==True)

passing the 0th musician


  import sys
  


passing the 100th musician
passing the 200th musician
passing the 300th musician
passing the 400th musician


In [37]:
meta_dead_musicians.head()
len(meta_dead_musicians.groupby('name').count()), len(meta_dead_musicians)

(332, 10434)

In [33]:
meta_dead_musicians[meta_dead_musicians['name']=='whitney houston']

Unnamed: 0,asin,description,title,salesRank,categories,name,death date,birth date
1516,6301980468,The Star Spangled Banner was performed by Whit...,Whitney Houston - The Star Spangled Banner [VHS],{'Movies & TV': 371047},"cds,vinyl,dance,electronic,cds,vinyl,pop,adult...",whitney houston,2012-02-11,1963
42940,B000002GFJ,,A Very Special Christmas,,"cds,vinyl",whitney houston,2012-02-11,1963
51278,B000002VCQ,When Whitney Houston arrived on the scene as a...,Whitney Houston,{'Music': 32757},"cds,vinyl,dance,electronic,disco,cds,vinyl,pop...",whitney houston,2012-02-11,1963
51329,B000002VH6,CDS,I'm Your Baby Tonight,{'Music': 139247},"cds,vinyl,dance,electronic,house,cds,vinyl,pop...",whitney houston,2012-02-11,1963
51349,B000002VEB,,Whitney,{'Music': 22387},"cds,vinyl,blues,contemporary blues,cds,vinyl,p...",whitney houston,2012-02-11,1963
51485,B000002VMD,"At the time of its release, this was the water...",The Bodyguard: Original Soundtrack Album,{'Music': 3170},"cds,vinyl,pop,adult contemporary,cds,vinyl,pop...",whitney houston,2012-02-11,1963
51516,B000002VIF,,Run to You / After We Make Love,{'Music': 712275},"cds,vinyl,blues,contemporary blues,cds,vinyl,p...",whitney houston,2012-02-11,1963
100352,B000008Q7R,Exhale (Shoop Shoop) [Single]\r\r\r\nWhitney H...,Exhale (Shoop Shoop),{'Music': 204419},"cds,vinyl,blues,contemporary blues,cds,vinyl,p...",whitney houston,2012-02-11,1963
108893,B00000DE22,"WHITNEY SINGS THE STAR SPANGLED BANNER,AS PERF...",Star Spangled Banner,{'Music': 63973},"cds,vinyl,pop,adult contemporary,cds,vinyl,r&b...",whitney houston,2012-02-11,1963
108954,B00000DE23,,I'm Every Woman,{'Music': 217300},"cds,vinyl,blues,contemporary blues,cds,vinyl,p...",whitney houston,2012-02-11,1963


## 5) Matching controls and meta data

In [None]:
pd.options.mode.chained_assignment = None
meta_control_musicians = pd.DataFrame()

for idx, musician in enumerate(list(controls['name'])):
    
    #musician = musician.replace("(","").replace(")","")
    match = pd.DataFrame()
    mapping_name = musics_products['artist'].str.contains(musician)
    mapping_descp = musics_products['description'].str.contains(musician)
    
    if any(mapping_name)|any(mapping_descp) : 
        match = musics_products[(mapping_name==True)|(mapping_descp==True)].drop('artist',axis=1)
        match['name'] = musician
        match['death date'] = 'unknown'
        match['birth date'] = 'unknown'
        meta_control_musicians = meta_control_musicians.append(match)
    
    else: 
        print('no matching for control name: {}'.format(musician))
    
    if idx %100 == 0: print('passing the {}th musician'.format(idx))

In [None]:
meta_control_musicians.head()
len(meta_control_musicians.groupby('name').count())

## 6) Loading Reviews data for Movies and TV and cleaning

In [None]:
datapath = 'DATA/reviews/'
filename = 'reviews_CDs_and_Vinyl.json.gz'

In [None]:
import gzip
def gz_to_dataframe(datapath, filename):
    def parse(path): 
        g = gzip.open(path, 'rb') 
        for l in g: 
            yield eval(l) 
    def getDF(path): 
        i = 0 
        df = {} 
        for d in parse(path): 
            df[i] = d 
            i += 1 
        return pd.DataFrame.from_dict(df, orient='index') 
    return getDF(datapath+filename)
    
reviews_df = gz_to_dataframe(datapath, filename)

In [None]:
reviews_df.head()

In [None]:
reviews_df.shape

In [None]:
def clean_reviews(review_df):
    filtered_reviews = reviews_df[reviews_df['asin'].isin(list([meta_dead_musicians['asin'], meta_control_musicians['asin']]))]
    filtered_reviews = filtered_reviews[['asin','reviewText','summary','reviewTime']]
    filtered_reviews['ReviewTime'] = list(pd.to_datetime(filtered_reviews['reviewTime'].str.replace(',','').str.replace(' ','-')))
    filtered_reviews = filtered_reviews.drop('reviewTime',axis=1)
    
    return filtered_reviews

In [None]:
filtered_reviews = clean_reviews(reviews_df)

In [None]:
filtered_reviews.set_index(['asin', 'reviewTime']).head()

In [None]:
filtered_reviews.shape

## 7) Matching meta/dead data with review data

In [None]:
#metactors = meta_dead_actors[['asin','name','death date', 'title','categories','salesRank']].reset_index() #.set_index(['name','death date','asin'])

In [None]:
pd.options.mode.chained_assignment = None  # default='warn'

def matching_meta_reviews(filtered_reviews, meta):
    
    processed_reviews = pd.DataFrame()
    
    for idx, asin in enumerate(sbj['asin']):
        match_reviews = pd.DataFrame()
        match_reviews = filtered_reviews[filtered_reviews['asin']==asin]

        match_reviews['interpret name'] = meta['name'].get_value(idx)
        match_reviews['death date'] = meta['death date'].get_value(idx)
        match_reviews['birth date'] = meta['birth date'].get_value(idx)
        match_reviews['title'] = meta['title'].get_value(idx)
        match_reviews['categories'] = clean_serie(meta['categories']).get_value(idx)
        match_reviews['salesRank'] = clean_serie(meta['salesRank']).get_value(idx)

        processed_reviews = processed_reviews.append(match_reviews)

        if idx %1000 == 0: 
            print('passing the {}th asin'.format(idx))
            #break
        
    return processed_reviews

In [None]:
processed_reviews_sbj = pd.DataFrame(matching_meta_reviews(filtered_reviews, meta_dead_musicians))

## 8) Matching controls and reviews

In [None]:
metacontrols = meta_control_actors[['asin','name', 'title','categories','salesRank']].reset_index()

In [None]:
processed_reviews_ctrls = pd.DataFrame(matching_meta_reviews(filtered_reviews, meta_control_musicians))

## Save files

In [None]:
processed_reviews.to_csv('DATA/movie_reviews_matched.csv')

In [None]:
processed_reviews_ctrls.to_csv('DATA/movie_reviews_matched_ctrls.csv')