In [3]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

#essential imports
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize

#scraping imports
import requests
from bs4 import BeautifulSoup

#plotting imports
%matplotlib inline
import matplotlib.pyplot as plt

#String matching
import re

#date
import datetime as dt
from matching_helpers import *

cps = 4

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Matching CDs/Vynils and dead musicians

## 1) Clean the meta data generated with Amazon API

In [4]:
features_meta = ['asin', 'artist', 'description']

meta_CDs_Vin = pd.read_csv(
        'DATA/metadata_processed/meta_CDs_and_Vinyl_processed(v2).csv',
        encoding = 'ISO-8859-1',
        low_memory=False)

meta_CDs_Vin = meta_CDs_Vin[features_meta]
meta_CDs_Vin = df_rm_punctuation(meta_CDs_Vin)

In [5]:
meta_CDs_Vin.head()

Unnamed: 0,asin,artist,description
0,1501348,Ron Kenoly,"Lenny LeBlanc,Alex Acuna,Justo Almario,Tom Bro..."
1,1393774,,Audio CD
2,5123909,Cedarmont Kids,18 Music Videos for Kids: Do Your Ears Hang Lo...
3,5072298,,
4,5224896,,


## 2) Cleaning and filtering the dead names data

In [6]:
deaths = pd.read_csv('DATA/deaths.csv', low_memory=False, encoding="ISO-8859-1")

In [7]:
musician_features = ['clname', 'Birth Date', 'Death Date']

dead_musicians = deaths[deaths['Musician']==True].reset_index()
dead_musicians['name'] = dead_musicians['Name'].str.lower()
dead_musicians['clname'] = dead_musicians['Name'].map(cleanstr)
dead_musicians =dead_musicians[musician_features]

dead_musicians.head()

Unnamed: 0,clname,Birth Date,Death Date
0,paul delph,1957,1996-05-21
1,jacob druckman,1928,1996-05-24
2,bradley nowell,1968,1996-05-25
3,pilar lorengar,1928,1996-06-02
4,ella fitzgerald,1917,1996-06-15


## 3) Build controls list

In [8]:
musics_products = meta_CDs_Vin[meta_CDs_Vin['artist'].isnull()==False]
musics_products = musics_products[musics_products["artist"]!='']

musics_products['artist'] = musics_products['artist'].apply(cleanstr)
musics_products['description'] = musics_products['description'].apply(lambda s : str(s).lower())

print(musics_products.shape)
musics_products.head()

(403854, 3)


Unnamed: 0,asin,artist,description
0,1501348,ron kenoly,"lenny leblanc,alex acuna,justo almario,tom bro..."
2,5123909,cedarmont kids,18 music videos for kids: do your ears hang lo...
5,5134188,cedarmont kids,vhs tape
7,26197898,the sudden passion,"when you first hear the sudden passion,you d p..."
18,307141950,golden books,"spring has come,so little critter and dad are ..."


In [9]:
musicians_names = [splitstr(troup) for troup in list(musics_products['artist'])]
musicians_names = [cleanstr(elem) for elem in np.concatenate(musicians_names)]
musicians_names = list(set(musicians_names))

# weird... 
print(musicians_names[:5])
#amazon_actor_names = amazon_actor_names[:1]+ amazon_actor_names[2:] 
musicians_names = musicians_names[1:]

amazon_musicians = pd.DataFrame(musicians_names, columns=['Name'])
amazon_musicians['Birth Date'] = pd.Series()
amazon_musicians['Death Date'] = pd.Series()
amazon_musicians.head()

['', 'virgin steele', 'elena paparizou', 'master joe', 'fernandi']


Unnamed: 0,Name,Birth Date,Death Date
0,virgin steele,,
1,elena paparizou,,
2,master joe,,
3,fernandi,,
4,garageland,,


In [10]:
''' Returns the lines of a .txt file, with '\n' characters removed 
'''
def read_txt(path) : 
    file = open(path, "r")
    tmplist = file.read().split("\n")
    # the last line is an empty line
    return tmplist[:len(tmplist)-1]

def write_lines(iterable, f_out) :
    outputfile = open(f_out, 'w')
    for elem in iterable : 
        outputfile.write(elem+'\n')
    outputfile.close()

englishwords = read_txt('DATA/englishwords.txt')

def is_name(name):
    if len(name.split()) <= 1 : 
        return False
    for word in name.split(' ') :
        if word in englishwords or any(char.isdigit() for char in word):
            return False
    return True
    '''only_names = []
    for word in list_:
        decompose = word.split()
        if len(decompose) == 2:
            firstg = decompose[0]
            if firstg not in ['the','The','le','les','los','la']:
                only_names.append(word)
    
    return only_names
    '''

In [11]:
#single_interprets = is_name(list(musicians_names)) 
amazon_musicians_singles = amazon_musicians[amazon_musicians['Name'].apply(is_name)==True]
single_interprets = amazon_musicians_singles['Name'].tolist()

In [12]:
print(len(musicians_names), ' reduced to ', amazon_musicians_singles.shape)
amazon_musicians_singles.head()

162575  reduced to  (41308, 3)


Unnamed: 0,Name,Birth Date,Death Date
1,elena paparizou,,
13,danny krivit,,
23,gary nicholson,,
26,bennie hess,,
30,nicolaus harnoncourt,,


In [13]:
nondead_musicians = pd.DataFrame(single_interprets, columns = ['clname'])

for idx, name in enumerate(dead_musicians['clname'].tolist()):
    if idx % 100 == 0 : 
        print(idx)
    mapp = nondead_musicians['clname'].str.contains(name)
    if mapp.any():
        nondead_musicians = nondead_musicians[mapp==False] 

nondead_musicians['Birth Date'] = pd.Series()
nondead_musicians['Death Date'] = pd.Series()
nondead_musicians.head()

controls = nondead_musicians

all_musicians = pd.concat((dead_musicians, nondead_musicians))

0
100
200
300
400


In [14]:
len(controls), len(dead_musicians)

(41038, 424)

## 4) Matching dead musicians and meta data

In [15]:
pd.options.mode.chained_assignment = None 
meta_dead_musicians = pd.DataFrame()

for idx, musician in enumerate(list(dead_musicians['clname'])):
    
    match = pd.DataFrame()
    match = musics_products[ \
                (musics_products['artist'].str.contains(musician)==True) | 
                (musics_products['description'].str.contains(musician)==True) \
            ].drop('artist',axis=1)
    
    match['name'] = musician
    match['death date'] = dead_musicians['Death Date'].get_value(idx)
    match['birth date'] = dead_musicians['Birth Date'].get_value(idx)
    meta_dead_musicians = meta_dead_musicians.append(match)
    
    if idx %100 == 0: 
        print('{} in {} done'.format(idx, len(dead_musicians['clname'])))

0 in 424 done
100 in 424 done
200 in 424 done
300 in 424 done
400 in 424 done


In [16]:
meta_dead_musicians.head()
len(meta_dead_musicians.groupby('name').count()), len(meta_dead_musicians)

(368, 26173)

## 5) Subsample control musicians and match meta data

In [17]:
from numpy.random import randint


def subsample_df (df, nbWanted) : 
    assert nbWanted < len(df)
    randomIndex = randint(0, len(df)-1, nbWanted)
    return df.loc[randomIndex]

sub_controls = subsample_df(controls, cps*len(dead_musicians))
sub_controls = sub_controls.dropna(how='all')
print(sub_controls.shape)
sub_controls.head(3)

(1684, 3)


Unnamed: 0,clname,Birth Date,Death Date
9691,tike jah fakoly,,
21169,patricia risley,,
34685,marie bergman,,


In [18]:
pd.options.mode.chained_assignment = None
meta_control_musicians = pd.DataFrame()

for idx, musician in enumerate(list(sub_controls['clname'])):
    
    #musician = musician.replace("(","").replace(")","")
    match = pd.DataFrame()
    try : 
        mapping_name = musics_products['artist'].str.contains(musician)
    except TypeError : 
        print(musician)
        raise TypeError
        
    mapping_descp = musics_products['description'].str.contains(musician)
    
    if any(mapping_name)|any(mapping_descp) : 
        match = musics_products[(mapping_name==True)|(mapping_descp==True)].drop('artist',axis=1)
        match['name'] = musician
        match['death date'] = 'unknown'
        match['birth date'] = 'unknown'
        meta_control_musicians = meta_control_musicians.append(match)
    
    else: 
        print('no matching for control name: {}'.format(musician))
    
    if idx %100 == 0: print('{} out of {}'.format(idx, len(sub_controls['clname'])))

0 out of 1684
100 out of 1684
200 out of 1684
300 out of 1684
400 out of 1684
500 out of 1684
600 out of 1684
700 out of 1684
800 out of 1684
900 out of 1684
1000 out of 1684
1100 out of 1684
1200 out of 1684
1300 out of 1684
1400 out of 1684
1500 out of 1684
1600 out of 1684


In [19]:
meta_control_musicians.head()
len(meta_control_musicians.groupby('name').count())

1652

## 6) Loading Reviews data for Movies and TV and cleaning

In [20]:
datapath = 'DATA/review/'
filename = 'reviews_CDs_and_Vinyl.json.gz'

In [21]:
import gzip
def gz_to_dataframe(datapath, filename):
    def parse(path): 
        g = gzip.open(path, 'rb') 
        for l in g: 
            yield eval(l) 
    def getDF(path): 
        i = 0 
        df = {} 
        for d in parse(path): 
            df[i] = d 
            i += 1 
        return pd.DataFrame.from_dict(df, orient='index') 
    return getDF(datapath+filename)
    
reviews_df = gz_to_dataframe(datapath, filename)

In [22]:
print(reviews_df.shape)
reviews_df.head()

(3749004, 9)


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A9DMTMLFR9CO5,1393774,Albert Luguterah,"[0, 0]",fantastic. old time religion is good for me. t...,5.0,i love it,1377907200,"08 31, 2013"
1,AHG1GTQZUYNJN,1393774,CAROLYNE CHAMBERLAIN,"[0, 0]",I HAD THE ALBUM FOR YEARS AGO ....AND I AM VER...,5.0,PURE JOY!,1372723200,"07 2, 2013"
2,A2TFO7NREP2B2D,1393774,cindy terpening_smith,"[0, 0]",Pure praise to throne room. He had a unique st...,5.0,pure,1396396800,"04 2, 2014"
3,A2YAPAG1IPNK7K,1393774,diane tousley,"[0, 0]",I have always loved Keith Green's music and ha...,5.0,Love this CD!,1392422400,"02 15, 2014"
4,AEKGGV851HY3K,1393774,D. MILLS,"[13, 15]",Keith Green had a passionate love for Jesus. ...,5.0,Passionate Faith Is Contagious,1130803200,"11 1, 2005"


In [26]:
all_musicians_asins = pd.concat((meta_dead_musicians['asin'], meta_control_musicians['asin']))

def clean_reviews(review_df):
    filtered_reviews = reviews_df[reviews_df['asin'].isin(all_musicians_asins)]
    filtered_reviews = filtered_reviews[['asin','reviewText','summary','reviewTime']]
    filtered_reviews['reviewTime'] = list(pd.to_datetime(filtered_reviews['reviewTime'].str.replace(',','').str.replace(' ','-')))
    #filtered_reviews = filtered_reviews.drop('reviewTime',axis=1)
    
    return filtered_reviews

In [27]:
filtered_reviews = clean_reviews(reviews_df)

In [28]:
#filtered_reviews['reviewTime'] = filtered_reviews['ReviewTime']
filtered_reviews.set_index(['asin', 'reviewTime']).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reviewText,summary
asin,reviewTime,Unnamed: 2_level_1,Unnamed: 3_level_1
0563494255,2013-11-02,"Why I like it goes without saying, it is Agath...",Listening to the story unfold allows you to us...
073890015X,2000-01-29,i just kicked its just that simple if you were...,i loved it
073890015X,2006-06-02,"I'm really tired by all this ""metal"" stuff. It...",in case my previous review gets edited for con...
073890015X,2013-09-11,ok I guess a little over 2 hours was not enoug...,great late 90's concert
073890015X,2000-07-16,"I saw the show live, and enjoyed it very much....","great show, terrible coverage"


In [29]:
filtered_reviews.shape

(367153, 4)

## 7) Matching meta/dead data with review data

In [30]:
#metactors = meta_dead_actors[['asin','name','death date', 'title','categories','salesRank']].reset_index() #.set_index(['name','death date','asin'])

In [31]:
pd.options.mode.chained_assignment = None  # default='warn'

def matching_meta_reviews(filtered_reviews, meta_in):
    
    processed_reviews = pd.DataFrame()

    meta = meta_in.reset_index()    
    
    for idx, asin in enumerate(meta['asin']):
        match_reviews = pd.DataFrame()
        match_reviews = filtered_reviews[filtered_reviews['asin']==asin]
        
        match_reviews['name'] = meta['name'].get_value(idx)
        match_reviews['death date'] = meta['death date'].get_value(idx)
        match_reviews['birth date'] = meta['birth date'].get_value(idx)
        #match_reviews['title'] = meta['title'].get_value(idx)
        #match_reviews['categories'] = clean_serie(meta['categories']).get_value(idx)
        #match_reviews['salesRank'] = clean_serie(meta['salesRank']).get_value(idx)

        processed_reviews = processed_reviews.append(match_reviews)

        if idx %1000 == 0: 
            print('{} in {} done'.format(idx, len(meta['asin'])))
            #break
        
    return processed_reviews

In [32]:
processed_reviews_sbj = pd.DataFrame(matching_meta_reviews(filtered_reviews, meta_dead_musicians))

0 in 26173 done
1000 in 26173 done
2000 in 26173 done
3000 in 26173 done
4000 in 26173 done
5000 in 26173 done
6000 in 26173 done
7000 in 26173 done
8000 in 26173 done
9000 in 26173 done
10000 in 26173 done
11000 in 26173 done
12000 in 26173 done
13000 in 26173 done
14000 in 26173 done
15000 in 26173 done
16000 in 26173 done
17000 in 26173 done
18000 in 26173 done
19000 in 26173 done
20000 in 26173 done
21000 in 26173 done
22000 in 26173 done
23000 in 26173 done
24000 in 26173 done
25000 in 26173 done
26000 in 26173 done


In [33]:
processed_reviews_sbj.head()

Unnamed: 0,asin,reviewText,summary,reviewTime,name,death date,birth date
2685909,B000KB9Y6U,I bought this item because it did not say vyna...,Should be more descriptive in the title.,2013-05-23,paul delph,1996-05-21,1957
2685910,B000KB9Y6U,Chrissy Faith is wonderful I must say!! What ...,You should listen this!!,2013-11-09,paul delph,1996-05-21,1957
2685911,B000KB9Y6U,this product came quickly and there were no is...,great music,2013-04-26,paul delph,1996-05-21,1957
2549964,B000EU1H44,To put it short - the sounds are very explorat...,Enchanting,2008-07-11,jacob druckman,1996-05-24,1928
2696697,B000LP4O1K,This is a delightful recording. It's lite enou...,Pleasing,2011-05-19,jacob druckman,1996-05-24,1928


## 8) Matching controls and reviews

In [34]:
processed_reviews_ctrls = pd.DataFrame(matching_meta_reviews(filtered_reviews, meta_control_musicians))

0 in 10857 done
1000 in 10857 done
2000 in 10857 done
3000 in 10857 done
4000 in 10857 done
5000 in 10857 done
6000 in 10857 done
7000 in 10857 done
8000 in 10857 done
9000 in 10857 done
10000 in 10857 done


## Save files

In [36]:
processed_reviews_sbj.to_csv('DATA/review_filtered/music_reviews_matched_'+cps+'cps.csv')

In [37]:
processed_reviews_ctrls.to_csv('DATA/review_filtered/music_reviews_matched_'+cps+'cps_ctrls.csv')