# Task 1: Using RLTK to perform Entity Resolution (ER)

## Dataset analysis & RLTK components construction

### Task 1-1. Construct RLTK Datasets

First, you need define how a single entry would like for each type of record (for each dataset)

In [1]:
import rltk
import json

tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

In [2]:
import pandas as pd
imdb_df = pd.read_json('imdb_tvseries.jsonl', lines=True)

In [3]:
def filtername(x):
    names = [i['name'] for i in x if type(i) == dict]
    return names

imdb_df['top_cast'] = imdb_df['top_cast'].apply(filtername)

In [4]:
imdb_df.drop_duplicates(['title', 'plot', 'num_seasons'], inplace=True)

In [5]:
imdb_df['id'] = range(0, len(imdb_df))

In [6]:
imdb_df['id'] = imdb_df['id'].astype(str)

In [7]:
imdb_df.head()

Unnamed: 0,title,imdb_rating,plot,genre,creators,stars,top_cast,num_seasons,release_date,language,country_of_origin,production_company,id
0,The Peripheral,8.3,Set in the future when technology has subtly a...,"[Drama, Mystery, Sci-Fi]",[],"[Chloë Grace Moretz, Gary Carr, Jack Reynor]","[Chloë Grace Moretz, Gary Carr, Jack Reynor, L...",1,"[October 21, 2022 (United States)]",[English],[United States],"[Amazon Studios, Big Indie Pictures, Kilter Fi...",0
1,KinnPorsche the Series La 'forte,8.8,"Kinn, the second son of a prominent mafia head...","[Action, Comedy, Romance]",[],"[Mile Phakphum Romsaithong, Apo Nattawin Watta...","[Mile Phakphum Romsaithong, Apo Nattawin Watta...",1,"[April 2, 2022 (Thailand)]","[Thai, English]",[Thailand],[Be on Cloud],1
2,Wayne,8.4,"Wayne, a 16 year-old Dirty Harry with a heart ...","[Action, Comedy]",[Shawn Simmons],"[Mark McKenna, Ciara Bravo, Joshua J. Williams]","[Mark McKenna, Ciara Bravo, Joshua J. Williams...",1,"[January 16, 2019 (United States)]",[English],[United States],"[Endeavor Content, Reese Wernick Productions]",2
3,Henry Danger,5.7,"Looking for a part-time job, 13-year-old Henry...","[Action, Comedy, Family]","[Dana Olsen, Dan Schneider, Dana Olsen]","[Jace Norman, Cooper Barnes, Riele Downs]","[Jace Norman, Cooper Barnes, Riele Downs, Sean...",5,"[July 26, 2014 (United States)]",[English],[United States],"[Schneider's Bakery, Uptown Productions]",3
4,The Legend of Korra,8.4,Avatar Korra fights to keep Republic City safe...,"[Animation, Action, Adventure]","[Michael Dante DiMartino, Bryan Konietzko]","[Janet Varney, P.J. Byrne, David Faustino]","[Janet Varney, P.J. Byrne, David Faustino, J.K...",4,"[April 14, 2012 (United States)]",[English],[United States],"[Ginormous Madman, Nickelodeon Animation Studios]",4


In [8]:
rt_df = pd.read_json('tvshows.jsonl', lines=True)

In [9]:
rt_df.drop_duplicates(['title', 'description', 'premiere', 'network'], inplace=True)

In [10]:
rt_df['id'] = range(0, len(rt_df))
rt_df['id'] = rt_df['id'].astype(str)
rt_df.head()

Unnamed: 0,title,creators,ratings,description,starring,network,genre,premiere,producer,id
0,Star Wars: Tales of the Jedi,"[Dave Filoni, Charles Murray]","[100%, 95%]",Six brand-new animated shorts featuring parabl...,"[Ashley Eckstein, Corey Burton, Liam Neeson, M...",Disney+,[Action],"Oct 26, 2022","[Dave Filoni, Charles Murray]",0
1,12 Monkeys,[],"[88%, 77%]","A man from the post-apocalyptic future, Cole u...","[Aaron Stanford, Amanda Schull, Barbara Sukowa...",SYFY,[Drama],"Jan 16, 2015","[Charles Roven, Richard Suckle, Travis Fickett...",1
2,Genndy Tartakovsky's Primal,[Genndy Tartakovsky],"[100%, 95%]","At the dawn of evolution, a caveman and a dino...",[Aaron LaPlante],Cartoon Network,[Action],"Oct 7, 2019",[],2
3,One Piece,[],[88%],Monkey D. Luffy wants to become the King of al...,"[Mayumi Tanaka, Kazuya Nakai, Akemi Okamura, K...",Fox,[Kids family],"Oct 20, 1999",[],3
4,Fullmetal Alchemist Brotherhood,[],"[100%, 92%]",Brothers Edward and Alphonse Elric search for ...,"[Romi Pak, Rie Kugimiya, Miyoko Aso, Megumi To...",MBSTVJP,[Action],"Apr 5, 2009",[],4


In [11]:
wikidata_df = pd.read_json('wikidata.jsonl')
wikidata_df.fillna(0, inplace=True)
wikidata_df['num_seasons'] = wikidata_df['num_seasons'].astype('int32')
wikidata_df.replace(0, '' , inplace=True)

In [12]:
wikidata_df.head()

Unnamed: 0,id,title,country_of_origin,num_seasons,start_time,end_time,cast_members,producers,original_broadcasters,distributed_by,genres,languages,awards_received,review_scores,series_spin_off
0,Q115647,The Hollow Crown,United Kingdom,2,2012-06-30 00:00:00+00:00,2012-07-21 00:00:00+00:00,"Tom Hiddleston, Jeremy Irons, Ben Whishaw, Sim...",,BBC Two,,costume drama,English,British Academy Television Award for Best Acto...,,
1,Q115874,"El Zorro, la espada y la rosa",Colombia,1,2007-02-12 00:00:00+00:00,2007-07-23 00:00:00+00:00,Christian Meier,,Telemundo,Telemundo Internacional,telenovela,Spanish,,,
2,Q494,Beakman's World,United States of America,4,1992-09-16 00:00:00+00:00,1998-08-01 00:00:00+00:00,"Paul Zaloom, Mark Ritts, Eliza Schneider, Sent...",,"TLC, CBS",Columbia Pictures Television,comedic television series,English,,,
3,Q723,Rookie Blue,Canada,6,2010-06-24 00:00:00+00:00,2015-07-29 00:00:00+00:00,"Gregory Smith, Missy Peregrym, Charlotte Sulli...",,Global Television Network,Entertainment One,"drama, police procedural, LGBTI+ related TV se...",English,,,
4,Q961,More Than Life at Stake,Poland,1,1968-10-10 00:00:00+00:00,1968-01-01 00:00:00+00:00,"Jan Englert, Krystyna Feldman, Zygmunt Kęstowi...",,Telewizja Polska,,"espionage television series, war television se...",Polish,,,


In [13]:
import re

class IMDB(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''
    
    def get_date(self, date):
        month_index = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
        if '(' in date:
            date = date.rsplit(' (', maxsplit=1)[0]
        split_date = date.split(' ')
        if '' in split_date:
            return date
        elif len(split_date) == 1:
            return split_date[0]
        elif len(split_date) == 2:
            return str(month_index.index(split_date[0])+1) + '/' + split_date[1]
        else:
            return '/'.join([str(month_index.index(split_date[0])+1), split_date[1].strip(','), split_date[2]])
    
    def get_year(self, date):
        if '(' in date:
            date = date.rsplit(' (', maxsplit=1)[0]
        split_date = date.split(' ')
        if '' in split_date:
            return date
        elif len(split_date) == 1:
            return split_date[0]
        elif len(split_date) == 2:
            return split_date[1]
        else:
            return split_date[2]
        
    @rltk.cached_property
    def id(self):
        return self.raw_object['id']

    @rltk.cached_property
    def title(self):
        return self.raw_object['title']

    @rltk.cached_property
    def title_tokens(self):
        return set(tokenizer.tokenize(self.title))
    
    @rltk.cached_property
    def imdb_rating(self):
        return self.raw_object['imdb_rating']
    
    @rltk.cached_property
    def plot(self):
        return self.raw_object['plot']
    
    @rltk.cached_property
    def genre(self):
        return self.raw_object['genre']
    
    @rltk.cached_property
    def creators(self):
        return self.raw_object['creators']
    
    @rltk.cached_property
    def stars(self):
        return self.raw_object['stars']
    
    @rltk.cached_property
    def top_cast(self):
        return self.raw_object['top_cast']
    
    @rltk.cached_property
    def num_seasons(self):
        return self.raw_object['num_seasons']
    
    @rltk.cached_property
    def release_date(self):
        return self.get_date(self.raw_object['release_date'][0])
    
    @rltk.cached_property
    def release_year(self):
        return self.get_year(self.raw_object['release_date'][0])
    
    @rltk.cached_property
    def language(self):
        return self.raw_object['language']
    
    @rltk.cached_property
    def country_of_origin(self):
        return self.raw_object['country_of_origin']
    
    @rltk.cached_property
    def production_company(self):
        return self.raw_object['production_company']

class RottenTomatoes(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    def get_date(self, date):
        if not date:
            return ''
        month_index = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        split_date = date.split(' ')
        if '' in split_date:
            return date
        elif len(split_date) == 1:
            return split_date[0]
        elif len(split_date) == 2:
            return str(month_index.index(split_date[0])+1) + '/' + split_date[1]
        else:
            return '/'.join([str(month_index.index(split_date[0])+1), split_date[1].strip(','), split_date[2]])
        
    def get_year(self, date):
        if not date:
            return ''
        split_date = date.split(' ')
        if '' in split_date:
            return date
        elif len(split_date) == 1:
            return split_date[0]
        elif len(split_date) == 2:
            return split_date[1]
        else:
            return split_date[2]
        
    @rltk.cached_property
    def id(self):
        return self.raw_object['id']
    
    @rltk.cached_property
    def title(self):
        return self.raw_object['title']

    @rltk.cached_property
    def title_tokens(self):
        return set(tokenizer.tokenize(self.title))
    
    @rltk.cached_property
    def creators(self):
        return self.raw_object['creators']
    
    @rltk.cached_property
    def rt_rating(self):
        return self.raw_object['ratings']
    
    @rltk.cached_property
    def plot(self):
        return self.raw_object['description']
    
    @rltk.cached_property
    def genre(self):
        return self.raw_object['genre']
    
    @rltk.cached_property
    def stars(self):
        return self.raw_object['starring']
    
    @rltk.cached_property
    def release_date(self):
        return self.get_date(self.raw_object['premiere'])
    
    @rltk.cached_property
    def release_year(self):
        return self.get_year(self.raw_object['premiere'])
    
    @rltk.cached_property
    def producer(self):
        return self.raw_object['producer']
    
    @rltk.cached_property
    def network(self):
        return self.raw_object['network']
    
class Wikidata(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    def get_date(self, date):
        if not date:
            return ''
        month_index = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        date = date.date()
        year = date.year
        day = date.day
        month = date.month
        return str(month_index[month - 1]) + ' ' + str(day) + ', ' + str(year)
        
    def get_year(self, date):
        if not date:
            return ''
        return date.date().year
        
    @rltk.cached_property
    def id(self):
        return self.raw_object['id']
    
    @rltk.cached_property
    def title(self):
        return self.raw_object['title']

    @rltk.cached_property
    def title_tokens(self):
        return set(tokenizer.tokenize(self.title))
    
    @rltk.cached_property
    def num_seasons(self):
        if self.raw_object['num_seasons']:
            return int(self.raw_object['num_seasons'])
        return self.raw_object['num_seasons']
    
    @rltk.cached_property
    def review_scores(self):
        return self.raw_object['review_scores'].split(', ')
    
    @rltk.cached_property
    def series_spin_off(self):
        return self.raw_object['series_spin_off'].split(', ')
    
    @rltk.cached_property
    def genre(self):
        return self.raw_object['genres'].split(', ')
    
    @rltk.cached_property
    def stars(self):
        return self.raw_object['cast_members'].split(', ')
    
    @rltk.cached_property
    def release_date(self):
        return self.get_date(self.raw_object['start_time'])
    
    @rltk.cached_property
    def end_date(self):
        return self.get_date(self.raw_object['end_time'])
    
    @rltk.cached_property
    def release_year(self):
        return self.get_year(self.raw_object['start_time'])
    
    @rltk.cached_property
    def producer(self):
        return self.raw_object['producers'].split(', ')
    
    @rltk.cached_property
    def network(self):
        return self.raw_object['original_broadcasters'].split(', ')
    
    @rltk.cached_property
    def country_of_origin(self):
        return self.raw_object['country_of_origin'].split(', ')
    
    @rltk.cached_property
    def language(self):
        return self.raw_object['languages'].split(', ')
    
    @rltk.cached_property
    def awards_received(self):
        return self.raw_object['awards_received'].split(', ')

In [14]:
wikidata_df.columns

Index(['id', 'title', 'country_of_origin', 'num_seasons', 'start_time',
       'end_time', 'cast_members', 'producers', 'original_broadcasters',
       'distributed_by', 'genres', 'languages', 'awards_received',
       'review_scores', 'series_spin_off'],
      dtype='object')

In [15]:
ds1 = rltk.Dataset(rltk.DataFrameReader(imdb_df), record_class=IMDB)
ds2 = rltk.Dataset(rltk.DataFrameReader(rt_df), record_class=RottenTomatoes)
ds3 = rltk.Dataset(rltk.DataFrameReader(wikidata_df), record_class=Wikidata)

And we can inspect a few entries:

In [16]:
ds1.generate_dataframe().head()

Unnamed: 0,id,title,title_tokens,imdb_rating,plot,genre,creators,stars,top_cast,num_seasons,release_date,release_year,language,country_of_origin,production_company
0,0,The Peripheral,"{Peripheral, The}",8.3,Set in the future when technology has subtly a...,"[Drama, Mystery, Sci-Fi]",[],"[Chloë Grace Moretz, Gary Carr, Jack Reynor]","[Chloë Grace Moretz, Gary Carr, Jack Reynor, L...",1,10/21/2022,2022,[English],[United States],"[Amazon Studios, Big Indie Pictures, Kilter Fi..."
1,1,KinnPorsche the Series La 'forte,"{the, Series, La, KinnPorsche, forte, '}",8.8,"Kinn, the second son of a prominent mafia head...","[Action, Comedy, Romance]",[],"[Mile Phakphum Romsaithong, Apo Nattawin Watta...","[Mile Phakphum Romsaithong, Apo Nattawin Watta...",1,4/2/2022,2022,"[Thai, English]",[Thailand],[Be on Cloud]
2,2,Wayne,{Wayne},8.4,"Wayne, a 16 year-old Dirty Harry with a heart ...","[Action, Comedy]",[Shawn Simmons],"[Mark McKenna, Ciara Bravo, Joshua J. Williams]","[Mark McKenna, Ciara Bravo, Joshua J. Williams...",1,1/16/2019,2019,[English],[United States],"[Endeavor Content, Reese Wernick Productions]"
3,3,Henry Danger,"{Henry, Danger}",5.7,"Looking for a part-time job, 13-year-old Henry...","[Action, Comedy, Family]","[Dana Olsen, Dan Schneider, Dana Olsen]","[Jace Norman, Cooper Barnes, Riele Downs]","[Jace Norman, Cooper Barnes, Riele Downs, Sean...",5,7/26/2014,2014,[English],[United States],"[Schneider's Bakery, Uptown Productions]"
4,4,The Legend of Korra,"{Korra, of, Legend, The}",8.4,Avatar Korra fights to keep Republic City safe...,"[Animation, Action, Adventure]","[Michael Dante DiMartino, Bryan Konietzko]","[Janet Varney, P.J. Byrne, David Faustino]","[Janet Varney, P.J. Byrne, David Faustino, J.K...",4,4/14/2012,2012,[English],[United States],"[Ginormous Madman, Nickelodeon Animation Studios]"


In [17]:
ds2.generate_dataframe().head()

Unnamed: 0,id,title,title_tokens,creators,rt_rating,plot,genre,stars,release_date,release_year,producer,network
0,0,Star Wars: Tales of the Jedi,"{the, Star, Tales, :, Wars, Jedi, of}","[Dave Filoni, Charles Murray]","[100%, 95%]",Six brand-new animated shorts featuring parabl...,[Action],"[Ashley Eckstein, Corey Burton, Liam Neeson, M...",10/26/2022,2022,"[Dave Filoni, Charles Murray]",Disney+
1,1,12 Monkeys,"{Monkeys, 12}",[],"[88%, 77%]","A man from the post-apocalyptic future, Cole u...",[Drama],"[Aaron Stanford, Amanda Schull, Barbara Sukowa...",1/16/2015,2015,"[Charles Roven, Richard Suckle, Travis Fickett...",SYFY
2,2,Genndy Tartakovsky's Primal,"{Genndy, s, Primal, ', Tartakovsky}",[Genndy Tartakovsky],"[100%, 95%]","At the dawn of evolution, a caveman and a dino...",[Action],[Aaron LaPlante],10/7/2019,2019,[],Cartoon Network
3,3,One Piece,"{Piece, One}",[],[88%],Monkey D. Luffy wants to become the King of al...,[Kids family],"[Mayumi Tanaka, Kazuya Nakai, Akemi Okamura, K...",10/20/1999,1999,[],Fox
4,4,Fullmetal Alchemist Brotherhood,"{Brotherhood, Fullmetal, Alchemist}",[],"[100%, 92%]",Brothers Edward and Alphonse Elric search for ...,[Action],"[Romi Pak, Rie Kugimiya, Miyoko Aso, Megumi To...",4/5/2009,2009,[],MBSTVJP


In [18]:
ds3.generate_dataframe().head()

Unnamed: 0,id,title,title_tokens,num_seasons,review_scores,series_spin_off,genre,stars,release_date,end_date,release_year,producer,network,country_of_origin,language,awards_received
0,Q115647,The Hollow Crown,"{Hollow, Crown, The}",2,[],[],[costume drama],"[Tom Hiddleston, Jeremy Irons, Ben Whishaw, Si...","Jun 30, 2012","Jul 21, 2012",2012,[],[BBC Two],[United Kingdom],[English],[British Academy Television Award for Best Act...
1,Q115874,"El Zorro, la espada y la rosa","{El, la, y, ,, espada, Zorro, rosa}",1,[],[],[telenovela],[Christian Meier],"Feb 12, 2007","Jul 23, 2007",2007,[],[Telemundo],[Colombia],[Spanish],[]
2,Q494,Beakman's World,"{Beakman, s, ', World}",4,[],[],[comedic television series],"[Paul Zaloom, Mark Ritts, Eliza Schneider, Sen...","Sep 16, 1992","Aug 1, 1998",1992,[],"[TLC, CBS]",[United States of America],[English],[]
3,Q723,Rookie Blue,"{Rookie, Blue}",6,[],[],"[drama, police procedural, LGBTI+ related TV s...","[Gregory Smith, Missy Peregrym, Charlotte Sull...","Jun 24, 2010","Jul 29, 2015",2010,[],[Global Television Network],[Canada],[English],[]
4,Q961,More Than Life at Stake,"{Life, Stake, at, More, Than}",1,[],[],"[espionage television series, war television s...","[Jan Englert, Krystyna Feldman, Zygmunt Kęstow...","Oct 10, 1968","Jan 1, 1968",1968,[],[Telewizja Polska],[Poland],[Polish],[]


### Task 1-2. Blocking

First, we'll load dev set to evaluate both blocking (Task 1-2) and entity linking (Task 1-3).

In [19]:
total_number_pairs = len(list(rltk.get_record_pairs(ds1, ds2)))
total_number_pairs

9210718

In [20]:
## blocking on name string
def blocking(ds1, ds2):
    bg = rltk.HashBlockGenerator()
    block = bg.generate(
                bg.block(ds1, function_=lambda r: r.title[:5]),
                bg.block(ds2, function_=lambda r: r.title[:5]))
    return block

### Task 1-3. Entity Linking

In [21]:
## finding similarity between title tokens
def title_tokens_similarity(r1, r2):
    return rltk.jaccard_index_similarity(r1.title_tokens, r2.title_tokens)

## checking if premiere data is same
def creator_similarity(r1, r2):
    if r1.creators and r2.creators:
        if len(set(r1.creators).intersection(set(r2.creators))) == 0:
            return -1
        return len(set(r1.creators).intersection(set(r2.creators))) / len(r2.creators)
    return 0

## checking similarity between title using Needleman Wunsch
def title_similarity(r1, r2):
    return rltk.needleman_wunsch_similarity(r1.title.lower(), r2.title.lower())

## checking similarity between cast
def cast_similarity(r1, r2):
    if r1.top_cast and r2.stars:
        if len(set(r1.top_cast).intersection(set(r2.stars))) == 0:
            return -1
        return len(set(r1.top_cast).intersection(set(r2.stars))) / len(r2.stars)
    return 0

## checking similarity between release year
def premiere_similarity(r1, r2):
    if r1.release_year and r2.release_year and r1.release_year != 'NA':
        if r1.release_date == r2.release_date:
            return 1
        elif r1.release_year == r2.release_year:
            return 0.9
        elif abs(int(r1.release_year) - int(r2.release_year)) <= 5:
            return math.pow(1.8, -1 * abs(int(r1.release_year) - int(r2.release_year)))
        elif abs(int(r1.release_year) - int(r2.release_year)) > 15:
            return -0.8
        elif abs(int(r1.release_year) - int(r2.release_year)) > 10:
            return -0.5
    return 0

In [22]:
# threshold value to determine if we are confident the record match
MY_TRESH = 0.75

# entity linkage scoring function
def rule_based_method(r1, r2):
    score_1 = title_tokens_similarity(r1, r2)
    score_2 = title_similarity(r1, r2)
    score_3 = creator_similarity(r1, r2)
    score_4 = cast_similarity(r1, r2)
    score_5 = premiere_similarity(r1, r2)
     
    total = 0.4 * score_1 + 0.3 * score_2 + 0.08 * score_3 + 0.06 * score_4 + 0.16 * score_5
    
    # return two values: 0 if they don't match and 1 if they match, float to determine confidence
    return int(total >= MY_TRESH), total

### Task 1-4. Record Linking

In [23]:
import math

valid_pairs = []
block_pairs = rltk.get_record_pairs(ds1, ds2, block=blocking(ds1, ds2))
for r1, r2 in block_pairs:
    result, confidence = rule_based_method(r1, r2)
    if result:
        valid_pairs.append((r1.id, r1.title, r2.id, r2.title, confidence))

In [24]:
len(valid_pairs), len(ds1.generate_dataframe()), len(ds2.generate_dataframe())

(658, 5014, 1837)

In [25]:
import csv
with open('valid_predictions_id.csv', mode='w') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for row in valid_pairs:
        writer.writerow(row)

In [26]:
dt = pd.read_csv('valid_predictions_id.csv', names=['imdb_id', 'imdb_title', 'rt_id', 'rt_title', 'confidence'], dtype=str)


In [27]:
dt[dt.duplicated(['imdb_title','rt_title', 'imdb_id'], False)]

Unnamed: 0,imdb_id,imdb_title,rt_id,rt_title,confidence
557,1213,Taskmaster,1515,Taskmaster,0.874
558,1213,Taskmaster,1614,Taskmaster,0.7754348422496571


In [28]:
dt[dt.duplicated(['imdb_title','rt_title', 'rt_id'], False)]

Unnamed: 0,imdb_id,imdb_title,rt_id,rt_title,confidence
17,990,Dragon Ball Z,1634,Dragon Ball Z,0.88
18,453,Dragon Ball Z,1634,Dragon Ball Z,0.896


In [29]:
df_dict = {}
for index , record in dt.iterrows():
    imdb_id = record['imdb_id']
    rt_id = record['rt_id']
    idx = str(index+1)
    df_dict[idx] = {}
    df_dict[idx]['title'] = imdb_df[imdb_df['id']==imdb_id]['title'].values[0]
    df_dict[idx]['imdb_rating'] = imdb_df[imdb_df['id']==imdb_id]['imdb_rating'].values[0]
    df_dict[idx]['rt_rating'] = rt_df[rt_df['id']==rt_id]['ratings'].values[0]
    if not rt_df[rt_df['id']==rt_id]['description'].isna().bool():
        df_dict[idx]['plot'] = rt_df[rt_df['id']==rt_id]['description'].values[0]
    elif (imdb_df[imdb_df['id']==imdb_id]['plot'] != 'NA').bool():
        df_dict[idx]['plot'] = imdb_df[imdb_df['id']==imdb_id]['plot'].values[0]
    else:
        df_dict[idx]['plot'] = ''

    df_dict[idx]['genre'] = list(set(rt_df[rt_df['id']==rt_id]['genre'].values[0]).union(set(imdb_df[imdb_df['id']==imdb_id]['genre'].values[0])))

    df_dict[idx]['creators'] = []
    if len(imdb_df[imdb_df['id']==imdb_id]['creators']) > 0 and '' not in imdb_df[imdb_df['id']==imdb_id]['creators']:
        df_dict[idx]['creators'].extend(imdb_df[imdb_df['id']==imdb_id]['creators'].values[0])
    if len(rt_df[rt_df['id']==rt_id]['creators']) > 0 and '' not in rt_df[rt_df['id']==rt_id]['creators']:
        df_dict[idx]['creators'].extend(rt_df[rt_df['id']==rt_id]['creators'].values[0])
    df_dict[idx]['creators'] = list(set(df_dict[idx]['creators']))
    if '' in df_dict[idx]['creators']:
        df_dict[idx]['creators'].remove('')
    df_dict[idx]['cast'] = []
    if len(imdb_df[imdb_df['id']==imdb_id]['top_cast']) > 0:
        df_dict[idx]['cast'] = imdb_df[imdb_df['id']==imdb_id]['top_cast'].values[0]
    elif len(rt_df[rt_df['id']==rt_id]['starring']) > 0:
        df_dict[idx]['cast'] = rt_df[rt_df['id']==rt_id]['starring'].values[0]

    df_dict[idx]['release_date'] = ''
    if not rt_df[rt_df['id']==rt_id]['premiere'].isna().bool():
        df_dict[idx]['release_date'] = rt_df[rt_df['id']==rt_id]['premiere'].values[0]
    elif len(imdb_df[imdb_df['id']=='1539']['release_date']) > 0:
        df_dict[idx]['release_date'] = imdb_df[imdb_df['id']==imdb_id]['release_date'].values[0][0]

    df_dict[idx]['num_seasons'] = imdb_df[imdb_df['id']==imdb_id]['num_seasons'].values[0]
    df_dict[idx]['language'] = imdb_df[imdb_df['id']==imdb_id]['language'].values[0]
    df_dict[idx]['country_of_origin'] = imdb_df[imdb_df['id']==imdb_id]['country_of_origin'].values[0]
    df_dict[idx]['production_company'] = imdb_df[imdb_df['id']==imdb_id]['production_company'].values[0]
    df_dict[idx]['network'] = rt_df[rt_df['id']==rt_id]['network'].values[0]
    df_dict[idx]['producer'] = rt_df[rt_df['id']==rt_id]['producer'].values[0]



In [30]:
imdb_df['release_date'] = imdb_df['release_date'].apply(lambda x: x[0])

In [31]:
df_new = pd.DataFrame(df_dict).T

In [32]:
df_new.head()

Unnamed: 0,title,imdb_rating,rt_rating,plot,genre,creators,cast,release_date,num_seasons,language,country_of_origin,production_company,network,producer
1,The Peripheral,8.3,"[73%, 88%]",Set in the future when technology has subtly a...,"[Sci-Fi, Mystery, Drama]",[Scott B. Smith],"[Chloë Grace Moretz, Gary Carr, Jack Reynor, L...","Oct 21, 2022",1,[English],[United States],"[Amazon Studios, Big Indie Pictures, Kilter Fi...",Prime Video,"[Jonathan Nolan, Lisa Joy, Athena Wickham, Sco..."
2,The Purge,6.5,"[42%, 70%]",During a 12-hour period when all crime -- incl...,"[Action, Horror, Drama]",[James DeMonaco],"[Gabriel Chavarria, Derek Luke, Hannah Emily A...","Sep 4, 2018",2,"[English, Spanish, French]",[United States],"[Blumhouse Productions, Platinum Dunes]",USA,"[James DeMonaco, Sebastien K. Lemercier, Thoma..."
3,The Powerpuff Girls,3.5,[32%],In this reboot of the classic Cartoon Network ...,"[Kids family, Short, Action, Animation]",[Craig McCracken],"[Amanda Leighton, Natalie Palamides, Kristen L...","Apr 4, 2016",3,[English],"[United States, Mexico, Spain]","[Cartoon Network Studios, Mediaset España, Pla...",Cartoon Network,[]
4,The Penguins of Madagascar,7.5,[80%],A rookery of penguins with attitude -- leader ...,"[Kids family, Short, Action, Animation]","[Eric Darnell, Tom McGrath]","[John DiMaggio, Tom McGrath, Jeff Bennett, Jam...","Nov 29, 2008",3,[English],[United States],"[DreamWorks Animation, Nickelodeon Animation S...",Nickelodeon,"[Mark McCorkle, Bob Schooley]"
5,The Pact,7.3,[],Family members are strained past the breaking ...,"[Sci-Fi, Adventure, Drama]",[Bobby Barbacioru],"[Rick Ravanello, Natassia Malthe, Eyad Hourani...","Mar 26, 2022",1,[English],[Qatar],"[Katara Studios, Organic Media Group]",The Roku Channel,"[Luca Bercovici, Hussein Fakhri, Ahmed Al Baker]"


In [33]:
imdb_new = imdb_df[~imdb_df['id'].isin(dt['imdb_id'])]

In [34]:
imdb_new = imdb_new.drop(['stars', 'id'], axis=1)
imdb_new = imdb_new.rename(columns={'top_cast': 'cast'})

In [35]:
def get_imdb_date(date):
#     print(date)
    month_index = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    month_map = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    if '(' in date:
        date = date.rsplit(' (', maxsplit=1)[0]
    split_date = date.split(' ')
    if '' in split_date:
        return date
    elif len(split_date) == 1:
        return split_date[0]
    elif len(split_date) == 2:
        return str(month_map[month_index.index(split_date[0])]) + ' ' + split_date[1]
    else:
        return str(month_map[month_index.index(split_date[0])]) + ' ' + split_date[1].strip(',') + ', ' + split_date[2]
    

In [36]:
imdb_new['release_date'] = imdb_new['release_date'].apply(lambda x: get_imdb_date(x))

In [37]:
imdb_new['release_date']

1        Apr 2, 2022
2       Jan 16, 2019
3       Jul 26, 2014
5       Jan 26, 1979
6        Dec 7, 2014
            ...     
5009    Mar 21, 2010
5010    Nov 20, 2005
5011            2022
5012    May 29, 2002
5013    Jun 13, 1979
Name: release_date, Length: 4357, dtype: object

In [38]:
df_new = pd.concat([df_new, imdb_new], axis=0)

In [39]:
rt_new = rt_df[~rt_df['id'].isin(dt['rt_id'])]
rt_new = rt_new.drop(['id'], axis=1)
rt_new = rt_new.rename(columns={'starring': 'cast', 'ratings': 'rt_rating', 'description': 'plot', 'premiere': 'release_date'})


In [40]:
df_new = pd.concat([df_new, rt_new], axis=0)

In [41]:
df_new.head(40)

Unnamed: 0,title,imdb_rating,rt_rating,plot,genre,creators,cast,release_date,num_seasons,language,country_of_origin,production_company,network,producer
1,The Peripheral,8.3,"[73%, 88%]",Set in the future when technology has subtly a...,"[Sci-Fi, Mystery, Drama]",[Scott B. Smith],"[Chloë Grace Moretz, Gary Carr, Jack Reynor, L...","Oct 21, 2022",1.0,[English],[United States],"[Amazon Studios, Big Indie Pictures, Kilter Fi...",Prime Video,"[Jonathan Nolan, Lisa Joy, Athena Wickham, Sco..."
2,The Purge,6.5,"[42%, 70%]",During a 12-hour period when all crime -- incl...,"[Action, Horror, Drama]",[James DeMonaco],"[Gabriel Chavarria, Derek Luke, Hannah Emily A...","Sep 4, 2018",2.0,"[English, Spanish, French]",[United States],"[Blumhouse Productions, Platinum Dunes]",USA,"[James DeMonaco, Sebastien K. Lemercier, Thoma..."
3,The Powerpuff Girls,3.5,[32%],In this reboot of the classic Cartoon Network ...,"[Kids family, Short, Action, Animation]",[Craig McCracken],"[Amanda Leighton, Natalie Palamides, Kristen L...","Apr 4, 2016",3.0,[English],"[United States, Mexico, Spain]","[Cartoon Network Studios, Mediaset España, Pla...",Cartoon Network,[]
4,The Penguins of Madagascar,7.5,[80%],A rookery of penguins with attitude -- leader ...,"[Kids family, Short, Action, Animation]","[Eric Darnell, Tom McGrath]","[John DiMaggio, Tom McGrath, Jeff Bennett, Jam...","Nov 29, 2008",3.0,[English],[United States],"[DreamWorks Animation, Nickelodeon Animation S...",Nickelodeon,"[Mark McCorkle, Bob Schooley]"
5,The Pact,7.3,[],Family members are strained past the breaking ...,"[Sci-Fi, Adventure, Drama]",[Bobby Barbacioru],"[Rick Ravanello, Natassia Malthe, Eyad Hourani...","Mar 26, 2022",1.0,[English],[Qatar],"[Katara Studios, Organic Media Group]",The Roku Channel,"[Luca Bercovici, Hussein Fakhri, Ahmed Al Baker]"
6,The Last of Us,,[],Joel and Ellie must survive ruthless killers a...,"[Action, Adventure, Drama]","[Neil Druckmann, Craig Mazin]","[Pedro Pascal, Bella Ramsey, Gabriel Luna, Nic...","Jan 1, 2023",1.0,[English],"[United States, Canada]","[Naughty Dog, PlayStation Productions, Sony Pi...",HBO,[]
7,The Loud House,7.0,[82%],"Things are crowded in the Loud household, with...","[Kids family, Short, Adventure, Animation]","[Michael Rubiner, Chris Savino]","[Grey Griffin, Lara Jill Miller, Jessica DiCic...","May 2, 2016",6.0,[English],[United States],[Nickelodeon Animation Studios],Nickelodeon,[Chris Savino]
8,The Lincoln Lawyer,7.7,"[79%, 80%]",Idealistic lawyer Mickey Haller runs practice ...,"[Mystery, Crime, Drama]","[David E. Kelley, Ted Humphrey]","[Manuel Garcia-Rulfo, Neve Campbell, Becki New...","May 13, 2022",2.0,[English],[United States],"[A+E Studios, Algorithm Entertainment, David E...",Netflix,"[David E. Kelley, Ted Humphrey, Michael Connel..."
9,The Lord of the Rings: The Rings of Power,6.9,"[85%, 39%]",Prime Video's The Lord of the Rings: The Rings...,"[Fantasy, Action, Adventure, Drama]","[Patrick McKay, J.D. Payne, John D. Payne]","[Morfydd Clark, Ismael Cruz Cordova, Charlie V...","Sep 2, 2022",1.0,[English],[United States],"[Amazon Studios, Harper Collins Publishers, Ne...",Prime Video,"[J.D. Payne, Patrick McKay, Lindsey Weber, Cal..."
10,The Legend of Vox Machina,8.4,"[100%, 93%]","Vox Machina, a band of eight unlikely heroes, ...","[Animation, Fantasy, Action, Adventure]",[],"[Laura Bailey, Taliesin Jaffe, Ashley Johnson,...","Jan 28, 2022",1.0,[English],[United States],"[Amazon Studios, Critical Role Productions, Ti...",Prime Video,"[Brandon Auman, Chris Prynoski]"


In [42]:
import re

class IMDB_RT(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''
        
    @rltk.cached_property
    def id(self):
        return self.raw_object['id']

    @rltk.cached_property
    def title(self):
        return self.raw_object['title']

    @rltk.cached_property
    def title_tokens(self):
        return set(tokenizer.tokenize(self.title))
    
    @rltk.cached_property
    def imdb_rating(self):
        return self.raw_object['imdb_rating']
    
    @rltk.cached_property
    def rt_rating(self):
        return self.raw_object['rt_rating']
    
    @rltk.cached_property
    def plot(self):
        return self.raw_object['plot']
    
    @rltk.cached_property
    def genre(self):
        return self.raw_object['genre']
    
    @rltk.cached_property
    def creators(self):
        return self.raw_object['creators']
    
    @rltk.cached_property
    def cast(self):
        return self.raw_object['cast']
    
    @rltk.cached_property
    def num_seasons(self):
        return self.raw_object['num_seasons']
    
    @rltk.cached_property
    def release_date(self):
        return self.raw_object['release_date']
    
    def get_year(self, date):
        print(date)
        if not date:
            return ''
        split_date = date.split(' ')
        if '' in split_date:
            return date
        elif len(split_date) == 1:
            return split_date[0]
        elif len(split_date) == 2:
            return split_date[1]
        else:
            return split_date[2]
    
    @rltk.cached_property
    def release_year(self):
        return self.get_year(self.raw_object['release_date'])
    
    @rltk.cached_property
    def language(self):
        return self.raw_object['language']
    
    @rltk.cached_property
    def country_of_origin(self):
        return self.raw_object['country_of_origin']
    
    @rltk.cached_property
    def production_company(self):
        return self.raw_object['production_company']
    
    @rltk.cached_property
    def producer(self):
        return self.raw_object['producer']
    
    @rltk.cached_property
    def network(self):
        return self.raw_object['network']

In [43]:
df_new.reset_index(drop=True, inplace=True)

In [44]:
df_new['id'] = range(0, len(df_new))
df_new['id'] = df_new['id'].astype(str)
df_new.fillna('NA', inplace=True)

In [45]:
ds4 = rltk.Dataset(rltk.DataFrameReader(df_new), record_class=IMDB_RT)

Oct 21, 2022
Sep 4, 2018
Apr 4, 2016
Nov 29, 2008
Mar 26, 2022
Jan 1, 2023
May 2, 2016
May 13, 2022
Sep 2, 2022
Jan 28, 2022
Jun 29, 2014
Jun 16, 2022
Oct 10, 2015
Apr 14, 2012
Oct 28, 2022
Aug 30, 2019
Sep 14, 2018
Sep 13, 1996
Sep 13, 1996
Dec 23, 2021
Sep 6, 2010
Oct 28, 2022
Feb 26, 2020
Aug 10, 2022
Mar 4, 2022
Oct 11, 2004
Apr 24, 2022
Oct 28, 2022
Sep 23, 2013
Jul 3, 2020
Jul 26, 2019
Nov 6, 2005
Apr 3, 2011
Dec 29, 2021
Sep 5, 1992
Jan 10, 1999
Oct 20, 1999
Oct 4, 2015
Dec 4, 2011
Jan 25, 2014
Apr 11, 2019
Jan 16, 2018
Oct 31, 2015
Feb 15, 2019
Oct 12, 2021
Oct 5, 2022
Dec 6, 2017
Apr 29, 2014
May 31, 2019
Feb 26, 2018
Jan 28, 2018
Oct 19, 2015
Mar 3, 2022
Sep 26, 2022
Sep 29, 2022
Dec 14, 2015
Sep 23, 2016
Oct 25, 2021
Jan 12, 2020
Jun 16, 2022
Mar 7, 2019
Jan 10, 2020
Oct 3, 2013
Sep 10, 2017
Sep 15, 2022
Jun 25, 2021
Jun 22, 2017
Dec 16, 2015
Oct 7, 2022
Nov 12, 2019
Aug 12, 1995
Jan 9, 2001
Apr 30, 2021
Jan 19, 2014
Jan 15, 2015
Oct 7, 2022
Oct 28, 2014
Apr 20, 2020
Sep 23,

Aug 9, 2020
Aug 6, 2022
Oct 16, 2022
Feb 3, 2021
Jun 15, 2017
Jan 25, 2019
Apr 12, 2020
Aug 18, 2021
Nov 14, 2002
Oct 6, 2017
Sep 24, 1987
Jun 23, 2013
Mar 30, 2020
Apr 3, 2013
Aug 6, 2021
Sep 21, 2014
Jan 9, 2017
Feb 13, 2008
Sep 14, 1971
Sep 16, 2022
Mar 12, 2014
Jan 3, 1990
Jul 10, 2022
Dec 11, 2017
Jul 13, 2020
Feb 9, 2018
May 31, 2010
Jan 1, 2011
Oct 14, 2017
Jan 26, 2022
Apr 4, 2021
Aug 30, 2021
Mar 8, 2019
Oct 1, 1990
Apr 9, 2022
Oct 15, 1971
May 10, 2022
May 27, 2013
Jun 8, 2015
Jun 20, 2005
Sep 21, 1983
May 26, 2017
Mar 20, 2019
Jan 8, 1999
Jan 2002
Jan 7, 2010
Jan 1984
Feb 5, 2021
Jun 30, 1946
Sep 5, 1984
Apr 10, 2014
Oct 3, 1987
NA
Jul 14, 2016
Oct 15, 2015
Jul 12, 2018
Aug 27, 2015
Sep 7, 1996
Jan 29, 1992
Feb 28, 2019
Oct 30, 2020
Jan 4, 2000
Oct 31, 2012
Jun 19, 2017
Oct 30, 1970
Jan 1, 2020
Feb 6, 1988
Jun 24, 2017
Oct 7, 2007
Jun 13, 2021
May 17, 2010
Oct 30, 2009
Mar 11, 2017
Feb 27, 1979
Jan 23, 2020
Mar 2, 2007
Oct 27, 2020
Nov 22, 2016
Nov 1, 2017
Jun 28, 2019
Feb 2

Jan 18, 2013
Jan 4, 1999
Apr 10, 2017
Apr 26, 2020
Jul 2, 2002
Aug 8, 2019
Jan 4, 1999
Feb 14, 2008
May 24, 2017
Feb 2, 2020
May 2, 2011
Jan 1, 2019
Sep 6, 2007
Jun 2, 2021
Jan 1, 2006
Apr 17, 2016
Feb 28, 2019
Jan 10, 2011
Feb 14, 2011
Jun 26, 2007
NA
May 2, 2019
Jan 6, 1998
Jun 27, 2006
Jan 11, 2017
Jan 6, 2011
Jan 1, 2000
Sep 15, 2008
Sep 8, 2008
Oct 19, 2004
Mar 5, 2021
Jan 19, 2016
Oct 13, 2019
May 16, 2017
Sep 10, 2012
Jul 15, 2020
Jul 15, 2003
Oct 15, 2019
Sep 18, 2017
Aug 17, 2009
Apr 28, 2021
Jan 6, 1997
Feb 10, 2000
Oct 1, 2020
Apr 26, 2012
Aug 24, 2022
Apr 3, 2003
Mar 31, 2017
Apr 11, 2013
Oct 10, 2013
Aug 19, 2022
Jan 1, 2021
May 4, 2021
Apr 22, 2012
Mar 6, 2005
Apr 14, 2003
Jan 24, 2020
Aug 12, 2020
Aug 10, 2018
Aug 16, 2019
May 21, 2021
Jul 10, 2019
Feb 7, 2018
Feb 7, 2012
Oct 20, 2018


In [46]:
ds3.generate_dataframe().head(1)

Unnamed: 0,id,title,title_tokens,num_seasons,review_scores,series_spin_off,genre,stars,release_date,end_date,release_year,producer,network,country_of_origin,language,awards_received
0,Q115647,The Hollow Crown,"{Hollow, Crown, The}",2,[],[],[costume drama],"[Tom Hiddleston, Jeremy Irons, Ben Whishaw, Si...","Jun 30, 2012","Jul 21, 2012",2012,[],[BBC Two],[United Kingdom],[English],[British Academy Television Award for Best Act...


In [47]:
ds4.generate_dataframe().head(1)

Unnamed: 0,id,title,title_tokens,imdb_rating,rt_rating,plot,genre,creators,cast,num_seasons,release_date,release_year,language,country_of_origin,production_company,producer,network
0,0,The Peripheral,"{Peripheral, The}",8.3,"[73%, 88%]",Set in the future when technology has subtly a...,"[Sci-Fi, Mystery, Drama]",[Scott B. Smith],"[Chloë Grace Moretz, Gary Carr, Jack Reynor, L...",1,"Oct 21, 2022",2022,[English],[United States],"[Amazon Studios, Big Indie Pictures, Kilter Fi...","[Jonathan Nolan, Lisa Joy, Athena Wickham, Sco...",Prime Video


In [48]:
## finding similarity between title tokens
def title_tokens_similarity_1(r1, r2):
    return rltk.jaccard_index_similarity(r1.title_tokens, r2.title_tokens)

## checking if premiere data is same
def num_seasons_similarity_1(r1, r2):
    if r1.num_seasons and r2.num_seasons and r1.num_seasons != '':
        if r1.num_seasons == r2.num_seasons:
            return 1
        return -1
    return 0

## checking similarity between title using Needleman Wunsch
def title_similarity_1(r1, r2):
    return rltk.needleman_wunsch_similarity(r1.title.lower(), r2.title.lower())

## checking similarity between cast
def cast_similarity_1(r1, r2):
    if r1.stars and r2.cast:
        if len(set(r1.stars).intersection(set(r2.cast))) == 0:
            return -1
        return len(set(r1.stars).intersection(set(r2.cast))) / len(r1.stars)
    return 0

## checking similarity between release year
def premiere_similarity_1(r1, r2):
    print(r2.release_year)
    if r1.release_year and r2.release_year and r2.release_year != 'NA':
        if r1.release_date == r2.release_date:
            return 1
        elif r1.release_year == r2.release_year:
            return 0.9
        elif abs(int(r1.release_year) - int(r2.release_year)) <= 5:
            return math.pow(1.8, -1 * abs(int(r1.release_year) - int(r2.release_year)))
        elif abs(int(r1.release_year) - int(r2.release_year)) > 15:
            return -0.8
        elif abs(int(r1.release_year) - int(r2.release_year)) > 10:
            return -0.5
    return 0

## checking similarity between release year
def country_origin_similarity_1(r1, r2):
    sim = []
    if r1.country_of_origin and r2.country_of_origin and r2.country_of_origin != 'NA':
        for country in r1.country_of_origin:
            for c in r2.country_of_origin:
                sim.append(rltk.needleman_wunsch_similarity(country, c))
        return sum(sim) / len(sim)
    return 0

In [49]:
import numpy as np
# threshold value to determine if we are confident the record match
MY_TRESH = 0.62

# entity linkage scoring function
def rule_based_method_1(r1, r2):
    score_1 = title_tokens_similarity_1(r1, r2)
    score_2 = title_similarity_1(r1, r2)
    score_3 = num_seasons_similarity_1(r1, r2)
    score_4 = cast_similarity_1(r1, r2)
    score_5 = premiere_similarity_1(r1, r2)
    score_6 = country_origin_similarity_1(r1, r2)
    
    total = 0.3 * score_1 + 0.3 * score_2 + 0.04 * score_3 + 0.07 * score_4 + 0.2 * score_5 + 0.09 * score_6
    
    # return two values: 0 if they don't match and 1 if they match, float to determine confidence
    return int(total >= MY_TRESH), total

In [50]:
import math

valid_pairs = []
block_pairs_irw = rltk.get_record_pairs(ds3, ds4, block=blocking(ds3, ds4))
for r1, r2 in block_pairs_irw:
    result, confidence = rule_based_method_1(r1, r2)
    if result:
        valid_pairs.append((r1.id, r1.title, r2.id, r2.title, confidence))

2020
2018
1971
2021
2011
1994
2022
2013
1997
1991
2020
2022
2018
2013
2018
2012
2021
1983
NA
2013
2017
2016
2019
2020
2021
2020
1977
2019
2022
1982
2017
2019
2021
2020
2018
1971
2021
2011
1994
2022
2013
1997
1991
2020
2022
2018
2013
2018
2012
2021
1983
NA
2013
2017
2016
2019
2020
2021
2020
1977
2019
2022
1982
2017
2019
2021
2020
2018
1971
2021
2011
1994
2022
2013
1997
1991
2020
2022
2018
2013
2018
2012
2021
1983
NA
2013
2017
2016
2019
2020
2021
2020
1977
2019
2022
1982
2017
2019
2021
2020
2018
1971
2021
2011
1994
2022
2013
1997
1991
2020
2022
2018
2013
2018
2012
2021
1983
NA
2013
2017
2016
2019
2020
2021
2020
1977
2019
2022
1982
2017
2019
2021
2020
2018
1971
2021
2011
1994
2022
2013
1997
1991
2020
2022
2018
2013
2018
2012
2021
1983
NA
2013
2017
2016
2019
2020
2021
2020
1977
2019
2022
1982
2017
2019
2021
2020
2018
1971
2021
2011
1994
2022
2013
1997
1991
2020
2022
2018
2013
2018
2012
2021
1983
NA
2013
2017
2016
2019
2020
2021
2020
1977
2019
2022
1982
2017
2019
2021
2020
2018
1971
2021
20

1974
2007
2005
2018
1995
2020
1967
2018
2020
2022
2008
2020
2013
2022
2012
2022
1989
2020
1987
1994
2005
2021
2016
2022
1981
2021
2017
2002
2021
2011
2023
2022
2008
2017
2003
2019
1996
2008
2009
2022
2019
2019
2015
2020
2020
1976
2022
2021
2018
2008
1975
2022
2019
2021
2021
2022
2014
1989
1972
2020
2011
2022
2015
2020
1999
2001
2021
2021
1974
2018
2002
1974
2007
2005
2018
1995
2020
1967
2018
2020
2022
2008
2020
2013
2022
2012
2022
1989
2020
1987
1994
2005
2021
2016
2022
1981
2021
2017
2002
2021
2011
2023
2022
2008
2017
2003
2019
1996
2008
2009
2022
2019
2019
2015
2020
2020
1976
2022
2021
2018
2008
1975
2022
2019
2021
2021
2022
2014
1989
1972
2020
2011
2022
2015
2020
1999
2001
2021
2021
1974
2018
2002
1974
2007
2005
2018
1995
2020
1967
2018
2020
2022
2008
2020
2013
2022
2012
2022
1989
2020
1987
1994
2005
2021
2016
2022
1981
2021
2017
2002
2021
2011
2023
2022
2008
2017
2003
2019
1996
2008
2009
2022
2019
2019
2015
2020
2020
1976
2022
2021
2018
2008
1975
2022
2019
2021
2021
2022
2014
1989


2001
2015
1977
2020
2015
2022
2000
2009
2017
2014
2017
2015
2005
1986
2009
1992
2022
2021
2022
2021
2015
2020
2022
2015
2021
2023
1989
2012
2004
2022
2021
2022
2022
2022
2016
2014
2007
2021
2021
2021
2011
2017
NA
2019
2022
2019
1999
2005
1992
2014
2015
2022
2019
1977
2022
2001
2021
2016
2018
2019
2009
1969
2022
2011
2001
2015
1977
2020
2015
2022
2000
2009
2017
2014
2017
2015
2005
1986
2009
1992
2022
2021
2022
2021
2015
2020
2022
2015
2021
2023
1989
2012
2004
2022
2021
2022
2022
2022
2016
2014
2007
2021
2021
2021
2011
2017
NA
2019
2022
2019
1999
2005
1992
2014
2015
2022
2019
1977
2022
2001
2021
2016
2018
2019
2009
1969
2022
2011
2001
2015
1977
2020
2015
2022
2000
2009
2017
2014
2017
2015
2005
1986
2009
1992
2022
2021
2022
2021
2015
2020
2022
2015
2021
2023
1989
2012
2004
2022
2021
2022
2022
2022
2016
2014
2007
2021
2021
2021
2011
2017
NA
2019
2022
2019
1999
2005
1992
2014
2015
2022
2019
1977
2022
2001
2021
2016
2018
2019
2009
1969
2022
2011
2001
2015
1977
2020
2015
2022
2000
2009
2017
2

2020
2017
2021
2010
2015
2007
2000
1997
2021
2021
2015
1990
2017
2012
2019
2020
2005
2022
2020
2022
2020
1973
2021
2021
2020
2021
2020
2022
1998
1999
2002
1972
1965
2022
2002
2008
2021
2022
2021
2022
2019
2021
2011
1999
2021
2016
2002
2022
1988
2022
2020
2011
2020
2020
2017
2021
2010
2015
2007
2000
1997
2021
2021
2015
1990
2017
2012
2019
2020
2005
2022
2020
2022
2020
1973
2021
2021
2020
2021
2020
2022
1998
1999
2002
1972
1965
2022
2002
2008
2021
2022
2021
2022
2019
2021
2011
1999
2021
2016
2002
2022
1988
2022
2020
2011
2020
2020
2017
2021
2010
2015
2007
2000
1997
2021
2021
2015
1990
2017
2012
2019
2020
2005
2022
2020
2022
2020
1973
2021
2021
2020
2021
2020
2022
1998
1999
2002
1972
1965
2022
2002
2008
2021
2022
2021
2022
2019
2021
2011
1999
2021
2016
2002
2022
1988
2022
2020
2011
2020
2020
2017
2021
2010
2015
2007
2000
1997
2021
2021
2015
1990
2017
2012
2019
2020
2005
2022
2020
2022
2020
1973
2021
2021
2020
2021
2020
2022
1998
1999
2002
1972
1965
2022
2002
2008
2021
2020
2022
2017
2017


1962
2012
2004
2022
2021
2002
2000
2018
2022
2021
2019
2021
2022
2011
2004
2022
2022
2010
2022
1979
2020
1987
2010
2021
1959
2013
2019
2022
2016
2020
2011
1969
2017
2016
2013
1976
2018
2013
2017
2018
2003
1972
2003
2017
2019
2017
NA
2017
2015
2021
2014
2022
2011
2021
2023
2011
2019
2002
2004
1972
2020
2022
2007
1971
2017
2006
1983
2005
2004
2013
1962
2012
2004
2022
2021
2002
2000
2018
2022
2021
2019
2021
2022
2011
2004
2022
2022
2010
2022
1979
2020
1987
2010
2021
1959
2013
2019
2022
2016
2020
2011
1969
2017
2016
2013
1976
2018
2013
2017
2018
2003
1972
2003
2017
2019
2017
NA
2017
2015
2021
2014
2022
2011
2021
2023
2011
2019
2002
2004
1972
2020
2022
2007
1971
2017
2006
1983
2005
2004
2013
1962
2012
2004
2022
2021
2002
2000
2018
2022
2021
2019
2021
2022
2011
2004
2022
2022
2010
2022
1979
2020
1987
2010
2021
1959
2013
2019
2022
2016
2020
2011
1969
2017
2016
2013
1976
2018
2013
2017
2018
2003
1972
2003
2017
2019
2017
NA
2017
2015
2021
2014
2022
2011
2021
2023
2011
2019
2002
2004
1972
2020
2

2018
2019
2022
2003
2021
1998
2019
2005
1991
2021
2001
2022
2022
2010
2022
1984
2016
2022
2018
2017
1979
2021
2019
2005
2009
2018
2021
2022
1967
2016
2022
2022
2013
1991
2022
2011
2021
2016
2012
2022
2019
1994
2022
2005
2015
2021
2019
2014
2016
2006
1968
2022
2019
2021
2019
2018
2021
2021
1997
2018
2019
2022
2003
2021
1998
2019
2005
1991
2021
2001
2022
2022
2010
2022
1984
2016
2022
2018
2017
1979
2021
2019
2005
2009
2018
2021
2022
1967
2016
2022
2022
2013
1991
2022
2011
2021
2016
2012
2022
2019
1994
2022
2005
2015
2021
2019
2014
2016
2006
1968
2022
2019
2021
2019
2018
2021
2021
1997
2018
2019
2022
2003
2021
1998
2019
2005
1991
2021
2001
2022
2022
2010
2022
1984
2016
2022
2018
2017
1979
2021
2019
2005
2009
2018
2021
2022
1967
2016
2022
2022
2013
1991
2022
2011
2021
2016
2012
2022
2019
1994
2022
2005
2015
2021
2019
2014
2016
2006
1968
2022
2019
2021
2019
2018
2021
2021
1997
2018
2019
2022
2003
2021
1998
2019
2005
1991
2021
2001
2022
2022
2010
2022
1984
2016
2022
2018
2017
1979
2021
2019


2013
2012
2007
2015
2001
2013
2012
2007
2015
2001
2013
2012
2007
2015
1990
2010
2021
1999
2001
1990
2010
2021
1999
2001
1990
2010
2021
1999
2001
1990
2010
2021
1999
2001
1990
2010
2021
1999
2001
2010
2006
2005
2011
2004
2011
2004
2011
2017
1994
2017
1994
2017
1994
1966
2020
2014
2020
2005
1966
2020
2014
2020
2005
1960
2020
2020
2020
1960
2020
2020
2020
1960
2020
2020
2020
2019
1994
2006
2021
1996
2009
2022
2019
2000
2018
2022
2005
2003
2022
2019
2000
2018
2022
2005
2003
1974
2018
2011
2014
2017
1994
2006
1974
2018
2011
2014
2017
1994
2006
1974
2018
2011
2014
2017
1994
2006
1974
2018
2011
2014
2017
1994
2006
1974
2018
2011
2014
2017
1994
2006
2005
2003
2002
1965
NA
2018
1998
2002
1965
NA
2018
1998
2002
1965
NA
2018
1998
2011
2013
2008
2019
1993
2001
2020
2018
1998
2018
2018
2014
2012
2006
1968
1972
2015
2016
2021
2021
1983
2012
2012
2019
1997
2013
2019
1993
2001
2020
2018
1998
2018
2018
2014
2012
2006
1968
1972
2015
2016
2021
2021
1983
2012
2012
2019
1997
2013
2019
1993
2001
2020
2018
1

2010
NA
2021
2022
2021
2019
2010
NA
2021
2022
2017
2010
2010
2019
2013
2011
2022
1995
NA
2002
2017
1955
1999
2021
2009
2021
2006
2022
2019
1974
2016
2016
2013
2022
2015
2018
2019
2020
2010
2022
1999
2022
2019
1974
2016
2016
2013
2022
2015
2018
2019
2020
2010
2022
1999
2022
2019
1974
2016
2016
2013
2022
2015
2018
2019
2020
2010
2022
1999
2022
2019
1974
2016
2016
2013
2022
2015
2018
2019
2020
2010
2022
1999
2022
2011
1979
2001
2013
2020
2001
2013
2020
2005
2000
1998
2021
2020
2020
2012
2013
2022
2002
2021
2007
2018
2019
2020
2022
2002
2021
1966
2004
2022
2016
2007
1992
2018
2016
2020
2022
1999
2022
1988
2014
2016
2017
2021
1979
2008
2021
1988
2014
2016
2017
2021
1979
2008
2021
2004
2009
2015
2014
2015
2008
2019
2022
2020
2001
1963
2000
1963
2000
1963
2000
2019
2011
2018
2021
2022
1973
1975
1975
2017
2021
2017
2017
1997
1991
2013
2008
1995
2010
2020
2022
2022
2017
2018
2020
2016
2000
2008
1975
2005
2018
2018
2005
2018
2018
2020
1993
1952
1985
2010
2020
1993
1952
1985
2010
2020
1993
1952
1

In [51]:
len(valid_pairs), len(ds3.generate_dataframe()), len(ds4.generate_dataframe())

(523, 2000, 6195)

In [52]:
import csv
with open('valid_predictions_1_id.csv', mode='w') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for row in valid_pairs:
        writer.writerow(row)

In [53]:
dt = pd.read_csv('valid_predictions_1_id.csv', names=['wikidata_id', 'wikidata_title', 'imdb_rt_id', 'imdb_rt_title', 'confidence'], dtype=str)


In [54]:
wiki_df = ds3.generate_dataframe()

In [55]:
df_dict = {}
for index , record in dt.iterrows():
    wiki_id = record['wikidata_id']
    imdb_rt_id = record['imdb_rt_id']
    idx = str(index+1)
    df_dict[idx] = {}
    df_dict[idx]['title'] = df_new[df_new['id']==imdb_rt_id]['title'].values[0]
    df_dict[idx]['imdb_rating'] = df_new[df_new['id']==imdb_rt_id]['imdb_rating'].values[0]
    df_dict[idx]['rt_rating'] = df_new[df_new['id']==imdb_rt_id]['rt_rating'].values[0]
    df_dict[idx]['plot'] = df_new[df_new['id']==imdb_rt_id]['plot'].values[0]
    df_dict[idx]['genre'] = list(set(df_new[df_new['id']==imdb_rt_id]['genre'].values[0]).union(set(wiki_df[wiki_df['id']==wiki_id]['genre'].values[0])))
    if '' in df_dict[idx]['genre']:
        df_dict[idx]['genre'].remove('')
    df_dict[idx]['creators'] = df_new[df_new['id']==imdb_rt_id]['creators'].values[0]
    df_dict[idx]['cast'] = []
    if len(df_new[df_new['id']==imdb_rt_id]['cast']) > 0:
        df_dict[idx]['cast'] = df_new[df_new['id']==imdb_rt_id]['cast'].values[0]
    elif len(wiki_df[wiki_df['id']==wiki_id]['stars']) > 0:
        df_dict[idx]['cast'] = wiki_df[wiki_df['id']==wiki_id]['stars'].values[0]
        
    df_dict[idx]['release_date'] = ''
    if not df_new[df_new['id']==imdb_rt_id]['release_date'].isna().bool() and len(df_new[df_new['release_date']==imdb_rt_id]['release_date']) > 0:
        df_dict[idx]['release_date'] = df_new[df_new['id']==imdb_rt_id]['release_date'].values[0]
    elif not wiki_df[wiki_df['id']==wiki_id]['release_date'].isna().bool() and wiki_df[wiki_df['id']==wiki_id]['release_date'].values[0] != '':
        df_dict[idx]['release_date'] = wiki_df[wiki_df['id']==wiki_id]['release_date'].values[0]
        
    df_dict[idx]['end_date'] = wiki_df[wiki_df['id'] == wiki_id]['end_date'].values[0]
    df_dict[idx]['num_seasons'] = ''
    if not df_new[df_new['id']==imdb_rt_id]['num_seasons'].isna().bool() and df_new[df_new['id']==imdb_rt_id]['num_seasons'].values[0] != 'NA':
        df_dict[idx]['num_seasons'] = df_new[df_new['id']==imdb_rt_id]['num_seasons'].values[0]
    elif not wiki_df[wiki_df['id']==wiki_id]['num_seasons'].isna().bool():
        df_dict[idx]['num_seasons'] = wiki_df[wiki_df['id']==wiki_id]['num_seasons'].values[0]
    
    if df_new[df_new['id']==imdb_rt_id]['language'].values[0] != 'NA' and 'NA' not in df_new[df_new['id']==imdb_rt_id]['language'].values[0] and '' not in wiki_df[wiki_df['id']==wiki_id]['language'].values[0]:
        df_dict[idx]['language'] = list(set(df_new[df_new['id']==imdb_rt_id]['language'].values[0]).union(set(wiki_df[wiki_df['id']==wiki_id]['language'].values[0])))
    else:
        df_dict[idx]['language'] = wiki_df[wiki_df['id']==wiki_id]['language'].values[0]
        
    if df_new[df_new['id']==imdb_rt_id]['country_of_origin'].values[0] != 'NA' and 'NA' not in df_new[df_new['id']==imdb_rt_id]['country_of_origin'].values[0]:
        df_dict[idx]['country_of_origin'] = list(set(df_new[df_new['id']==imdb_rt_id]['country_of_origin'].values[0]).union(set(wiki_df[wiki_df['id']==wiki_id]['country_of_origin'].values[0])))
    else:
        df_dict[idx]['country_of_origin'] = wiki_df[wiki_df['id']==wiki_id]['country_of_origin'].values[0]
    
    if 'United States of America' in df_dict[idx]['country_of_origin'] and 'United States' in df_dict[idx]['country_of_origin']:
        df_dict[idx]['country_of_origin'].remove('United States of America')
    
    df_dict[idx]['production_company'] = df_new[df_new['id']==imdb_rt_id]['production_company'].values[0]
    
    df_dict[idx]['network'] = []
    if not df_new[df_new['id']==imdb_rt_id]['network'].isna().bool():
        df_dict[idx]['network'].extend(df_new[df_new['id']==imdb_rt_id]['network'].values[0])
    df_dict[idx]['network'] = list(set(wiki_df[wiki_df['id'] == wiki_id]['network'].values[0]))
    
    if df_new[df_new['id']==imdb_rt_id]['producer'].values[0] != 'NA':
        df_dict[idx]['producer'] = list(set(df_new[df_new['id']==imdb_rt_id]['producer'].values[0]).union(set(wiki_df[wiki_df['id']==wiki_id]['producer'].values[0])))
    else:
        df_dict[idx]['producer'] = wiki_df[wiki_df['id']==wiki_id]['producer'].values[0]
    df_dict[idx]['awards_received'] = wiki_df[wiki_df['id']==wiki_id]['awards_received'].values[0]
    df_dict[idx]['series_spin_off'] = wiki_df[wiki_df['id']==wiki_id]['series_spin_off'].values[0]
    df_dict[idx]['review_scores'] = wiki_df[wiki_df['id']==wiki_id]['review_scores'].values[0]
    

In [56]:
df_final = pd.DataFrame(df_dict).T

In [57]:
df_final.tail(50)

Unnamed: 0,title,imdb_rating,rt_rating,plot,genre,creators,cast,release_date,end_date,num_seasons,language,country_of_origin,production_company,network,producer,awards_received,series_spin_off,review_scores
474,Round the Twist,8.0,,Tony Twist and his three children--13-year-old...,"[speculative fiction, Comedy, Fantasy, Family]",[],"[Mark Mitchell, Esben Storm, Rian McLean, Math...","Apr 4, 1989","May 2, 2001",4,[English],[Australia],[Australian Children's Television Foundation],[Seven Network],[],[],[],[]
475,Tequila and Bonetti,6.4,,Nick Bonetti is a cop from New York who accide...,"[Comedy, Crime, Action, comedy drama]",[Donald P. Bellisario],"[Jack Scalia, Mariska Hargitay, Charles Rocket...","Jan 17, 1992","Apr 17, 1992",1,[English],[United States],"[Belisarius Productions, MCA Television]",[CBS],[Donald Bellisario],[],[],[]
476,Dora the Explorer,4.2,,"Along with her friend Monkey Boots, Dora goes ...","[adventure, educational animation, education, ...","[Eric Weiner, Chris Gifford, Valerie Walsh]","[Fatima Ptacek, Regan Mizrahi, Alexandria Suar...","Aug 14, 2000","Jan 1, 2014",8,"[English, Spanish]",[United States],"[Nick Jr. Productions, Nickelodeon Animation S...",[Nickelodeon],[],[],"[Go, Diego, Go!, Dora and Friends: Into the Ci...",[]
477,Only Fools and Horses,9.0,,Comedy that follows two brothers from London's...,"[Comedy, sitcom]",[John Sullivan],"[David Jason, Nicholas Lyndhurst, Roger Lloyd ...","Sep 8, 1981","Dec 25, 2003",9,[English],[United Kingdom],[British Broadcasting Corporation (BBC)],[BBC One],[],[],"[Rock & Chips, The Green Green Grass]",[]
478,Wings,7.3,,Brothers Brian and Joe Hackett attempt to run ...,"[American television sitcom, Comedy, Drama]","[David Angell, Peter Casey, David Lee]","[Tim Daly, Steven Weber, Crystal Bernard, Davi...","Apr 19, 1990","May 21, 1997",8,[English],[United States],"[Grub Street Productions, Paramount Television]",[NBC],[],[],[],[]
479,Women's Murder Club,7.1,,"A homicide detective, a medical examiner, a ne...","[television series based on a novel, Crime, po...","[Elizabeth Craft, Sarah Fain]","[Angie Harmon, Laura Harris, Paula Newsome, Au...","Oct 12, 2007","May 13, 2008",1,[English],[United States],"[20th Century Fox Television, Rat TV]",[American Broadcasting Company],[],[],[],[]
480,Castle,8.1,"[82%, 87%]","Bored with his success, celebrated mystery nov...","[Crime, police procedural, comedy drama, Drama...",[Andrew W. Marlowe],"[Nathan Fillion, Stana Katic, Susan Sullivan, ...","Mar 9, 2009","May 16, 2016",8,[English],[United States],"[Beacon Pictures, Experimental Pictures, ABC S...",[American Broadcasting Company],"[Andrew W. Marlowe, Rob Bowman, Rob Hanning, T...",[],[],[]
481,Lizzie McGuire,6.6,,The daily adventures of an adolescent girl who...,"[teen sitcom, Drama, American television sitco...",[Terri Minsky],"[Hilary Duff, Adam Lamberg, Robert Carradine, ...","Jan 12, 2001","Feb 14, 2004",2,[English],[United States],"[Stan Rogow Productions, Disney Channel]",[Disney Channel],[],[],[],[]
482,Workaholics,8.1,,A single-camera comedy featuring three friends...,"[American television sitcom, Comedy]","[Blake Anderson, Adam Devine, Anders Holm]","[Blake Anderson, Adam Devine, Anders Holm, Mar...","Apr 6, 2011","Mar 15, 2017",7,[English],[United States],"[5th Year Productions, Avalon Television, Giga...",[Comedy Central],[],[],[],[]
483,Point Pleasant,7.0,,A series of supernatural events begins in a sm...,"[Crime, Drama, neo-noir, soap opera, Horror]","[John J. McLaughlin, Marti Noxon]","[Elisabeth Harnois, Grant Show, Sam Page, Aubr...","Jan 19, 2005","Jun 10, 2005",1,[],[United States],[20th Century Fox Television],[Fox Broadcasting Company],[],[],[],[]


In [58]:
wiki_df_new = wiki_df[~wiki_df['id'].isin(dt['wikidata_id'])]
wiki_df_new = wiki_df_new.drop(['title_tokens', 'release_year'], axis=1)
wiki_df_new = wiki_df_new.rename(columns={'stars': 'cast', 'id':'wiki_id'})

In [59]:
df_final = pd.concat([df_final, wiki_df_new], axis=0)

In [60]:
df_new.head(1)

Unnamed: 0,title,imdb_rating,rt_rating,plot,genre,creators,cast,release_date,num_seasons,language,country_of_origin,production_company,network,producer,id
0,The Peripheral,8.3,"[73%, 88%]",Set in the future when technology has subtly a...,"[Sci-Fi, Mystery, Drama]",[Scott B. Smith],"[Chloë Grace Moretz, Gary Carr, Jack Reynor, L...","Oct 21, 2022",1,[English],[United States],"[Amazon Studios, Big Indie Pictures, Kilter Fi...",Prime Video,"[Jonathan Nolan, Lisa Joy, Athena Wickham, Sco...",0


In [61]:
imdb_rt_new = df_new[~df_new['id'].isin(dt['imdb_rt_id'])]
imdb_rt_new = imdb_rt_new.drop(['id'], axis=1)
imdb_rt_new = imdb_rt_new.rename(columns={'starring': 'cast', 'ratings': 'rt_rating', 'description': 'plot', 'premiere': 'release_date'})

In [62]:
imdb_rt_new['network'] = imdb_rt_new['network'].apply(lambda x: [x])

In [63]:
df_final = pd.concat([df_final, imdb_rt_new], axis=0)

In [64]:
df_final.to_csv('TVSeries.csv', index=False)

# Task 2: Using RDFLib for Knowledge Representation

In [65]:
from rdflib import Graph, URIRef, Literal, Namespace, RDF, BNode

In [66]:
SERIES = Namespace('http://dsci558.org/TVSeries/series#')
SCHEMA = Namespace('http://schema.org/')
SERIES_PROP = Namespace('http://dsci558.org/TVSeries/series/')

In [67]:
df_final.head(1)

Unnamed: 0,title,imdb_rating,rt_rating,plot,genre,creators,cast,release_date,end_date,num_seasons,language,country_of_origin,production_company,network,producer,awards_received,series_spin_off,review_scores,wiki_id
1,The Hollow Crown,,"[98%, 82%]",Contemporary adaptation of William Shakespeare...,"[History, costume drama]",[],"[Simon Russell Beale, Tom Hiddleston, James Pu...","Jun 30, 2012","Jul 21, 2012",2,[English],[United Kingdom],,[BBC Two],[],[British Academy Television Award for Best Act...,[],[],


In [68]:
series_kg = Graph()
series_kg.bind('series', SERIES)
series_kg.bind('schema', SCHEMA)
series_kg.bind('series_prop', SERIES_PROP)

for idx, series in df_final.iterrows():
    print(series['title'])
    series_node = URIRef(SERIES[idx])
    series_kg.add((series_node, RDF.type, SCHEMA['TVSeries']))
    series_kg.add((series_node, SCHEMA['name'], Literal(series['title'], datatype=SCHEMA.Text)))
    if  not pd.isnull(series['imdb_rating']) and series['imdb_rating'] != 'NA':
        rating_node = BNode()
        series_kg.add((rating_node, RDF.type, SCHEMA['AggregateRating']))
        series_kg.add((rating_node, SCHEMA['ratingValue'], Literal(series['imdb_rating'], datatype=SCHEMA.Number)))
        series_kg.add((series_node, SCHEMA['aggregateRating'], rating_node))
        rating_author_node = BNode()
        book_kg.add((publisher_node, RDF.type, SCHEMA['Organization']))
        book_kg.add((publisher_node, SCHEMA['name'], Literal(gd_r.Publisher, datatype=SCHEMA.Text)))
        book_kg.add((gd_node, SCHEMA['publisher'], publisher_node))
    
    if type(series['rt_rating']) == list and len(series['rt_rating']) > 0 and series['rt_rating'] != 'NA':
        rating_node = BNode()
        series_kg.add((rating_node, RDF.type, SCHEMA['AggregateRating']))
        series_kg.add((rating_node, SCHEMA['ratingValue'], Literal(series['rt_rating'][0], datatype=SCHEMA.Number)))
        series_kg.add((series_node, SCHEMA['aggregateRating'], rating_node))
    
    
    
#     if series['plot'] and series['plot'] != 'NA':
#         series_kg.add((gd_node, SCHEMA['description'], Literal(series['plot'], datatype=SCHEMA.Text))
                      
#     if series['plot'] and series['plot'] != 'NA':
#         series_kg.add((gd_node, SCHEMA['description'], Literal(series['plot'], datatype=SCHEMA.Text))
    
                      
#     series_kg.add((gd_node, SCHEMA['description'], Literal(gd_r.Description, datatype=SCHEMA.Text))
    
#     series_kg.add((series_node, SCHEMA['name'], Literal(series['title'], datatype=SCHEMA.Text)))
#     series_kg.add((series_node, SCHEMA['name'], Literal(series['title'], datatype=SCHEMA.Text)))
#     series_kg.add((series_node, SCHEMA['name'], Literal(series['title'], datatype=SCHEMA.Text)))
#     series_kg.add((series_node, SCHEMA['name'], Literal(series['title'], datatype=SCHEMA.Text)))
#     series_kg.add((series_node, SCHEMA['name'], Literal(series['title'], datatype=SCHEMA.Text)))

The Hollow Crown
Rookie Blue


NameError: name 'book_kg' is not defined

In [None]:
book_kg = Graph()
book_kg.bind('books', BOOKS)
book_kg.bind('schema', SCHEMA)
book_kg.bind('books_prop', BOOKS_PROP)
gd_ids = []
bn_ids = []

for r1, r2 in valid_sample_pairs:
    gd_r = gd.iloc[int(r1)]
    bn_r = bn.iloc[int(r2)]
    if URIRef(BOOKS[r1]) in book_kg.all_nodes(): ## no need to add the node again if it already exists
        continue
    gd_node = URIRef(BOOKS[r1])
    book_kg.add((gd_node, RDF.type, SCHEMA['Book']))
    if gd_r.Title and gd_r.Title != ' ':
        book_kg.add((gd_node, SCHEMA['name'], Literal(gd_r.Title, datatype=SCHEMA.Text)))
    elif bn_r.Title and bn_r.Title != ' ':
        book_kg.add((gd_node, SCHEMA['name'], Literal(bn_r.Title, datatype=SCHEMA.Text)))
    
    if gd_r.Description and gd_r.Description != ' ':
        book_kg.add((gd_node, SCHEMA['description'], Literal(gd_r.Description, datatype=SCHEMA.Text)))
    
    if gd_r.ISBN and gd_r.ISBN != ' ':
        book_kg.add((gd_node, SCHEMA['isbn'], Literal(gd_r.ISBN, datatype=SCHEMA.Text)))
    
    if gd_r.ISBN13 and gd_r.ISBN13 != ' ':
        book_kg.add((gd_node, BOOKS_PROP['isbn13'], Literal(gd_r.ISBN13, datatype=SCHEMA.Text)))
    if bn_r.ISBN13 and bn_r.ISBN13 != ' ':
        book_kg.add((gd_node, BOOKS_PROP['isbn13'], Literal(bn_r.ISBN13, datatype=SCHEMA.Text)))
    
    if gd_r.PageCount and gd_r.PageCount != ' ':
        book_kg.add((gd_node, SCHEMA['numberOfPages'], Literal(gd_r.PageCount, datatype=SCHEMA.Integer)))
    elif bn_r.Pages and bn_r.Pages != ' ':
        book_kg.add((gd_node, SCHEMA['numberOfPages'], Literal(bn_r.Pages, datatype=SCHEMA.Integer))) 
    
    if gd_r.Publisher and gd_r.Publisher != ' ':
        publisher_node = BNode()
        book_kg.add((publisher_node, RDF.type, SCHEMA['Organization']))
        book_kg.add((publisher_node, SCHEMA['name'], Literal(gd_r.Publisher, datatype=SCHEMA.Text)))
        book_kg.add((gd_node, SCHEMA['publisher'], publisher_node))
    elif bn_r.Publisher and bn_r.Publisher != ' ':
        publisher_node = BNode()
        book_kg.add((publisher_node, RDF.type, SCHEMA['Organization']))
        book_kg.add((publisher_node, SCHEMA['name'], Literal(bn_r.Publisher, datatype=SCHEMA.Text)))
        book_kg.add((gd_node, SCHEMA['publisher'], publisher_node))
    
    if (gd_r.Rating and gd_r.Rating != ' ') or (gd_r.NumberofRatings and gd_r.NumberofRatings != ' ') or (gd_r.NumberofReviews and gd_r.NumberofReviews != ' '):
        rating_node = BNode()
        book_kg.add((rating_node, RDF.type, SCHEMA['AggregateRating']))
        if gd_r.Rating and gd_r.Rating != ' ':
            book_kg.add((rating_node, SCHEMA['ratingValue'], Literal(gd_r.Rating, datatype=SCHEMA.Number)))
        if gd_r.NumberofRatings and gd_r.NumberofRatings != ' ':
            book_kg.add((rating_node, SCHEMA['ratingCount'], Literal(gd_r.NumberofRatings, datatype=SCHEMA.Integer)))
        if gd_r.NumberofReviews and gd_r.NumberofReviews != ' ':
            book_kg.add((rating_node, SCHEMA['reviewCount'], Literal(gd_r.NumberofReviews.replace(',', ''), datatype=SCHEMA.Integer)))
        book_kg.add((gd_node, SCHEMA['aggregateRating'], rating_node))
    
    elif (bn_r.Ratingvalue and bn_r.Ratingvalue != ' ') or (bn_r.Ratingscount and bn_r.Ratingscount != ' '):
        rating_node = BNode()
        book_kg.add((rating_node, RDF.type, SCHEMA['AggregateRating']))
        if bn_r.Ratingvalue and bn_r.Ratingvalue != ' ':
            book_kg.add((rating_node, SCHEMA['ratingValue'], Literal(bn_r.Ratingvalue, datatype=SCHEMA.Number)))
        if bn_r.Ratingscount and bn_r.Ratingscount != ' ':
            book_kg.add((rating_node, SCHEMA['ratingCount'], Literal(bn_r.Ratingscount, datatype=SCHEMA.Integer)))
        book_kg.add((gd_node, SCHEMA['aggregateRating'], rating_node))
    
    if gd_r.PublishDate and gd_r.PublishDate != ' ':
        book_kg.add((gd_node, SCHEMA['datePublished'], Literal(process_goodreads_date(gd_r.PublishDate), datatype=SCHEMA.Date)))
    elif bn_r.PublicationDate and bn_r.PublicationDate != ' ':
        book_kg.add((gd_node, SCHEMA['datePublished'], Literal(process_bn_date(bn_r.PublicationDate), datatype=SCHEMA.Date)))
    
    if gd_r.Format and gd_r.Format != ' ':
        book_kg.add((gd_node, BOOKS_PROP['bookFormat'], Literal(gd_r.Format, datatype=SCHEMA.Text)))
        
    if gd_r.Language and gd_r.Language != ' ':    
        book_kg.add((gd_node, SCHEMA['inLanguage'], Literal(gd_r.Language, datatype=SCHEMA.Text)))
        
    if gd_r.FirstAuthor and gd_r.FirstAuthor != ' ':
        first_author_node = BNode()
        book_kg.add((first_author_node, RDF.type, SCHEMA['Person']))
        book_kg.add((first_author_node, SCHEMA['name'], Literal(gd_r.FirstAuthor, datatype=SCHEMA.Text)))
        book_kg.add((gd_node, BOOKS_PROP['authors'], first_author_node))
    elif bn_r.Author1 and bn_r.Author1 != ' ':
        first_author_node = BNode()
        book_kg.add((first_author_node, RDF.type, SCHEMA['Person']))
        book_kg.add((first_author_node, SCHEMA['name'], Literal(bn_r.Author1, datatype=SCHEMA.Text)))
        book_kg.add((gd_node, BOOKS_PROP['authors'], first_author_node))
        
    if gd_r.SecondAuthor and gd_r.SecondAuthor != ' ':
        second_author_node = BNode()
        book_kg.add((second_author_node, RDF.type, SCHEMA['Person']))
        book_kg.add((second_author_node, SCHEMA['name'], Literal(gd_r.SecondAuthor, datatype=SCHEMA.Text)))
        book_kg.add((gd_node, BOOKS_PROP['authors'], second_author_node))
    elif bn_r.Author2 and bn_r.Author2 != ' ':
        second_author_node = BNode()
        book_kg.add((second_author_node, RDF.type, SCHEMA['Person']))
        book_kg.add((second_author_node, SCHEMA['name'], Literal(bn_r.Author2, datatype=SCHEMA.Text)))
        book_kg.add((gd_node, BOOKS_PROP['authors'], second_author_node))
    
    if gd_r.ThirdAuthor and gd_r.ThirdAuthor != ' ':
        third_author_node = BNode()
        book_kg.add((third_author_node, RDF.type, SCHEMA['Person']))
        book_kg.add((third_author_node, SCHEMA['name'], Literal(gd_r.ThirdAuthor, datatype=SCHEMA.Text)))
        book_kg.add((gd_node, BOOKS_PROP['authors'], third_author_node))
    elif bn_r.Author3 and bn_r.Author3 != ' ':
        third_author_node = BNode()
        book_kg.add((third_author_node, RDF.type, SCHEMA['Person']))
        book_kg.add((third_author_node, SCHEMA['name'], Literal(bn_r.Author3, datatype=SCHEMA.Text)))
        book_kg.add((gd_node, BOOKS_PROP['authors'], third_author_node))
    
    if bn_r.Salesrank and bn_r.Salesrank != ' ':
        book_kg.add((gd_node, BOOKS_PROP['salesRank'], Literal(bn_r.Salesrank, datatype=SCHEMA.Text)))
        
    if bn_r.Productdimensions and bn_r.Productdimensions != ' ':
        book_kg.add((gd_node, BOOKS_PROP['productDimensions'], Literal(bn_r.Productdimensions, datatype=SCHEMA.Text)))
        
    if bn_r.Paperbackprice and bn_r.Paperbackprice != ' ':
        book_kg.add((gd_node, BOOKS_PROP['paperbackPrice'], Literal(bn_r.Paperbackprice, datatype=SCHEMA.Text)))
        
    if bn_r.Hardcoverprice and bn_r.Hardcoverprice != ' ':
        book_kg.add((gd_node, BOOKS_PROP['hardcoverPrice'], Literal(bn_r.Hardcoverprice, datatype=SCHEMA.Text)))
        
    if bn_r.Nookbookprice and bn_r.Nookbookprice != ' ':
        book_kg.add((gd_node, BOOKS_PROP['nookbookPrice'], Literal(bn_r.Nookbookprice, datatype=SCHEMA.Text)))
        
    if bn_r.Audiobookprice and bn_r.Audiobookprice != ' ':
        book_kg.add((gd_node, BOOKS_PROP['audiobookPrice'], Literal(bn_r.Audiobookprice, datatype=SCHEMA.Text)))

In [None]:
book_kg.serialize(dir_ + 'model.ttl', format="turtle")