# Get the id list of all US films from wikipedia

In [None]:
!pip install SPARQLWrapper
from SPARQLWrapper import SPARQLWrapper, JSON

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Collecting rdflib>=6.1.1
  Downloading rdflib-6.2.0-py3-none-any.whl (500 kB)
[K     |████████████████████████████████| 500 kB 7.5 MB/s 
Collecting isodate
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 395 kB/s 
Installing collected packages: isodate, rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 isodate-0.6.1 rdflib-6.2.0


In [None]:
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
N_max = 60000
index = 0
nums=[]
while index < N_max:
  sql = """
  SELECT DISTINCT ?number
  WHERE
      {
          ?film dbp:country ?country.
          ?film dbo:wikiPageWikiLink dbc:American_films .
          ?film dbo:wikiPageID ?number .
          ?film rdfs:comment ?abstract .
          ?film dbp:name ?name .
        }
  LIMIT 10000
  OFFSET %d.
  """ %(index)
  sparql.setQuery(sql)
  sparql.setReturnFormat(JSON)
  results = sparql.query().convert()
  dic=results['results']['bindings']
  for idx in dic:
    nums.append(idx['number']['value'])
  index += 10000

In [None]:
len(set(nums))

52172

# Filter the json data by the id list

In [None]:
# !unzip 
import json
import pickle
from google.colab import drive
drive.mount('/content/drive')
json_data = []

Mounted at /content/drive


In [None]:
!unzip "/content/drive/MyDrive/dataset/AB.zip" -d "/content/drive/MyDrive/dataset/"

Archive:  /content/drive/MyDrive/dataset/AB.zip
   creating: /content/drive/MyDrive/dataset/AB/
  inflating: /content/drive/MyDrive/dataset/__MACOSX/._AB  
  inflating: /content/drive/MyDrive/dataset/AB/wiki_42  
  inflating: /content/drive/MyDrive/dataset/__MACOSX/AB/._wiki_42  
  inflating: /content/drive/MyDrive/dataset/AB/.DS_Store  
  inflating: /content/drive/MyDrive/dataset/__MACOSX/AB/._.DS_Store  
  inflating: /content/drive/MyDrive/dataset/AB/wiki_45  
  inflating: /content/drive/MyDrive/dataset/__MACOSX/AB/._wiki_45  
  inflating: /content/drive/MyDrive/dataset/AB/wiki_44  
  inflating: /content/drive/MyDrive/dataset/__MACOSX/AB/._wiki_44  
  inflating: /content/drive/MyDrive/dataset/AB/wiki_43  
  inflating: /content/drive/MyDrive/dataset/__MACOSX/AB/._wiki_43  
  inflating: /content/drive/MyDrive/dataset/AB/wiki_32  
  inflating: /content/drive/MyDrive/dataset/__MACOSX/AB/._wiki_32  
  inflating: /content/drive/MyDrive/dataset/AB/wiki_35  
  inflating: /content/drive/MyDri

In [None]:
file_index = 32
while file_index < 69:
  file_path = '/content/drive/MyDrive/dataset/AB/wiki_' + str(file_index)
  for line in open(file_path):
    if json.loads(line)['id'] in nums:
      json_data.append(json.loads(line))
  file_index += 1

In [None]:
df=open('file_4','wb')
pickle.dump(json_data,df)
df.close()

In [None]:
df1=open('file_4','rb')
data_tmp=pickle.load(df1)
df1.close()
len(data_tmp)

14927

# NLP pipline

In [None]:
import spacy
import pickle
from spacy import displacy
from spacy.matcher import Matcher
df1=open('file_4','rb')
film_data=pickle.load(df1)
df1.close()
# len(film_data)

In [None]:
class Processor:
  def __init__(self, data):
    self.nlp = spacy.load("en_core_web_sm")
    self.films = []
    self.data = data
    self.matcher = Matcher(self.nlp.vocab)

  def sentence_segmentation(self):
    for film in self.data:
      film_dict = {}
      film_dict['title'] = film['title']
      doc = self.nlp(film['text'])
      triplets = []
      for sent in doc.sents:
        entities = []
        for ent in sent.ents:
          entities.append((ent.text,ent.label_))
        # entities = entities.append(self.get_entities(sent, entities))
        print(entities)
        # triplets.append(triplet)
      film_dict['triplets'] = triplets
    self.films.append(film_dict)

  def get_entities(self, doc, old_entities):
    entities = []
    ent =""
    prv_tok_dep ="" # dependency tag of previous token in the sentence
    prv_tok_text ="" # previous token in the sentence
    prefix =""
    modifier =""
    for tok in doc:
      if tok.dep_ == "punct":
        continue
      if tok.dep_ == "compound":
        prefix = tok.text
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " " + tok.text

      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        if prv_tok_dep =="compound":
          modifier = prv_tok_text + " " + tok.text

      if tok.dep_.find("subj") == True or tok.dep_.find("obj") == True:
        ent = modifier + " " + prefix + " " + tok.text
        ent = ent.strip()
        if not ent in old_entities:
          entities.append(ent)

      prefix =""
      modifier =""
      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text

    return entities
  
  def get_relation(self, doc):
    # Matcher class object
    pattern = [{'DEP':'ROOT'},{'DEP':'prep','OP':"?"},{'DEP':'agent','OP':"?"},{'POS':'ADJ','OP':"?"}]
    self.matcher.add("matching_1", patterns=[pattern])
    matches = self.matcher(doc)
    k =len(matches) -1
    span = doc[matches[k][1]:matches[k][2]]
    return(span.text)



In [None]:
film_data = film_data[:1]
p = Processor(film_data)
p.sentence_segmentation()

[("Scatter My Ashes at Bergdorf's", 'WORK_OF_ART'), ('US', 'GPE'), ('2013', 'DATE'), ('Matthew Miele', 'PERSON'), ('New York City', 'GPE'), ('Bergdorf Goodman', 'PERSON'), ('Fifth Avenue', 'FAC'), ('Grand Army Plaza', 'FAC')]
[('1990', 'DATE'), ('Victoria Roberts', 'PERSON'), ('The New Yorker', 'WORK_OF_ART')]
[]
[('May 3, 2013', 'DATE')]


In [None]:
p.films

[{'title': "Scatter My Ashes at Bergdorf's", 'triplets': []}]

In [None]:
nlp = spacy.load("en_core_web_sm")
str = 'Army of Darkness is a 1992 American comedy horror film directed, co-written and co-edited by Sam Raimi, co-produced by Robert Tapert and Bruce Campbell and co-written by Ivan Raimi.'
doc = nlp(str)
for tok in doc:
  if tok.dep_.find("sub") == True:
    print(tok.dep_)

nsubj


In [None]:
str = 'Army of Darkness is a 1992 American comedy horror film directed, co-written and co-edited by Sam Raimi, co-produced by Robert Tapert and Bruce Campbell and co-written by Ivan Raimi.'
doc = nlp(str)
print(p.get_entities(doc))

['Army', 'American Ivan Raimi']


In [None]:
nlp = spacy.load("en_core_web_sm")
str = 'Army of Darkness is a 1992 American comedy horror film directed, co-written and co-edited by Sam Raimi, co-produced by Robert Tapert and Bruce Campbell and co-written by Ivan Raimi.'
doc1 = nlp(str)
for ent in doc1.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Army of Darkness 0 16 ORG
1992 22 26 DATE
American 27 35 NORP
Sam Raimi 93 102 PERSON
Robert Tapert 119 132 PERSON
Bruce Campbell 137 151 PERSON
Ivan Raimi 170 180 PERSON


# Create Training dataset

In [1]:
!pip install SPARQLWrapper
from SPARQLWrapper import SPARQLWrapper, JSON
import pickle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Collecting rdflib>=6.1.1
  Downloading rdflib-6.2.0-py3-none-any.whl (500 kB)
[K     |████████████████████████████████| 500 kB 8.1 MB/s 
Collecting isodate
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 479 kB/s 
Installing collected packages: isodate, rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 isodate-0.6.1 rdflib-6.2.0


In [3]:
df=open('file_4','rb')
film_data=pickle.load(df)
df.close()

In [19]:
film_data[0]

{'id': '45596993',
 'revid': '39239779',
 'url': 'https://en.wikipedia.org/wiki?curid=45596993',
 'title': "Scatter My Ashes at Bergdorf's",
 'text': '"Scatter My Ashes at Bergdorf\'s" is a US 2013 documentary feature directed by Matthew Miele about the New York City luxury goods department store Bergdorf Goodman, situated on Fifth Avenue where it meets Grand Army Plaza. The film\'s title is lifted from the caption of a 1990 Victoria Roberts cartoon that appeared in pages of "The New Yorker". The film features celebrities, store executives and employees, designers and customers testifying to their love of the place.\nThe film opened at theatres on May 3, 2013.'}

In [35]:
# title = film_data[4]['title'].split("(")[0].strip()
# film_data[4]['text'].find(title)
d = {'hh':'hh'}
if not 'director' in d.keys():
  print('not in')

not in


In [44]:
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
films = []
for film in film_data[:100]:
  # build subject
  full_name = film['title']
  truncated_name = film['title'].split("(")[0].strip()
  text = film['text']
  name_idx = text.find(truncated_name)
  if name_idx == -1:
    continue
  # not sure the length of name should be full or trancated.
  h = {"name": full_name, "pos":[name_idx, name_idx+len(truncated_name)]}

  # find object
  id = film['id']
  sql = """
  SELECT DISTINCT ?director, ?starring
  WHERE
      {
          ?film dbp:director ?director .
          ?film dbp:starring ?starring .
  """ + "?film dbo:wikiPageID " + id + " .}"
  sparql.setQuery(sql)
  sparql.setReturnFormat(JSON)
  results = sparql.query().convert()
  temp = results['results']['bindings']
  if not temp:
    continue
  
  # process the results of this film
  directors = []
  starrings = []
  for item in temp:
    # find all the directors of film
    if not 'director' in item.keys():
      continue
    director = item['director']
    t = director['type']
    if t == 'literal':
      director = director['value']
    else:
      director = director['value'].rsplit('/', 1)[-1].split('_')
      director = " ".join(director)
    if director not in directors:
      directors.append(director)

    # find all the starrings of film
    if not 'starring' in item.keys():
      continue
    starring = item['starring']
    t = starring['type']
    if t == 'literal':
      starring = starring['value']
    else:
      starring = starring['value'].rsplit('/', 1)[-1].split('_')
      starring = " ".join(starring)
    if starring not in starrings:
      starrings.append(starring)

  # build triplet
  if not directors and not starrings:
    continue
  spo_list = []
  # build the director - direct - film
  for director in directors:
    director_idx = text.find(director)
    if not director_idx:
      continue
    t = {"name": director, "pos": [director_idx, director_idx + len(director)]}
    triplet = {"h":h, "t":t, "relation": "direct"}
    spo_list.append(triplet)
  if not spo_list:
    continue
  # build the starring - act - film
  for starring in starrings:
    starring_idx = text.find(starring)
    if not starring_idx:
      continue
    t = {"name": starring, "pos": [starring_idx, starring_idx + len(starring)]}
    triplet = {"h":h, "t":t, "relation": "act"}
    spo_list.append(triplet)

  #new data
  films.append({'id':id, 'text':text, 'spo_list': spo_list})
#           

In [45]:
df=open('training_data','wb')
pickle.dump(films,df)
df.close()

In [46]:
df=open('training_data','rb')
films=pickle.load(df)
df.close()
len(films)

77