In [114]:
# Import relevant libraries
import os
import nltk
import re
from statistics import mode
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import itertools
import json
from nltk.tag.stanford import StanfordNERTagger
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
from difflib import SequenceMatcher
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [118]:
# Mount from google drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# because i'm using 'utf-8' i have problems with chunking
inputfile='/content/gdrive/My Drive/football_players.txt'
buf = open(inputfile, encoding="UTF-8")
list_of_doc = buf.read().split('\n')

l = []
for i in list_of_doc:
    if len(i) !=0:
        l.append(i)
list_of_doc = l


##**Task 1**

In [0]:
# This function takes each document and performs: 1) sentence segmentation 2) tokenization 3) part-of-speech tagging

def ie_preprocess(document):
  
  pos_sentences = []
 
  for document in list_of_doc:

    # The following performs Sentence Segmentation
    pos_sentences = nltk.sent_tokenize(document) 
    
    # The following performs tokenization
    pos_sentences = [nltk.word_tokenize(sent) for sent in pos_sentences]  
    
    # The following performs part-of-speech tagging
    pos_sentences = [nltk.pos_tag(sent) for sent in pos_sentences]  
    
    # Returns pos_sentences
    return pos_sentences

In [131]:
# The following code checks the result for the first document

first_doc=list_of_doc[0]
# Calling the ie_preprocess function
pos_sent = ie_preprocess(first_doc)

# Display all tagged sentences as given in the assignment sheet. However, to dissplay all tagged sentence, please remove the 
# list index
pos_sent[2]

[('In', 'IN'),
 ('2008', 'CD'),
 (',', ','),
 ('he', 'PRP'),
 ('won', 'VBD'),
 ('his', 'PRP$'),
 ('first', 'JJ'),
 ('Ballon', 'NNP'),
 ("d'Or", 'NN'),
 ('and', 'CC'),
 ('FIFA', 'NNP'),
 ('World', 'NNP'),
 ('Player', 'NNP'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('Year', 'NN'),
 ('awards', 'NNS'),
 ('.', '.')]

##**Task 2**

In [132]:
# This function takes the list of tokens with POS tags for each sentence and returns the named entities (NE).
# NLTK provides a classier that has already been trained to recognize named entities, accesse the function _nltk.ne_chunk()_. 
# If we set the parameter binary=True, then named entities are j tagged as NE; otherwise, the classier adds category labels such as 
# PERSON, ORGANIZATION etc.

def named_entity_finding(pos_sent, x=True, y="NE"):
    
    named_entities = []
        
    # The code below identifies all the named entities like person, place, organisation etc.  
      
    tree = nltk.ne_chunk(pos_sent, binary=x)

    for subtree in tree.subtrees():
        if subtree.label() == y:
            entity = ""
            for leaf in subtree.leaves():
                entity = entity + leaf[0] + " "
            named_entities.append(entity.strip())

    return named_entities
    
        
# Calls the function above - ie_preprocess
pos_sents=ie_preprocess(list_of_doc[0])
# Test
named_entity_finding(pos_sents[0])

['Cristiano Ronaldo',
 'Santos Aveiro',
 'ComM',
 'GOIH',
 'Portuguese',
 'Spanish',
 'Real Madrid',
 'Portugal']

##**Task 3**

In [134]:
# Here i use the named_entity_finding() function to extract all NEs for each document.
# I extract named entities for the whole document and flatten it to a list at the end.
def NE_flat_list_fn(document):
  
  named_entities=[]
  
  # Perform loop through all the documents in question
  for i in document:
    # POS tag the docs
    pos_sents = ie_preprocess(i)
    
    for pos_sent in pos_sents:
      
      #Find the named entity for every tagged sentence in the doc.
      entity = named_entity_finding(pos_sent)
      if len(entity) != 0:
        named_entities.append(entity)
        
        # Flatten the list
        NE_flat_list = list(itertools.chain.from_iterable(named_entities))
  # Return flat list     
  return NE_flat_list

# Test
# 20 named entites displayed
ne_flat = NE_flat_list_fn(list_of_doc)
list(set(ne_flat))[1:10]

['Ballon',
 'Portuguese Football Federation',
 'FIFA Club',
 'Ronaldo',
 'FIFA Ballon',
 'Cristiano Ronaldo',
 'UEFA European',
 'Portugal',
 'Silver Boot']

##**Task 4**

In [137]:
# The following function extracts the players name

def name_of_the_player(doc):
    
    name = []
    
    # Calls ie_preprocess function
    pos_sents = ie_preprocess(doc)
    
    # Returns the best chunk tokens for given structure
    tree = nltk.ne_chunk(pos_sents[0])

    # Returns tree along with joining players name
    # Ensures is person not organisation etc
    for subtree in tree.subtrees():
        if subtree.label() == "PERSON":
            entity = ""
            for leaf in subtree.leaves():
                entity = entity + leaf[0] + " "
            name.append(entity.strip())
    return " ".join(name)
  
# Test
print(name_of_the_player(list_of_doc[0]))

Cristiano Santos Aveiro


In [154]:
# The following function extracts the team of the player

def team_of_the_player(doc): 
    
    # Identiy following patterns
    patterns = [
        'for .*? club (.*?) (and|after)',
        'captains both (.*?) (and|after)',
        'the\s(.*?)\snational (team)'
    ]
    possible_teams = []
    # Loop to generate list of teams.
    for p in patterns:
        if re.findall(p, doc, re.IGNORECASE):
            for m in re.findall(p, doc, re.IGNORECASE):
                possible_teams.append(m)

    filterteams = []
    
    # Loop generated to extrct team. Ensures length is greater than 3 and no full stops included.
    for t in possible_teams:
        if len(t[0]) < 25 and len(t[0]) > 3 and '.' not in t[0]:
            filterteams.append(t[0])
    # Return filtered set of teams        
    return set(filterteams)
  
# Test  
print(team_of_the_player(list_of_doc[0]))

{'Real Madrid', 'Portugal'}


In [155]:
# The following function extracts the players position

def position_of_the_player(doc):

    # List of all possible football positions
    positions = [
        "Goalkeeper",
        "Centre-back",
        "Sweeper",
        "Full-back",
        "Wing-back",
        "Centre midfield",
        "Defensive midfield",
        "Attacking midfield",
        "Wide midfield",
        "Centre forward",
        "Second striker",
        "Winger",
        "Forward"
    ]
    player_position = []
    
    sent = sent_tokenize(doc)
    for i, sent in enumerate(sent):
        for x in positions:
            # Find matching in look up and sentence.
            regex = re.compile(r'\b({0})\b'.format(x), flags=re.IGNORECASE)
                
            # if result is true append to a list for testing at the end.
            r = bool(regex.search(sent))
                
            if r == True:
                player_position.append(x)
    # Returns list of players positions   
    return list(set(player_position))

# Prints the position of the following player in the doc
print(position_of_the_player(list_of_doc[0]))

['Forward']


In [161]:
# The following function extracts the players date of birth

def date_of_birth(doc):
    
    """Firstly I used regex to find the date of birth followed by extracting the particular 
    sentence that contained the word born as this is significant to date of birth"""
    sentence = sent_tokenize(doc)[0]
    
    # Find matching
    match = re.compile(r'born\b\s*((?:\S+\s+){0,3})')
    
    # Search for occurrences in the patern anywhere in the string
    extract_born = match.findall(sentence)[0]
    
    # Replace pattern with the search above
    extract_born = re.sub('\W+',' ', extract_born )
    
    # Returns date of birth
    return extract_born

# Test
print(date_of_birth(list_of_doc[0]))

5 February 1985 


In [162]:
# The following function extracts the players country of origin

def country_of_origin(doc):
  
    # I extracted the following pattern so that the players country of origin is directly before the word professional.
    # The code is as follows.   
    pattern = 'is\san?\s(.*?)\sprofessional'
    
    # Searches for occurrences in the pattern anywhere in the document
    return re.findall(pattern, doc)[0]

# Test
print(country_of_origin(list_of_doc[0]))

Portuguese


##**Task 5**

In [163]:
# The following function uses the outputs from the previous functions by calling it for generation into JSON-LD output

def data_generator(doc):
    
    # Calls all the functions above
    data = [name_of_the_player(doc), date_of_birth(doc), country_of_origin(doc), position_of_the_player(doc), team_of_the_player(doc)]
    
    # Returns the data
    return data

data = data_generator(list_of_doc[0])
# Test
print(data)

['Cristiano Santos Aveiro', '5 February 1985 ', 'Portuguese', ['Forward'], {'Real Madrid', 'Portugal'}]


In [79]:
# The following function uses the output from the previous functions to create JSON-LD output

def generate_jsonld1(arg, con=True):
    if con == True:

        ld = { "@id": "http://my-soccer-ontology.com/footballer/"+arg[0],

            "name": arg[0],
            "born": arg[1],
            "country": arg[2],
            "position": [
                { "@id": "http://my-soccer-ontology.com/position/",
                    "type": arg[3]
                }
             ],   
             "team": [
                { "@id": "http://my-soccer-ontology.com/team/",
                    "name": arg[4]
                }   
             ]
        }

        return json.dumps(ld)
    
    elif con == False:
        
        ld = { "@id": "http://my-soccer-ontology.com/footballer/"+arg[0],

            "name": arg[0],
            "born": arg[1],
            "country": arg[2],
            "position": [
                { "@id": "http://my-soccer-ontology.com/position",
                    "type": arg[3]
                }
             ],   
             "team": [
                { "@id": "http://my-soccer-ontology.com/team",
                    "name": arg[4]
                },
             ],
            "Debut Year": arg[5][0],
            "Debut Age": arg[5][1]
        }

        return json.dumps(ld)
        
try:
    data = data_generator(list_of_doc[0])
    print(generate_jsonld1(data))
except:
    print("Please make sure that list index given is right")

{"@id": "http://my-soccer-ontology.com/footballer/Cristiano Santos Aveiro", "name": "Cristiano Santos Aveiro", "born": "5 February 1985 ", "country": "Portuguese", "position": [{"@id": "http://my-soccer-ontology.com/position/", "type": ["Forward"]}], "team": [{"@id": "http://my-soccer-ontology.com/team/", "name": ["Portugal national team", "Real Madrid"]}]}


##**Task 6**

In [166]:
# The following function extracts the debut year and age of the player

def debut_age_relation(doc):

    sent = sent_tokenize(doc)
    # Debut
    debut = []
    # Age
    age = []
    
    # For each sentence check if the debut key is present or not
    for se in sent:
        sp_sent = se.split()
        
         # Check for debut and age in the one sentence 
        if "debut" in sp_sent and "aged" in sp_sent:
            # Check if age is present by two digit number
            age = re.findall('\d{2}', " ".join(sp_sent))
            age.append(age[0]) 
        
        if "debut" in sp_sent:
            # Check if debut year is present by a four digit number
            date = re.findall('\d{4}', " ".join(sp_sent))
            if len(date) !=0:
                debut.append(date[0])
        
    # Returns debut and age
    return [debut[0], age[0]]

# Test
debut_age_relation(list_of_doc[4])

['2002', '20']

In [0]:
# Add above function to data_generator function 

def data_generator(doc):
    
    data = [name_of_the_player(doc), date_of_birth(doc), country_of_origin(doc), position_of_the_player(doc), team_of_the_player(doc), relation_debutYearAge(doc)]
    return data

data = data_generator(list_of_doc[0])
print(data)

In [0]:
# Generate JSON 

generate_jsonld1(data, False)