In [1]:
from nltk.ccg import chart, lexicon
from nltk.ccg.lexicon import CCGLexicon, Token, augParseCategory
from nltk.ccg.chart import CCGChart,CCGLeafEdge,BinaryCombinatorRule,CCGEdge
from nltk.tree import Tree
import pandas as pd
import numpy as np

# Weighed Lexicon

In [2]:
from warnings import warn

class WeighedToken(Token):
    def __init__(self, token, categ, semantics=None, weight = 1.0):
        super().__init__(token, categ, semantics= semantics)
        self._weight = weight
    def weight(self):
        """1.0 is considered the default weight for any token"""
        try:
            return self._weight
        except AttributeError:
            warn(f"[{self.token} : {str(self)}] : this token has no weight attribute, defaulted to 1.0.")
            return 1.0

class WeighedLexicon(CCGLexicon):
    def __init__(self, start, primitives, families, entries):
        super().__init__(start, primitives, families, entries)

    def weight(self, entry):
        return entry.weight()

# CYK

We define the weight associated to each reduction rule.
`rweight(rule)` should return the weight associated to the rul, using its string representation (i.e. the name of the rule)

In [3]:
valz = {
    '>' : 0.8,
    '<' : 0.7
}
def rweight(rule):
    s = rule.__str__()
    if s in valz:
        return valz[s]
    else:
        return 1.0 # Base rules weight

`weightedParse` implements the CKY algorithm, based on the implementation in the nltk library.
We take the weight from the weighted lexicon for the leafs, and we compute it using the formula for each reduction rule.
$$ w_{node} = \phi_r \times w_{child1} \times w_{child2}$$

In [14]:
# Implements the CYK algorithm, code partly taken from nltk
def weightedParse(tokens, lex, rules):
    """made to take weighed tokens and lexicons"""
    chart = CCGChart(list(tokens))
    
    # Initialize leaf edges.
    for index in range(chart.num_leaves()):
        for token in lex.categories(chart.leaf(index)):
            new_edge = CCGLeafEdge(index, token, chart.leaf(index))
            new_edge.weight = token.weight()
            chart.insert(new_edge, ())

    # Select a span for the new edges
    for span in range(2, chart.num_leaves() + 1):
        for start in range(0, chart.num_leaves() - span + 1):
            
            print("==>",span,start)
            
            bestedge = None
            nedg = 0
            # edges[s] is the best edge generating the category s
            edges = dict()
            
            # Try all possible pairs of edges that could generate
            # an edge for that span
            for part in range(1, span):
                lstart = start
                mid = start + part
                rend = start + span
                
                for left in chart.select(span=(lstart, mid)):
                    for right in chart.select(span=(mid, rend)):
                        # Generate all possible combinations of the two edges
                        for rule in rules:
                            # Can we apply the rule
                            if rule.can_combine(left.categ(), right.categ()):
                                for res in rule.combine(left.categ(), right.categ()):
                                    # res is the new category
                                    edge = CCGEdge(
                                        span=(left.start(), right.end()),
                                        categ=res,
                                        rule=BinaryCombinatorRule(rule),
                                    )
                                    edge.weight = rweight(rule) * left.weight * right.weight
                                    edge.triple = (rule,left,right)
                                    if not(res in edges and edges[res].weight<=edge.weight):
                                        edges[res] = edge
                        # end for rule loop
                    # end for right loop
                # end for left loop
            # end for part loop
            for cat in edges:
                chart.insert(edges[cat], (edges[cat].triple[1], edges[cat].triple[2]))
    return chart

In [15]:
def wpToTree(edge):
    if isinstance(edge,CCGLeafEdge):
        return Tree((edge.token(),"Leaf"),[Tree(edge.token(),[edge.leaf()])])
    else:
        return Tree(
            (chart.Token(None,edge.categ()),edge.triple[0].__str__()),
            [wpToTree(t) for t in (edge.triple[1:])])

In [16]:
def bestTree(tokens, lex, rules):
    # We build the weighgted parse tree using cky
    wChart = weightedParse(tokens, lex, rules)
    # We get the biggest edge
    e = list(wChart.select(start=0,end=len(tokens)))[0]
    print("Edge count:",len(list(wChart.select(start=0,end=len(tokens)))))
    # We get the tree that brought us to this edge
    t = wChart._trees(e, True, dict(), Tree)[0]
    # (wpToTree(e),e.weight)
    return (t,e.weight)

# Application

In [17]:
from numbers import Number
from nltk.sem.logic import Expression
from nltk.ccg.api import PrimitiveCategory

def to_pseudo_entries(table, consider_semantics = True):
    """returns a list of lists in the format ['word', 'category', 'weight', None]
    if consider_semantics == false else ['word', 'category', weight, 'semantic']
    that is left to be converted into tokens by to_wlex_entries"""

    entries = list()
    for line in range(len(table['MOT'])):
        for wdi, word in enumerate(table['MOT'][line].replace(" ", "").split('/')):
            for j in range(3):
                if isinstance(table['Cat'+str(j)][line],str):
                    category = table['Cat'+str(j)][line]
                    weight = float(table['Weights'+str(j)][line]) if isinstance(table['Weights'+str(j)][line], Number) else 1.0
                    if consider_semantics:
                        semantic = (table['Sem'+str(j)][line].replace('\\\\', '\\').split('/'))[wdi]
                    else:
                        semantic = None
                    entries.append([word, category, weight, semantic])
    return entries

def to_wlex_entries(pseudo_entries, primitives, families, var=None):
    """returns the entries to a weighed lexicon from pseudo_entries generated by to_pseudo_entries"""
    entries = dict()
    for entry in pseudo_entries:
        if entry[0] not in entries:
            entries[entry[0]] = list()
        categ, _ = augParseCategory(entry[1], primitives, families, var)
        token = WeighedToken(token= entry[0],
                             categ= categ,
                             semantics= None if entry[-1] is None else Expression.fromstring(entry[-1]),
                             weight= entry[2])
        entries[entry[0]].append(token)
    return entries
    

We create our lexicon using the data from the server

In [18]:
# Catégories primitives et familles
primitives = ['S', 'N', 'Pp', 'pN']
V = augParseCategory("S\\N", primitives = primitives, families={})
families = {'V': V}

# On importe notre lexique sous forme de tableur
table = pd.read_excel("CategoriesGramaticalesCombinatoire.ods", engine="odf")
#print(table.keys())

# On le convertit en Lexique pondéré
pe = to_pseudo_entries(table, consider_semantics = False)
#print(pe)
wEntries = to_wlex_entries(pseudo_entries= pe, primitives= primitives, families= families)
#print([list(map(lambda x: f"{k} : "+ str(x) + str(x._semantics), L)) for k, L in wEntries.items()])
lex = WeighedLexicon(start= 'S', primitives= primitives, families= families, entries= wEntries)


# On crée le parser, on donne l'ensemble des règles qu'il est cencé connaître
from nltk.ccg.combinator import (
    BackwardApplication,
    BackwardBx,
    BackwardComposition,
    BackwardSx,
    ForwardApplication,
    ForwardComposition,
    ForwardSubstitution
)
rulesC  = [ForwardApplication,BackwardApplication] 
rulesC += [ForwardComposition,BackwardComposition,BackwardBx]
rulesC += [ForwardSubstitution,BackwardSx]
rulesR = [BinaryCombinatorRule(c) for c in rulesC]
# chart.ApplicationRuleSet for only < and >

parser = chart.CCGChartParser(lex, rulesR)

On lit les phrases depuis le fichier `phrases.txt`, et pour chacune, on imprime le nombre de dérivations trouvées, ainsi que le meilleur arbre de dérivation (i.e. de meilleur poids)

In [19]:
# On lit les phrases dans le fichier
with open('phrases.txt') as f:
    lines = f.readlines()

    # On ajoute des phrases de test
    lines.append("le chat et la souris dorment")
    
    for phrase in lines:
        # On met tout en minuscule
        phrase = phrase.lower().strip()
        phrase = "le méchant chat dort"
        
        # On compte les arbres de dérivation trouvés
        i = len(list(parser.parse(phrase.split())))
        print(i, "found derivation for sentence:",phrase)
        g = parser.parse(phrase.split())
        for t in g:
            chart.printCCGDerivation(t)
        
        # On affiche la dérivation la meilleure pour l'arbre
        if (i != 0):
            t,d = bestTree(phrase.split(), lex, rulesC)
            print("Best derivation tree has weight",d)
            chart.printCCGDerivation(t)
        
        print("#"*42)

2 found derivation for sentence: le méchant chat dort
   le    méchant  chat  dort
 (N/pN)  (pN/pN)   pN   (S\N)
        --------------->
              pN
----------------------->
           N
------------------------------<
              S
   le    méchant  chat  dort
 (N/pN)  (pN/pN)   pN   (S\N)
----------------->B
     (N/pN)
----------------------->
           N
------------------------------<
              S
==> 2 0
==> 2 1
==> 2 2
==> 3 0
==> 3 1
==> 4 0
Edge count: 1
Best derivation tree has weight 0.24957917865181148
   le    méchant  chat  dort
 (N/pN)  (pN/pN)   pN   (S\N)
        --------------->
              pN
----------------------->
           N
------------------------------<
              S
##########################################
2 found derivation for sentence: le méchant chat dort
   le    méchant  chat  dort
 (N/pN)  (pN/pN)   pN   (S\N)
        --------------->
              pN
----------------------->
           N
------------------------------<
             

In [None]:
print(lex)