In [1]:
from nltk.ccg import chart, lexicon
from nltk.ccg.lexicon import CCGLexicon, Token, augParseCategory
from nltk.ccg.chart import CCGChart,CCGLeafEdge
from nltk.tree import Tree
import pandas as pd
import numpy as np

# Weighted Lexicon

In [2]:
from warnings import warn

class WeighedToken(Token):
    """Lexicon token that carries a weight in addition to its category and semantics.

    The weight is stored on the instance as ``_weight`` and read back through
    ``weight()``.
    """

    def __init__(self, token, categ, semantics=None, weight = 1.0):
        """Build the token.

        token, categ, semantics -- forwarded unchanged to ``Token.__init__``.
        weight -- weight attached to this lexical entry (default 1.0).
        """
        super().__init__(token, categ, semantics= semantics)
        self._weight = weight

    def weight(self):
        """Return the token's weight; 1.0 is considered the default weight for any token.

        If the instance has no ``_weight`` attribute (e.g. it was built without
        running this class's ``__init__``), a warning is issued and 1.0 is returned.
        """
        try:
            return self._weight
        except AttributeError:
            # Look the word up defensively: this class never sets a `token`
            # attribute, so reading it directly would raise a second
            # AttributeError and turn this best-effort fallback into a crash.
            # NOTE(review): the base Token appears to keep the word in `_token`
            # -- confirm against nltk.ccg.lexicon.
            word = getattr(self, "_token", None)
            warn(f"[{word} : {str(self)}] : this token has no weight attribute, defaulted to 1.0.")
            return 1.0

class WeighedLexicon(CCGLexicon):
    """CCG lexicon whose entries are expected to expose a ``weight()`` method."""

    def __init__(self, start, primitives, families, entries):
        """Pass the four arguments, unchanged and in order, to ``CCGLexicon``."""
        super().__init__(start, primitives, families, entries)

    def weight(self, entry):
        """Return the weight reported by ``entry`` itself."""
        entry_weight = entry.weight()
        return entry_weight

# CYK

In [3]:
# Weight of each combinatory rule, keyed by the rule's string form.
valz = {
    '>' : 0.8,
    '<' : 0.7
}
def rweight(rule):
    """Return the weight of ``rule``.

    The rule is looked up in ``valz`` by its string form; any rule that is
    not listed there gets the base weight 1.0.
    """
    return valz.get(str(rule), 1.0)

# Implements the CYK algorithm, code partly taken from nltk
def weightedParse(tokens, lex, rules):
    """Fill a CCG chart bottom-up and attach a weight to the edges it creates.

    Made to take weighed tokens and lexicons.

    tokens -- the words of the sentence (materialised with ``list``).
    lex    -- lexicon; ``lex.categories(word)`` must yield tokens that expose
              a ``weight()`` method.
    rules  -- iterable of chart rules; each is called as
              ``rule.apply(chart, lex, left_edge, right_edge)``.

    Leaf edges receive their token's weight.  When a rule produces exactly
    one edge from a (left, right) pair, that edge receives
    ``rweight(rule) * left.weight * right.weight`` and a ``triple``
    attribute ``(rule, left, right)``.  A rule producing several edges at
    once is reported on stdout and those edges are left without a weight.

    Returns the filled chart.

    NOTE(review): nothing is pruned here, and the weight is written on the
    edge object handed back by ``rule.apply``.  Presumably, when an equal
    edge is already stored in the chart, that returned object is a discarded
    duplicate, so a stored edge keeps the weight of the first derivation that
    created it rather than the best one -- confirm against nltk's
    ``Chart.insert`` before relying on the weights as maxima.
    """
    # Named `wchart` so the imported nltk `chart` module is not shadowed.
    wchart = CCGChart(list(tokens))
    n_leaves = wchart.num_leaves()

    # Initialize leaf edges.
    for index in range(n_leaves):
        leaf = wchart.leaf(index)
        for token in lex.categories(leaf):
            new_edge = CCGLeafEdge(index, token, leaf)
            new_edge.weight = token.weight()
            wchart.insert(new_edge, ())

    # Select a span for the new edges, shortest spans first.
    for span in range(2, n_leaves + 1):
        for start in range(0, n_leaves - span + 1):
            end = start + span

            # Try every split point: left covers [start, mid), right [mid, end).
            for mid in range(start + 1, end):
                for left in wchart.select(span=(start, mid)):
                    for right in wchart.select(span=(mid, end)):
                        # Generate all possible combinations of the two edges
                        for rule in rules:
                            # Materialise the result so the rule runs to
                            # completion before its output is inspected.
                            edgez = list(rule.apply(wchart, lex, left, right))
                            if len(edgez) == 1:
                                edge = edgez[0]
                                edge.weight = rweight(rule) * left.weight * right.weight
                                # Back-pointer used by wpToTree.
                                edge.triple = (rule, left, right)
                            elif edgez:
                                print("Too many new edges (unsupported rule used)")
    return wchart

def wpToTree(edge):
    """Rebuild a derivation tree by following the ``triple`` back-pointers of ``edge``.

    A leaf edge becomes ``Tree((token, "Leaf"), [Tree(token, [leaf])])``.
    Any other edge must carry ``triple = (rule, child, ...)``; it becomes a
    tree labelled ``(Token(None, category), str(rule))`` whose children are
    the trees of the edges following the rule in the triple.
    """
    if isinstance(edge, CCGLeafEdge):
        token = edge.token()
        word_tree = Tree(token, [edge.leaf()])
        return Tree((token, "Leaf"), [word_tree])

    label = (chart.Token(None, edge.categ()), str(edge.triple[0]))
    subtrees = [wpToTree(child) for child in edge.triple[1:]]
    return Tree(label, subtrees)

def bestTree(tokens, lex, rules):
    """Return ``(tree, weight)`` for the heaviest complete derivation edge.

    tokens -- the words of the sentence.
    lex    -- weighted lexicon, also providing the start category via ``lex.start()``.
    rules  -- rule set handed to ``weightedParse``.

    Only edges that span the whole sentence AND whose category is the
    lexicon's start category are considered; among them the one with the
    largest ``weight`` wins (the first one on ties).

    Raises ValueError when the sentence has no complete derivation.

    NOTE(review): the tree is the first one the chart enumerates for the
    chosen edge -- presumably the derivation its weight was computed from;
    confirm against ``CCGChart._trees``.
    """
    # We build the weighted parse chart using CKY
    wChart = weightedParse(tokens, lex, rules)
    # We keep only the edges covering the whole sentence with the start category
    roots = list(wChart.select(start=0, end=wChart.num_leaves(), lhs=lex.start()))
    if not roots:
        raise ValueError("no complete derivation found for the given tokens")
    # We get the heaviest of those edges
    e = max(roots, key=lambda edge: edge.weight)
    # We get the tree that brought us to this edge
    t = wChart._trees(e, True, dict(), Tree)[0]
    return (t, e.weight)

# Application

In [4]:
from numbers import Number
from nltk.sem.logic import Expression
from nltk.ccg.api import PrimitiveCategory

def to_pseudo_entries(table, consider_semantics = False, max_categories = 3):
    """Flatten the lexicon table into a list of ``[word, category, weight, semantic]`` lists.

    The result is left to be converted into tokens by ``to_wlex_entries``.

    table -- mapping of column name to an indexable column.  Columns read:
             'MOT' (one or several words separated by '/'), and for each
             j in ``range(max_categories)``: 'Cat<j>', 'Weights<j>' and,
             when ``consider_semantics`` is true, 'Sem<j>'.
    consider_semantics -- when false, ``semantic`` is None; when true it is the
             '/'-separated part of 'Sem<j>' at the same position as the word
             in 'MOT' (doubled backslashes are collapsed to single ones).
    max_categories -- how many 'Cat<j>' columns to scan per row (default 3).

    A 'Cat<j>' cell that is not a string is skipped.  A weight that is not a
    number, or that is NaN (how an empty spreadsheet cell is typically
    loaded), falls back to 1.0.
    """
    import math  # local import: only needed for the NaN check below

    entries = list()
    for line in range(len(table['MOT'])):
        words = table['MOT'][line].replace(" ", "").split('/')
        for wdi, word in enumerate(words):
            for j in range(max_categories):
                category = table['Cat'+str(j)][line]
                if not isinstance(category, str):
                    continue

                raw_weight = table['Weights'+str(j)][line]
                weight = float(raw_weight) if isinstance(raw_weight, Number) else 1.0
                if math.isnan(weight):
                    # NaN is a Number, but it means "no weight given".
                    weight = 1.0

                if consider_semantics:
                    semantic = (table['Sem'+str(j)][line].replace('\\\\', '\\').split('/'))[wdi]
                else:
                    semantic = None
                entries.append([word, category, weight, semantic])
    return entries

def to_wlex_entries(pseudo_entries, primitives, families, var=None):
    """Return the entries of a weighed lexicon built from ``to_pseudo_entries`` output.

    pseudo_entries -- iterable of ``[word, category, weight, semantic]``
                      (the semantic is read from the last position).
    primitives, families, var -- forwarded to ``augParseCategory`` to parse
                      each category string.

    Returns a dict mapping each word to the list of ``WeighedToken`` built
    for it, in input order.  A ``None`` semantic (what ``to_pseudo_entries``
    emits when semantics are not considered) is kept as ``None`` instead of
    being handed to the logic parser, which only accepts strings.
    """
    entries = dict()
    for entry in pseudo_entries:
        word, category, weight, semantic = entry[0], entry[1], entry[2], entry[-1]
        categ, _ = augParseCategory(category, primitives, families, var)
        semantics = Expression.fromstring(semantic) if semantic is not None else None
        token = WeighedToken(token= word,
                             categ= categ,
                             semantics= semantics,
                             weight= weight)
        entries.setdefault(word, list()).append(token)
    return entries
    

In [9]:
# Primitive categories and families
primitives = ['S', 'N', 'Pp']
# 'V' is registered as a family name standing for the category S\N.
V = augParseCategory("S\\N", primitives = primitives, families={})
families = {'V': V}

# Load the lexicon from the spreadsheet
table = pd.read_excel("CategoriesGramaticalesCombinatoire.ods", engine="odf")

# Convert it into a weighted lexicon
pe = to_pseudo_entries(table, consider_semantics = True)
wEntries = to_wlex_entries(pseudo_entries= pe, primitives= primitives, families= families)
lex = WeighedLexicon(start= 'S', primitives= primitives, families= families, entries= wEntries)

# Create the parser once, with the same rule set that bestTree is given below.
parser = chart.CCGChartParser(lex, chart.ApplicationRuleSet)

printTotal=True
printDerivations=not printTotal

# Read the sentences from the file; the handle is closed before any parsing starts.
with open('phrases.txt') as f:
    lines = f.readlines()

lines.append("le chat et la souris dorment")

for phrase in lines:
    # Lowercase everything
    phrase = phrase.lower().strip()
    if not phrase:
        # A blank line is not a sentence.
        continue
    words = phrase.split()

    if printDerivations:
        print("="*77)
        print('#',phrase)

    # Count (and optionally display) every derivation tree found
    i=0
    for parse in parser.parse(words):
        i+=1
        if printDerivations:
            chart.printCCGDerivation(parse)

    if printTotal:
        print(i,phrase)

    # Display the best derivation for the sentence
    if (i==0):
        print("Pas de dérivation tout court :/")
    else:
        t,d = bestTree(words, lex, chart.ApplicationRuleSet)
        print("Found derivation tree with weight",d)
        chart.printCCGDerivation(t)

{}
\n v.à(v,n)
\n v.avec(v,n)
chat
\n m.de(n,m)
dents
\n.donne(n)
\n m.donne(n,m)
\n.mange(n)
\n m.mange(n,m)
donner
\n.donner(n)
\n.dormir(n)
\n.dormir(n)
elle
il
{}
{}
{}
\n m.et(n,m)
\x y.et(x,y)
\v w n.et(v,w,n)
fromage
{}
{}
{}
\P.exists x.P(x)
\P.exists x.P(x)
\P.exists x.P(x)
\P.exists x.P(x)
\P.exists x.P(x)
\n.mangé(n)
\n.mangé(n)
\n.donné(n)
\n.méchant(n)
\n.méchant(n)
\n.noir(n)
\n.noir(n)
\v n.paisiblement(v,n)
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
rat
soeur
\n m.souhaite(m,n)
\n m.pourchasse(m,n)
\n m.attrappe(m,n)
souris
{}
voisin
1 le chat dort
Found derivation tree with weight 0.5599999999999999
            le               chat            dort
 (N/N) {\P.exists x.P(x)}  N {chat}  (S\N) {\n.dormir(n)}
------------------------------------>
        N {exists x.chat(x)}
----------------------------------------------------------<
               S {dormir(exists x.chat(x))}
1 il dort
Found derivation tree with weight 0.7
   il            dort
 N {il}  (S\N) {\n.dormir(n)}
----------

In [10]:
# Dump the whole lexicon: one line per word, `word => category {semantics}`,
# with alternative entries for the same word separated by ` | `.
print(lex)

? => (S\S) {{}}
attrape => ((S\N)/N) {\n m.attrappe(m,n)}
avec => (((S\N)\(S\N))/N) {\n v.avec(v,n)}
chat => N {chat}
de => ((N/N)\N) {\n m.de(n,m)}
dents => N {dents}
donne => (S\N) {\n.donne(n)} | ((S\N)/N) {\n m.donne(n,m)}
donner => N {donner} | (N/N) {\n.donner(n)}
donné => Pp {\n.donné(n)}
dorment => (S\N) {\n.dormir(n)}
dort => (S\N) {\n.dormir(n)}
elle => N {elle}
est => ((S\N)/Pp) {{}} | ((S\N)/(N/N)) {{}} | ((S\N)/(N\N)) {{}}
et => ((N/N)\N) {\n m.et(n,m)} | ((S/S)\S) {\x y.et(x,y)} | (((S\N)/(S\N))\(S\N)) {\v w n.et(v,w,n)}
fromage => N {fromage}
il => N {il}
la => ((S\N)/((S\N)/N)) {{}} | (N/N) {\P.exists x.P(x)}
le => ((S\N)/((S\N)/N)) {{}} | (N/N) {\P.exists x.P(x)}
lui => ((S\N)/(S\N)) {{}}
mange => (S\N) {\n.mange(n)} | ((S\N)/N) {\n m.mange(n,m)}
mangé => Pp {\n.mangé(n)}
mangée => Pp {\n.mangé(n)}
mon => (N/N) {\P.exists x.P(x)}
méchant => (N/N) {\n.méchant(n)} | (N\N) {\n.méchant(n)}
noir => (N\N) {\n.noir(n)} | (N/N) {\n.noir(n)}
paisiblement => ((S\N)\(S\N)) {\v n.