In [1]:
import json
import numpy as np
import matplotlib.pyplot as plt
import time
from collections import defaultdict

In [2]:
train_set = json.load(open('./corpus/fr/fr.ftb.train.json', encoding = 'utf-8'))
test_set = json.load(open('./corpus/fr/fr.ftb.test.json', encoding = 'utf-8'))

In [3]:
def pick_out_punct(data_set):
    
    punct = set()
    
    for sentence,labels in data_set:
        for word,label in zip(sentence,labels):
            if label == 'PUNCT':
                punct.add(word)
    
    return punct
        

In [4]:
pick_out_punct(train_set)

{'!',
 '"',
 '(',
 '(*)',
 ')',
 ',',
 '-',
 '.',
 '...',
 '/',
 ':',
 ';',
 '=',
 '?',
 '[',
 '[…]',
 ']'}

In [5]:
def feature_window(i, sentence, l=2):
    
    res = []
    
    word = sentence[i]
    res.append(word)
    
    for k in range(1,l+1):
        
        res.append('win_-'+str(k)+'_'+(sentence[i-k] if i-k>=0 else 'none'))
        res.append('win_+'+str(k)+'_'+(sentence[i+k] if i+k<len(sentence) else 'none'))
        
    return res

In [6]:
def feature_suffix(i,sentence):
    
    res = []
    
    for k in range(1,len(sentence[i])):
        res.append('suffix_'+sentence[i][k:])
        
    return res

In [7]:
def feature_shape(i, sentence):

    def has_digit(s):

        return any(c.isdigit() for c in s)
    
    res = []
    
    word = sentence[i]
    
    if word.istitle():
        res.append('start_capital')
    if word.isupper():
        res.append('only_capital')
    if has_digit(word):
        res.append('has_digit')
    if '-' in word:
        res.append('has_hyphen')
    if '_' in word:
        res.append('has_hyphen_low')
    if not word.isalnum():
        res.append('not_alnum')
    if len(word) > 3:
         res.append('word_len_>_3')
    
    if '\'' in word:
        res.append('abbr')
    
    return res

In [8]:
def get_bigram(data_set):
    
    bigram_left = defaultdict(lambda: defaultdict(int))
    bigram_right = defaultdict(lambda: defaultdict(int))
    for sentence,labels in data_set:
        for i in range(1,len(sentence)):
            bigram_left[sentence[i]][sentence[i-1]] += 1
            bigram_right[sentence[i-1]][sentence[i]] += 1
            
            
    return bigram_left,bigram_right

In [9]:
def feature_distributional(i,sentence,bigram,direction):
    
    res = []
    
    word = sentence[i]
    
    bigram_words = bigram[word]

    m = sorted(bigram_words.items(), key = lambda item: item[1])
    
    freq_level = 0
    
    for k in m:
        freq_level += 1
        res.append(str(freq_level)+'_freq_'+k[0])
        
    
    
    return res

In [10]:
def collect_features_and_labels(data_set):
    
    data = []
    label = []
    
    punct = pick_out_punct(data_set)
    
    bigram_left,bigram_right = get_bigram(data_set)
    
    for sentence,labels in data_set:
        
        for i in range(len(sentence)):
            
            if sentence[i] in punct:
                data.append(list(sentence[i]))
            
            else:
                data_of_word = []

                data_of_word += feature_window(i, sentence)
                data_of_word += feature_suffix(i, sentence)
                data_of_word += feature_shape(i, sentence)
                #data_of_word += feature_distributional(i, sentence, bigram_left,'_left_')

                data.append(data_of_word)
        
        label += labels
            
    return data,label

In [11]:
def oov_features_and_labels(train_data,test_data,test_label):
    
    words = []
    data = []
    labels = []
    
    for word in train_data:
        words.append(word[0])
    
    train_words = set(words)
    
    for word,label in zip(test_data,test_label):
        if word[0] not in train_words:
            data.append(word)
            labels.append(label)
            
    return data,labels

In [21]:
def ambiguous_features_and_labels(input_data,input_label):
    
    words = defaultdict(lambda: set())
    data = []
    labels = []
    
    for word,label in zip(input_data,input_label):
        words[word[0]].add(label)
    
    for word,label in zip(input_data,input_label):
        if(len(words[word[0]]) > 1):
            data.append(word)
            labels.append(label)
    
    return data,labels
        

In [13]:
class simple_perceptron:
    
    def __init__(self):
        
        self.labels = []
        
        self.weights = defaultdict(lambda: defaultdict(float))
        
        self.weights_average = defaultdict(lambda: defaultdict(float))
        
    def fit(self,sentences,labels):
        
        self.labels = list(set(labels))
        
        for features,label in zip(sentences,labels):
            
            self.update(features,label)
            
            
            
    def predict(self,features):
        
        label_volt = np.zeros(len(self.labels))
        
        for feature in features:
            
            for label in self.labels:
                
                label_volt[self.labels.index(label)] += self.sigmoid(self.weights[feature][label])
         
        label_predict = self.labels[np.argmax(label_volt)]
        
        return label_predict
    
    
    
    def predict_all(self,sentences):
        
        labels_predict = []
        
        for features in sentences:
            
            labels_predict.append(self.predict(features))
            
        return labels_predict
    
    
    
    def evaluate(self,sentences,labels):
        
        labels_predict = self.predict_all(sentences)
        
        correct_num = 0.
        
        for label_predict,label_real in zip(labels_predict,labels):
            if label_predict == label_real:
                correct_num += 1.
        
        print('accuracy of simple perceptron model: '+str(correct_num/len(labels) * 100)+'%')
    
    
    def update(self,features,label_real):
        
        label_predict = self.predict(features)
        delta = 10**(-7)
            
        if (label_predict == label_real):
            return 
        
        for feature in features:
            
            self.weights_average[feature][label_predict] += np.log(0.9)**2
            self.weights_average[feature][label_real] += np.log(1.1)**2
            
            self.weights[feature][label_predict] += np.log(0.9)/(delta+np.sqrt(self.weights_average[feature][label_predict]))
            
            self.weights[feature][label_real] += np.log(1.1)/(delta+np.sqrt(self.weights_average[feature][label_real]))
            #print(self.weights[feature][label_real])
    
    def sigmoid(self,x):
        
        z = 1/(1 + np.exp(-x))
        
        return z

In [14]:
begin = time.time()
train_data,train_label = collect_features_and_labels(train_set)
end = time.time()
print('total time = ',end - begin)

test_data,test_label = collect_features_and_labels(test_set)

total time =  3.002885341644287


In [15]:
print(test_data[0])

['La', 'win_-1_none', 'win_+1_limite', 'win_-2_none', 'win_+2_des', 'suffix_a', 'start_capital']


In [16]:
oov_data,oov_label = oov_features_and_labels(train_data,test_data,test_label)

In [22]:
ambiguous_data,ambiguous_label = ambiguous_features_and_labels(train_data,train_label)

In [25]:
print(len(ambiguous_data))
print(len(train_data))

212787
442228


In [26]:
p = simple_perceptron()

In [27]:
p.fit(train_data,train_label)

In [28]:
p.evaluate(test_data,test_label)

accuracy of simple perceptron model: 94.94225620396148%


In [29]:
p.evaluate(oov_data,oov_label)

accuracy of simple perceptron model: 80.37974683544303%


In [30]:
p.evaluate(ambiguous_data,ambiguous_label)

accuracy of simple perceptron model: 95.0946251415735%
