In [146]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

In [132]:
def read_corpus_to_dataframe(filepath, excluded_labels=('unk', 'ne', 'fw', 'ambiguous')):
    """Read a whitespace-separated ``token label`` file into a DataFrame.

    Only lines that split into exactly two whitespace-separated fields are
    kept; every other line (blank lines, lines with one field or with three
    or more fields) is skipped. Rows whose label is in ``excluded_labels``
    are dropped as well.

    Parameters
    ----------
    filepath : str or path-like
        Path of the UTF-8 encoded corpus file.
    excluded_labels : iterable of str, optional
        Labels to discard. Defaults to the four labels the notebook has
        always filtered out: ``'unk'``, ``'ne'``, ``'fw'``, ``'ambiguous'``.

    Returns
    -------
    pandas.DataFrame
        One row per kept token, with columns ``token`` and ``label`` in file
        order. The two columns are present even when no row survives, so
        downstream ``df['label']`` lookups do not fail on an empty corpus.
    """
    excluded = frozenset(excluded_labels)
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            # str.split() with no argument already ignores leading/trailing
            # whitespace, including the newline.
            parts = line.split()
            if len(parts) != 2:
                continue
            token, label = parts
            if label in excluded:
                continue
            data.append({'token': token, 'label': label})

    # Explicit columns keep the schema stable when `data` is empty.
    return pd.DataFrame(data, columns=['token', 'label'])

# Load the Spanish-English training and development splits into DataFrames.
df_train = read_corpus_to_dataframe('./data/es-en/train.conll')
df_dev = read_corpus_to_dataframe('./data/es-en/dev.conll')

# Preview the first 1719 training rows.
df_train.head(1719)

Unnamed: 0,token,label
0,11:11,other
1,.....,other
2,make,lang1
3,a,lang1
4,wish,lang1
...,...,...
1714,Picheale,lang2
1715,al,lang2
1716,",",other
1717,comete,lang2


In [133]:
# Count how many training tokens carry each label (most frequent first).
df_train['label'].value_counts()

label
lang2    112988
lang1     80437
other     54060
mixed        42
Name: count, dtype: int64

In [134]:
def extract_features(df):
    """Build one feature dict per row of ``df`` from its ``token`` column.

    Each dict has the keys:

    * ``word``      - the token itself
    * ``prev_word`` - token of the preceding row, ``''`` for the first row
    * ``next_word`` - token of the following row, ``''`` for the last row
    * ``bigram``    - list of character 2-grams of the token
    * ``trigram``   - list of character 3-grams of the token

    Neighbours are looked up by row *position*, so the result does not depend
    on the frame's index labels (a filtered or re-indexed frame works the same
    as one with a default 0..n-1 index). The frame is treated as one flat
    stream of tokens: the previous/next word is simply the adjacent row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``token`` column.

    Returns
    -------
    list of dict
        Feature dicts in row order; ``[]`` for an empty frame.
    """
    def n_grams(word, n):
        """Generate the character n-grams of ``word`` (empty if it is shorter than n)."""
        return [word[i:i+n] for i in range(len(word)-(n-1))]

    # An empty frame may not even have a 'token' column; nothing to extract.
    if df.empty:
        return []

    # Pull the column out once; indexing a plain list is far cheaper than
    # calling df.iloc for every row inside iterrows().
    tokens = df['token'].tolist()
    last_pos = len(tokens) - 1

    return [
        {
            'word': token,
            'prev_word': tokens[pos - 1] if pos > 0 else '',
            'next_word': tokens[pos + 1] if pos < last_pos else '',
            'bigram': n_grams(token, 2),
            'trigram': n_grams(token, 3)
        }
        for pos, token in enumerate(tokens)
    ]

In [164]:
# Build per-token feature dicts (X) and gold labels (y) for both splits.
# The "test" variables are built from the dev split (df_dev).
# NOTE(review): X_* is a flat list of dicts and y_* a flat pandas Series.
# sklearn-crfsuite documents CRF.fit/predict as taking a list of sequences
# (one inner list of feature dicts / labels per sentence) - confirm that
# passing the flat, un-nested form is what is intended here.
X_train = extract_features(df_train)
y_train = df_train['label']
X_test = extract_features(df_dev)
y_test = df_dev['label']

label
lang2    112988
lang1     80437
other     54060
mixed        42
Name: count, dtype: int64

In [145]:
# Inspect the feature dicts of the first three training tokens.
X_train[:3]

[{'word': '11:11',
  'prev_word': '',
  'next_word': '.....',
  'bigram': ['11', '1:', ':1', '11'],
  'trigram': ['11:', '1:1', ':11']},
 {'word': '.....',
  'prev_word': '11:11',
  'next_word': 'make',
  'bigram': ['..', '..', '..', '..'],
  'trigram': ['...', '...', '...']},
 {'word': 'make',
  'prev_word': '.....',
  'next_word': 'a',
  'bigram': ['ma', 'ak', 'ke'],
  'trigram': ['mak', 'ake']}]

In [136]:
# Linear-chain CRF trained with L-BFGS; c1 and c2 are the regularisation
# coefficients (see the sklearn-crfsuite CRF documentation).
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
# Known incompatibility: fitting can raise
#   AttributeError: 'CRF' object has no attribute 'keep_tempfiles'
# The alternative is to install a scikit-learn version below 0.24.
# Only that specific error is tolerated; any other AttributeError is a real
# problem and is re-raised instead of being silently swallowed.
try:
    crf.fit(X_train, y_train)
except AttributeError as err:
    if 'keep_tempfiles' not in str(err):
        raise



In [158]:
# Predict on the held-out features and print the accuracy.
# X_test / y_test were built from the dev split (df_dev), so this is a
# dev-set score.
# NOTE(review): flat_accuracy_score presumably flattens nested label
# sequences before comparing - confirm it behaves as expected when y_test is
# a flat pandas Series of label strings.
predictions = crf.predict(X_test)
print(metrics.flat_accuracy_score(y_test, predictions))

0.7170240234918867
