In [11]:
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers


In [12]:
def read_corpus_to_dataframe(filepath):
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split()
            if len(parts) == 2:
                token, label = parts
                data.append({'token': token, 'label': label})
                
    return pd.DataFrame(data)
    

CRFs rely heavily on feature extraction. For language detection in code-mixed text, consider these features:

### Word-level features: 
Word identity, word suffix, word length, etc.
### Contextual features: 
Previous and next word, previous and next label.
### Orthographic features: 
Capitalization, presence of digits, punctuation, etc.
### Lexical features: 
Whether the word is in a pre-compiled list of common words for a specific language.

In [13]:
def extract_features(df):
    # Create a list to hold feature dictionaries for each token
    features_list = []
    # Iterate through each row in the dataframe
    for index, row in df.iterrows():
        token = row['token']
        # Extract features for the current token
        features = {
            'word': token,
            'is_first': index == 0,
            'is_last': index == len(df) - 1,
            'is_capitalized': token[0].upper() == token[0] if token else False,
            'is_all_caps': token.upper() == token if token else False,
            'is_all_lower': token.lower() == token if token else False,
            'prefix-1': token[0] if len(token) > 0 else '',
            'prefix-2': token[:2] if len(token) > 1 else token[0] if len(token) > 0 else '',
            'suffix-1': token[-1] if len(token) > 0 else '',
            'suffix-2': token[-2:] if len(token) > 1 else token[-1] if len(token) > 0 else '',
            'prev_word': df.iloc[index - 1]['token'] if index > 0 else '',
            'next_word': df.iloc[index + 1]['token'] if index < len(df) - 1 else '',
            'has_hyphen': '-' in token,
            'is_numeric': token.isdigit(),
        }
        # Append the dictionary to the list
        features_list.append(features)
    return features_list


In [14]:
def prepare_data(df):
    # Extract features for the entire DataFrame
    X_train = extract_features(df)
    y_train = df['label'].tolist()
    return X_train, y_train

# Assuming 'read_corpus_to_dataframe' and 'extract_features' are already defined
filepath = "./data/es-en/train.conll"
df = read_corpus_to_dataframe(filepath)
X_train, y_train = prepare_data(df)


In [15]:
# CRF Model
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

ValueError: The numbers of items and labels differ: |x| = 14, |y| = 5