In [5]:
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
def read_corpus_to_dataframe(filepath):
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split()
            if len(parts) == 2:
                token, label = parts
                data.append({'token': token, 'label': label})
                
    return pd.DataFrame(data)

df_train = read_corpus_to_dataframe('./data/es-en/train.conll')
df_dev = read_corpus_to_dataframe('./data/es-en/dev.conll')

df_train.head()

Unnamed: 0,token,label
0,11:11,other
1,.....,other
2,make,lang1
3,a,lang1
4,wish,lang1


CRFs rely heavily on feature extraction. For language detection in code-mixed text, consider these features:

### Word-level features: 
Word identity, word suffix, word length, etc.
### Contextual features: 
Previous and next word, previous and next label.
### Orthographic features: 
Capitalization, presence of digits, punctuation, etc.
### Lexical features: 
Whether the word is in a pre-compiled list of common words for a specific language.

In [13]:
def extract_features(df):
    features_list = []
    for index, row in df.iterrows():
        token = row['token']
        features = {
            'word': token,
            'is_first': index == 0,
            'is_last': index == len(df) - 1,
            'is_capitalized': token[0].upper() == token[0] if token else False,
            'is_all_caps': token.upper() == token if token else False,
            'is_all_lower': token.lower() == token if token else False,
            'prefix-1': token[0] if len(token) > 0 else '',
            'prefix-2': token[:2] if len(token) > 1 else token[0] if len(token) > 0 else '',
            'suffix-1': token[-1] if len(token) > 0 else '',
            'suffix-2': token[-2:] if len(token) > 1 else token[-1] if len(token) > 0 else '',
            'prev_word': df.iloc[index - 1]['token'] if index > 0 else '',
            'next_word': df.iloc[index + 1]['token'] if index < len(df) - 1 else '',
            'has_hyphen': '-' in token,
            'is_numeric': token.isdigit(),
        }
        # Append the dictionary to the list
        features_list.append(features)
    return features_list

features_train = extract_features(df_train)
features_dev = extract_features(df_dev)



In [16]:
features_train[:10]

[{'word': '11:11',
  'is_first': True,
  'is_last': False,
  'is_capitalized': True,
  'is_all_caps': True,
  'is_all_lower': True,
  'prefix-1': '1',
  'prefix-2': '11',
  'suffix-1': '1',
  'suffix-2': '11',
  'prev_word': '',
  'next_word': '.....',
  'has_hyphen': False,
  'is_numeric': False},
 {'word': '.....',
  'is_first': False,
  'is_last': False,
  'is_capitalized': True,
  'is_all_caps': True,
  'is_all_lower': True,
  'prefix-1': '.',
  'prefix-2': '..',
  'suffix-1': '.',
  'suffix-2': '..',
  'prev_word': '11:11',
  'next_word': 'make',
  'has_hyphen': False,
  'is_numeric': False},
 {'word': 'make',
  'is_first': False,
  'is_last': False,
  'is_capitalized': False,
  'is_all_caps': False,
  'is_all_lower': True,
  'prefix-1': 'm',
  'prefix-2': 'ma',
  'suffix-1': 'e',
  'suffix-2': 'ke',
  'prev_word': '.....',
  'next_word': 'a',
  'has_hyphen': False,
  'is_numeric': False},
 {'word': 'a',
  'is_first': False,
  'is_last': False,
  'is_capitalized': False,
  'is_all

In [24]:
X_train = extract_features(df_train)
y_train = df_train['label']
X_test = extract_features(df_dev)
y_test = df_dev['label']
X_test


[{'word': '@_easanti',
  'is_first': True,
  'is_last': False,
  'is_capitalized': True,
  'is_all_caps': False,
  'is_all_lower': True,
  'prefix-1': '@',
  'prefix-2': '@_',
  'suffix-1': 'i',
  'suffix-2': 'ti',
  'prev_word': '',
  'next_word': '@mememecaigo',
  'has_hyphen': False,
  'is_numeric': False},
 {'word': '@mememecaigo',
  'is_first': False,
  'is_last': False,
  'is_capitalized': True,
  'is_all_caps': False,
  'is_all_lower': True,
  'prefix-1': '@',
  'prefix-2': '@m',
  'suffix-1': 'o',
  'suffix-2': 'go',
  'prev_word': '@_easanti',
  'next_word': '#todossomoscarlosperez',
  'has_hyphen': False,
  'is_numeric': False},
 {'word': '#todossomoscarlosperez',
  'is_first': False,
  'is_last': False,
  'is_capitalized': True,
  'is_all_caps': False,
  'is_all_lower': True,
  'prefix-1': '#',
  'prefix-2': '#t',
  'suffix-1': 'z',
  'suffix-2': 'ez',
  'prev_word': '@mememecaigo',
  'next_word': '#hashtaglargo',
  'has_hyphen': False,
  'is_numeric': False},
 {'word': '#ha

In [26]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

# Fit the CRF model directly with the prepared data
crf.fit(X_train, y_train)


ValueError: The numbers of items and labels differ: |x| = 14, |y| = 5