In [4]:
!pip install sklearn_crfsuite
!pip install sklearn

Defaulting to user installation because normal site-packages is not writeable
Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn_crfsuite-0.5.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mp

In [10]:
import json
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
import sklearn_crfsuite
from sklearn.preprocessing import LabelEncoder


In [11]:


# -------------------------------
# 1. Read data into a DataFrame
# -------------------------------
# Here we use pd.read_json to read the JSON file into a DataFrame.
df = pd.read_json('Augmented_Annotated_JSON_1103.json')

# The DataFrame should have columns 'tokens', 'labels', and 'text'.
# Convert tokens and labels columns to lists (one per sentence)
sentences = df['tokens'].tolist()
labels = df['labels'].tolist()

# Every token must carry exactly one label. A mismatch would otherwise only
# show up later (or silently misalign features and labels once the sentences
# are flattened), so fail early and name the offending rows.
misaligned = [
    row
    for row, (sent_tokens, sent_labels) in enumerate(zip(sentences, labels))
    if len(sent_tokens) != len(sent_labels)
]
if misaligned:
    raise ValueError(
        f"{len(misaligned)} sentence(s) have a different number of tokens and "
        f"labels; first offending row positions: {misaligned[:10]}"
    )

# -------------------------------
# 2. Define feature extraction functions
# -------------------------------
def word2features(sent, i):
    """Build the feature dictionary for the token at position ``i`` of ``sent``.

    Parameters:
    - sent: sequence of token strings making up one sentence.
    - i: index of the token to describe.

    Returns:
    - dict with a constant bias, casing/digit flags and 3-character
      prefix/suffix of the token itself, plus lowercase form and casing flags
      of the previous and next token. A missing neighbour is signalled with
      'BOS' (no previous token) or 'EOS' (no next token) instead.
    """
    current = sent[i]
    has_left_neighbour = i > 0
    has_right_neighbour = i < len(sent) - 1

    # Features of the token itself.
    feats = {
        'bias': 1.0,
        'word.lower()': current.lower(),
        'word.isupper()': current.isupper(),
        'word.istitle()': current.istitle(),
        'word.isdigit()': current.isdigit(),
        'prefix(3)': current[:3],
        'suffix(3)': current[-3:],
    }

    # Left context, or the beginning-of-sentence marker.
    if not has_left_neighbour:
        feats['BOS'] = True
    else:
        left = sent[i - 1]
        feats['-1:word.lower()'] = left.lower()
        feats['-1:word.istitle()'] = left.istitle()
        feats['-1:word.isupper()'] = left.isupper()

    # Right context, or the end-of-sentence marker.
    if not has_right_neighbour:
        feats['EOS'] = True
    else:
        right = sent[i + 1]
        feats['+1:word.lower()'] = right.lower()
        feats['+1:word.istitle()'] = right.istitle()
        feats['+1:word.isupper()'] = right.isupper()

    return feats

def sent2features(sent):
    """Return the list of per-token feature dicts for ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

# -------------------------------
# 3. Prepare features for CRF (sequence data) and Random Forest (flattened)
# -------------------------------

# Sequence view (CRF): one list of feature dicts per sentence, paired with
# that sentence's list of labels.
X_seq = list(map(sent2features, sentences))
y_seq = labels

# Token view (Random Forest): every token of every sentence becomes an
# independent instance.
X_flat, y_flat = [], []
for sent_feats, sent_labels in zip(X_seq, y_seq):
    X_flat += sent_feats
    y_flat += sent_labels

# -------------------------------
# 4. Split data into training and test sets (at sentence level)
# -------------------------------
# Splitting sentence positions (not tokens) keeps all tokens of a sentence
# in the same partition.
sent_indices = list(range(len(sentences)))
train_idx, test_idx = train_test_split(
    sent_indices,
    test_size=0.3,
    random_state=42,
)

X_train_seq, y_train_seq = [X_seq[i] for i in train_idx], [y_seq[i] for i in train_idx]
X_test_seq, y_test_seq = [X_seq[i] for i in test_idx], [y_seq[i] for i in test_idx]

# Collapse a list of per-sentence lists (features or labels) into one flat list
def flatten(sent_list):
    """Concatenate the items of every sentence in ``sent_list`` into a new list.

    The order of sentences, and of items within each sentence, is preserved.
    """
    return [item for sent in sent_list for item in sent]

# Token-level views of the sentence-level split: reuse the per-sentence
# train/test lists built above instead of re-selecting them from X_seq/y_seq.
X_train_flat = flatten(X_train_seq)
y_train_flat = flatten(y_train_seq)
X_test_flat = flatten(X_test_seq)
y_test_flat = flatten(y_test_seq)

# -------------------------------
# 5. Train Random Forest classifier
# -------------------------------
# Vectorize dictionary features.
# String-valued features (word forms, prefixes, suffixes) are one-hot encoded,
# so the matrix has one column per distinct value. Keeping it sparse avoids
# allocating a dense tokens x columns float array; the Random Forest accepts
# sparse input for both fit and predict.
vec = DictVectorizer(sparse=True)
X_train_vec = vec.fit_transform(X_train_flat)
X_test_vec = vec.transform(X_test_flat)

# Encode labels as integers
le = LabelEncoder()


In [14]:
# Fit the encoder on the label vocabulary of both splits. Fitting on the
# training labels alone makes le.transform(y_test_flat) raise as soon as a rare
# label occurs only in the test split. Only label *names* are shared here; the
# classifier itself still sees training data only, and when every test label
# also occurs in training the resulting encoding is identical.
le.fit(y_train_flat + y_test_flat)
y_train_enc = le.transform(y_train_flat)
y_test_enc = le.transform(y_test_flat)

# Initialize and train Random Forest
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train_vec, y_train_enc)
y_pred_rf_enc = rf_clf.predict(X_test_vec)
# Map the integer predictions back to label strings for reporting.
y_pred_rf = le.inverse_transform(y_pred_rf_enc)

# Evaluate Random Forest
print("Random Forest Classification Report (Token-level):")
print(classification_report(y_test_flat, y_pred_rf))
print("Random Forest Accuracy:", accuracy_score(y_test_flat, y_pred_rf))

# -------------------------------
# 6. Train CRF using sklearn_crfsuite
# -------------------------------
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

# The CRF consumes whole sentences (lists of feature dicts), not single tokens.
crf.fit(X_train_seq, y_train_seq)
y_pred_crf = crf.predict(X_test_seq)

# Flatten predictions and true labels for CRF evaluation
y_test_crf_flat = flatten(y_test_seq)
y_pred_crf_flat = flatten(y_pred_crf)

print("\nCRF Classification Report (Token-level):")
print(classification_report(y_test_crf_flat, y_pred_crf_flat))
print("CRF Accuracy:", accuracy_score(y_test_crf_flat, y_pred_crf_flat))

Random Forest Classification Report (Token-level):
              precision    recall  f1-score   support

  B-BY-VALUE       0.33      0.04      0.07        52
   B-CY-UNIT       0.50      0.44      0.47       195
  B-CY-VALUE       0.93      0.99      0.96      1563
       B-KPI       0.98      0.98      0.98      1974
       B-ORG       0.93      0.84      0.88       164
   B-PY-UNIT       0.41      0.62      0.49       107
  B-PY-VALUE       0.95      0.98      0.97      1211
   B-PY-YEAR       1.00      1.00      1.00        10
      B-UNIT       0.64      0.37      0.47       112
     B-VALUE       0.99      0.85      0.91       276
  E-BY-VALUE       1.00      1.00      1.00        52
   E-CY-UNIT       1.00      0.85      0.92       222
  E-CY-VALUE       0.86      0.92      0.89      1405
       E-KPI       0.99      0.99      0.99      1931
       E-ORG       0.95      0.99      0.97       128
   E-PY-UNIT       0.82      1.00      0.90       127
  E-PY-VALUE       0.95      0

In [29]:
# -------------------------------
# Inference Functions
# -------------------------------

def infer_rf(sentence_tokens):
    """
    Perform inference on a tokenized sentence using the Random Forest classifier.

    Parameters:
    - sentence_tokens: list of tokens (strings) from the sentence.

    Returns:
    - NumPy array with one predicted label string per token (the result of
      le.inverse_transform); an empty array when no tokens are given.
    """
    # DictVectorizer.transform rejects an empty sample sequence, so an empty
    # sentence (e.g. a blank line) is answered directly with "no labels".
    if len(sentence_tokens) == 0:
        return np.array([], dtype=str)

    # Same per-token features as used for training and by infer_crf.
    features = sent2features(sentence_tokens)

    # Transform dictionary features into a feature matrix using the trained DictVectorizer.
    X_vec = vec.transform(features)

    # Predict with Random Forest. These predictions are encoded.
    y_pred_enc = rf_clf.predict(X_vec)

    # Convert encoded predictions back to original labels.
    return le.inverse_transform(y_pred_enc)

def infer_crf(sentence_tokens):
    """
    Label a single tokenized sentence with the trained CRF model.

    Parameters:
    - sentence_tokens: list of tokens (strings) from the sentence.

    Returns:
    - The predicted label sequence for the sentence, one label per token.
    """
    # crf.predict works on a batch of sentences, so wrap this sentence's
    # feature dicts in a one-element batch and unwrap the single result.
    single_sentence_batch = [sent2features(sentence_tokens)]
    return crf.predict(single_sentence_batch)[0]

# -------------------------------
# Example Inference
# -------------------------------

# Raw sample sentence for inference (not yet tokenized)
sample_sentence = "As of 31 December 2023, 154,8621 employees hold a working contract with Allianz."
# For simplicity, we use a basic whitespace split, which leaves punctuation
# attached to the neighbouring word (e.g. '2023,' and 'Allianz.').
# NOTE(review): confirm this matches how the training tokens were produced;
# in production, you may use a more robust tokenizer.
tokens = sample_sentence.split()

print("Tokens:", tokens)

# Inference using Random Forest (each token is classified independently)
rf_predictions = infer_rf(tokens)
print("Random Forest Predictions:", rf_predictions)

# Inference using CRF (the sentence is labelled as one sequence)
crf_predictions = infer_crf(tokens)
print("CRF Predictions:", crf_predictions)


Tokens: ['As', 'of', '31', 'December', '2023,', '154,8621', 'employees', 'hold', 'a', 'working', 'contract', 'with', 'Allianz.']
Random Forest Predictions: ['O' 'O' 'O' 'S-MONTH' 'S-CY-YEAR' 'S-VALUE' 'B-KPI' 'O' 'O' 'O' 'O' 'O'
 'O']
CRF Predictions: ['O' 'O' 'O' 'S-MONTH' 'O' 'O' 'B-KPI' 'E-KPI' 'O' 'O' 'O' 'O' 'O']


In [31]:
df_test = pd.read_csv('example_output.csv')
# .iloc states explicitly that rows 155..164 are selected by position.
sample_sentence = df_test['Sentence'].iloc[155:165]

for sentence in sample_sentence:
    # Empty CSV cells are read as NaN (a float), which has no .split().
    if not isinstance(sentence, str):
        continue

    tokens = sentence.split()
    # A whitespace-only sentence yields no tokens; there is nothing to label.
    if not tokens:
        continue
    print("Tokens:", tokens)

    # Inference using Random Forest
    rf_predictions = infer_rf(tokens)
    print("Random Forest Predictions:", rf_predictions)

    # Inference using CRF
    crf_predictions = infer_crf(tokens)
    print("CRF Predictions:", crf_predictions)

Tokens: ['pension', 'obligations', 'fully', 'covered', 'provisions.']
Random Forest Predictions: ['I-KPI' 'E-KPI' 'O' 'O' 'O']
CRF Predictions: ['B-KPI' 'E-KPI' 'O' 'O' 'O']
Tokens: ['Provisions', 'pension', 'obligations', '(pension', 'provisions)', 'ar']
Random Forest Predictions: ['O' 'I-KPI' 'E-KPI' 'O' 'O' 'O']
CRF Predictions: ['B-KPI' 'I-KPI' 'I-KPI' 'I-KPI' 'E-KPI' 'O']
Tokens: ['€112', 'million', '(prior', 'year:', '€370', 'million;', 'difference', 'pursuant', 'section', '253', '(6)', 'HGB)', 'lower', 'carrying', 'pension', 'provisions', 'recorded', 'December', '31,', '2023', 'seven-year', 'average', 'interest', 'rate', 'applied.']
Random Forest Predictions: ['B-CY-VALUE' 'E-CY-VALUE' 'O' 'O' 'B-PY-VALUE' 'E-PY-VALUE' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'I-KPI' 'O' 'O' 'S-MONTH' 'O' 'S-CY-YEAR' 'O' 'O' 'O'
 'O' 'O']
CRF Predictions: ['O' 'O' 'O' 'O' 'B-PY-VALUE' 'E-PY-VALUE' 'B-KPI' 'I-KPI' 'I-KPI' 'I-KPI'
 'I-KPI' 'E-KPI' 'O' 'O' 'B-KPI' 'E-KPI' 'O' 'S-MONTH' 'O' 'S-CY-YEAR' 'O'
