In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF, metrics
from sklearn_crfsuite.metrics import flat_classification_report
import pickle

In [3]:


# Load the dataset
file_path = '/home/ahmedabdullahi/NLP590/NLPJobsFinder/Data/ner_training_data.csv'  # Replace with your file path
training_data = pd.read_csv(file_path)


In [4]:

# Prepare data for NER
def prepare_data_for_crf(data):
    """
    Converts the dataframe into a format suitable for CRF training.
    Groups tokens and labels by sentence.
    """
    sentences = []
    current_sentence = []
    current_labels = []

    for _, row in data.iterrows():
        token, label = row['Token'], row['Label']
        current_sentence.append(token)
        current_labels.append(label)

        # Simulate end of sentence based on some condition (e.g., a period or max tokens)
        if token.endswith('.') or len(current_sentence) >= 10:
            sentences.append((current_sentence, current_labels))
            current_sentence = []
            current_labels = []

    if current_sentence:  # Add any remaining tokens as the last sentence
        sentences.append((current_sentence, current_labels))

    return sentences



In [5]:
# Convert the dataset into sentences for CRF training
prepared_sentences = prepare_data_for_crf(training_data)

# Split into train and test datasets
train_data, test_data = train_test_split(prepared_sentences, test_size=0.2, random_state=42)

In [6]:


# Convert into a feature format suitable for sklearn-crfsuite
def extract_features(sentence):
    """
    Extracts features from tokens in a sentence for CRF training.
    """
    return [{"word": word} for word in sentence]

def prepare_for_crf(data):
    """
    Prepares tokens and labels for CRF model training/testing.
    """
    X, y = [], []
    for tokens, labels in data:
        X.append(extract_features(tokens))
        y.append(labels)
    return X, y

X_train, y_train = prepare_for_crf(train_data)
X_test, y_test = prepare_for_crf(test_data)



In [7]:
# Train CRF model
crf = CRF(algorithm='lbfgs', max_iterations=100, all_possible_transitions=True)
crf.fit(X_train, y_train)



In [8]:
# Evaluate the model
y_pred = crf.predict(X_test)
report = flat_classification_report(y_test, y_pred)




In [9]:
print("Evaluation Report:")
print(report)


Evaluation Report:
              precision    recall  f1-score   support

      B-City       0.98      1.00      0.99      9586
   B-Country       1.00      1.00      1.00      9556
    B-Remote       1.00      1.00      1.00      9553
      I-City       0.99      0.91      0.95      2637
   I-Country       1.00      0.99      0.99      1801
           O       0.83      1.00      0.91        50

    accuracy                           0.99     33183
   macro avg       0.97      0.98      0.97     33183
weighted avg       0.99      0.99      0.99     33183



In [10]:
# Save the trained model
model_file = 'ner_model.pkl'
with open(model_file, 'wb') as f:
    pickle.dump(crf, f)
print(f"Model saved to {model_file}")

Model saved to ner_model.pkl


In [13]:


# Test the model on new input
def predict_ner(text, model):
    """
    Tokenize input text and predict NER labels using the trained model.
    """
    tokens = text.split()  # Simple tokenization
    features = extract_features(tokens)
    predictions = model.predict([features])[0]
    return list(zip(tokens, predictions))



In [15]:
# Example input
input_text = "This location Krakow, Poland and Remote description: Series of 2 interviews first on soft skills, second on technical skills."
with open(model_file, 'rb') as f:
    loaded_model = pickle.load(f)

predicted_ner = predict_ner(input_text, loaded_model)
print("\nPredictions on input text:")
for token, label in predicted_ner:
    print(f"{token}: {label}")



Predictions on input text:
This: O
location: O
Krakow,: O
Poland: O
and: O
Remote: O
description:: O
Series: O
of: O
2: O
interviews: O
first: O
on: O
soft: O
skills,: O
second: O
on: O
technical: O
skills.: O
