In [11]:
import spacy
from sklearn_crfsuite import CRF, metrics
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [12]:
data = pd.read_csv("preprocessed_commentary.csv")  # Use the uploaded file path here
print(data.head())


   ball score                                         commentary
0  19.2     W  Parag to Madushanka, OUTParag closes it out wi...
1  19.1     W  Parag to Theekshana, OUTParag helps himself to...
2  18.6   â€¢  Siraj to Asitha Fernando, no runmighty full on...
3  18.5     W  Siraj to Pathirana, OUTon a length and slantin...
4  18.4     4  Siraj to Pathirana,  4 runsPathirana gets a ta...


In [14]:
nlp = spacy.load("en_core_web_sm")

# Function to extract features using spaCy
def extract_features(doc):
    features = []
    for token in doc:
        features.append({
            'text': token.text,
            'lemma': token.lemma_,
            'pos': token.pos_,
            'tag': token.tag_,
            'dep': token.dep_,
            'shape': token.shape_,
            'is_alpha': token.is_alpha,
            'is_stop': token.is_stop
        })
    return features

# Apply spaCy on each commentary row to extract features
data['features'] = data['commentary'].apply(lambda x: extract_features(nlp(x)))
data

Unnamed: 0,ball,score,commentary,features
0,19.2,W,"Parag to Madushanka, OUTParag closes it out wi...","[{'text': 'Parag', 'lemma': 'Parag', 'pos': 'P..."
1,19.1,W,"Parag to Theekshana, OUTParag helps himself to...","[{'text': 'Parag', 'lemma': 'Parag', 'pos': 'P..."
2,18.6,â€¢,"Siraj to Asitha Fernando, no runmighty full on...","[{'text': 'Siraj', 'lemma': 'Siraj', 'pos': 'P..."
3,18.5,W,"Siraj to Pathirana, OUTon a length and slantin...","[{'text': 'Siraj', 'lemma': 'Siraj', 'pos': 'P..."
4,18.4,4,"Siraj to Pathirana, 4 runsPathirana gets a ta...","[{'text': 'Siraj', 'lemma': 'Siraj', 'pos': 'P..."
...,...,...,...,...
113,0.5,1,"Arshdeep Singh to Nissanka, 1 runmistimes a d...","[{'text': 'Arshdeep', 'lemma': 'arshdeep', 'po..."
114,0.4,4,"Arshdeep Singh to Nissanka, FOUR runslovely ha...","[{'text': 'Arshdeep', 'lemma': 'arshdeep', 'po..."
115,0.3,4,"Arshdeep Singh to Nissanka, FOUR runsright in ...","[{'text': 'Arshdeep', 'lemma': 'arshdeep', 'po..."
116,0.2,â€¢,"Arshdeep Singh to Nissanka, no runslants a len...","[{'text': 'Arshdeep', 'lemma': 'arshdeep', 'po..."


In [15]:
# Here, we will add a dummy 'Label' column as the CSV does not contain labels.
# In real use, you would replace this with the actual labels.
data['Label'] = 'O'  # 'O' is often used to denote 'Outside' of a named entity


In [26]:
# Step 6: Prepare Data for CRF Model
def prepare_data_for_crf(data):
    X = []
    y = []

    for index, row in data.iterrows():
        # Extract features for each commentary
        doc_features = []
        doc_labels = []

        for token_feature in row['features']:
            # Custom features, e.g., POS tag, is alpha, etc.
            doc_features.append({
                'pos': token_feature['pos'],
                'is_alpha': token_feature['is_alpha'],
                'is_stop': token_feature['is_stop']
            })
            # Use placeholder label, replace with actual labels as needed
            doc_labels.append(row['Label'])

        X.append(doc_features)
        y.append(doc_labels)

    return X, y


X, y = prepare_data_for_crf(data)
y

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [20]:
crf = CRF(
    algorithm='lbfgs',  # Optimization algorithm
    c1=0.1,  # Coefficient for L1 regularization
    c2=0.1,  # Coefficient for L2 regularization
    max_iterations=100,
    all_possible_transitions=True
)

# Train CRF model
crf.fit(X_train, y_train)

In [21]:
# Step 10: Evaluate the CRF Model
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, digits=3))


              precision    recall  f1-score   support

           O      1.000     1.000     1.000       785

    accuracy                          1.000       785
   macro avg      1.000     1.000     1.000       785
weighted avg      1.000     1.000     1.000       785



In [22]:
new_commentary = "Smith bowls a bouncer, and Kohli hooks it for a six."
new_commentary_doc = nlp(new_commentary)
new_features = extract_features(new_commentary_doc)
X_new = [[{
    'pos': token_feature['pos'],
    'is_alpha': token_feature['is_alpha'],
    'is_stop': token_feature['is_stop']
} for token_feature in new_features]]

# Predict labels using the trained CRF model
y_new_pred = crf.predict(X_new)
print("Predicted Labels:", y_new_pred)

Predicted Labels: [['O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O']]


In [23]:
# Step 12: Save and Load the Model (Optional)
import pickle
with open('crf_model.pkl', 'wb') as f:
    pickle.dump(crf, f)
with open('crf_model.pkl', 'rb') as f:
    crf_loaded = pickle.load(f)