In [11]:
import spacy
from sklearn_crfsuite import CRF, metrics
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [12]:
# Load the preprocessed commentary table; the bare file name is resolved
# against the notebook's current working directory.
data = pd.read_csv("preprocessed_commentary.csv")  # Use the uploaded file path here
# Preview the first rows (the recorded output shows the columns
# ball, score and commentary).
print(data.head())


   ball score                                         commentary
0  19.2     W  Parag to Madushanka, OUTParag closes it out wi...
1  19.1     W  Parag to Theekshana, OUTParag helps himself to...
2  18.6   â€¢  Siraj to Asitha Fernando, no runmighty full on...
3  18.5     W  Siraj to Pathirana, OUTon a length and slantin...
4  18.4     4  Siraj to Pathirana,  4 runsPathirana gets a ta...


In [66]:
# Load the `en_core_web_sm` spaCy pipeline once; the feature-extraction and
# CRF-preparation cells below call this module-level `nlp` object.
nlp = spacy.load("en_core_web_sm")

# Function to extract features using spaCy
def extract_features(doc):
    """Build one feature dictionary per token of a parsed document.

    Args:
        doc: An iterable of token objects exposing ``text``, ``lemma_``,
            ``pos_``, ``tag_``, ``dep_``, ``shape_``, ``is_alpha`` and
            ``is_stop`` (for example a spaCy ``Doc``).

    Returns:
        list[dict]: Feature dicts in token order, keyed by ``text``,
        ``lemma``, ``pos``, ``tag``, ``dep``, ``shape``, ``is_alpha`` and
        ``is_stop``. The two flag values are passed through unchanged
        (they are not converted to strings here).
    """
    def describe(tok):
        # Surface form first, then syntactic annotations, then the flags.
        return dict(
            text=tok.text,
            lemma=tok.lemma_,
            pos=tok.pos_,
            tag=tok.tag_,
            dep=tok.dep_,
            shape=tok.shape_,
            is_alpha=tok.is_alpha,
            is_stop=tok.is_stop,
        )

    return [describe(tok) for tok in doc]

# Parse every commentary string with spaCy and store the resulting list of
# per-token feature dicts in a new `features` column (this mutates `data`).
data['features'] = data['commentary'].apply(lambda x: extract_features(nlp(x)))
# Display the feature list of the row labelled 0 as the cell's output.
data['features'][0]

[{'text': 'Parag',
  'lemma': 'Parag',
  'pos': 'PROPN',
  'tag': 'NNP',
  'dep': 'advcl',
  'shape': 'Xxxxx',
  'is_alpha': True,
  'is_stop': False},
 {'text': 'to',
  'lemma': 'to',
  'pos': 'ADP',
  'tag': 'IN',
  'dep': 'prep',
  'shape': 'xx',
  'is_alpha': True,
  'is_stop': True},
 {'text': 'Madushanka',
  'lemma': 'Madushanka',
  'pos': 'PROPN',
  'tag': 'NNP',
  'dep': 'pobj',
  'shape': 'Xxxxx',
  'is_alpha': True,
  'is_stop': False},
 {'text': ',',
  'lemma': ',',
  'pos': 'PUNCT',
  'tag': ',',
  'dep': 'punct',
  'shape': ',',
  'is_alpha': False,
  'is_stop': False},
 {'text': 'OUTParag',
  'lemma': 'OUTParag',
  'pos': 'PROPN',
  'tag': 'NNP',
  'dep': 'nsubj',
  'shape': 'XXXXxxxx',
  'is_alpha': True,
  'is_stop': False},
 {'text': 'closes',
  'lemma': 'close',
  'pos': 'VERB',
  'tag': 'VBZ',
  'dep': 'ROOT',
  'shape': 'xxxx',
  'is_alpha': True,
  'is_stop': False},
 {'text': 'it',
  'lemma': 'it',
  'pos': 'PRON',
  'tag': 'PRP',
  'dep': 'dobj',
  'shape': 'xx',

In [59]:
def assign_label(token):
    """Assign a rule-based label to a single token.

    The rules are example heuristics and are evaluated in a fixed order, so
    the first match wins:

    1. action verbs            -> ``"Action"``
    2. outcome words           -> ``"Outcome"``
    3. ``PERSON`` entity token -> ``"Bowler"`` when it is the first token of
       the document (``token.i == 0``), otherwise ``"Batsman"``
    4. delivery descriptors    -> ``"Delivery"``
    5. fielding positions      -> ``"Field_Position"``
    6. anything else           -> ``"Other"``

    Word matching is case-insensitive.

    Args:
        token: Object exposing ``text``, ``ent_type_`` and ``i``
            (for example a spaCy ``Token``).

    Returns:
        str: One of the label names listed above.
    """
    word = token.text.lower()

    # Lexicon hits for actions and outcomes take precedence over the NER rules.
    if word in ("hits", "bowls", "flicks", "pulls"):
        return "Action"
    if word in ("six", "four", "wicket", "boundary"):
        return "Outcome"

    # A person entity opening the document is treated as the bowler; any
    # other person entity is treated as the batsman.
    if token.ent_type_ == "PERSON":
        return "Bowler" if token.i == 0 else "Batsman"

    if word in ("fast", "short", "length", "off", "leg", "middle"):
        return "Delivery"
    if word in ("cover", "point", "slip", "midwicket"):
        return "Field_Position"

    return "Other"


In [60]:
# Step 6: Prepare Data for CRF Model

def prepare_data_for_crf(data):
    """Turn the commentary table into parallel CRF inputs and targets.

    Each row's ``commentary`` value is parsed with the notebook-level ``nlp``
    pipeline. Every token of the parsed text contributes one feature dict and
    one label obtained from ``assign_label``.

    Args:
        data: Table offering ``iterrows()`` whose rows have a ``commentary``
            entry (a pandas DataFrame in this notebook).

    Returns:
        tuple[list, list]: ``(X, y)`` where ``X[i]`` is the list of per-token
        feature dicts for the i-th row and ``y[i]`` is the equally long list
        of label strings.
    """
    X, y = [], []

    for _, record in data.iterrows():
        parsed = nlp(record['commentary'])

        # The two boolean flags are stored as strings; every other feature
        # is already a string attribute of the token.
        X.append([
            {
                'text': tok.text,
                'lemma': tok.lemma_,
                'pos': tok.pos_,
                'tag': tok.tag_,
                'dep': tok.dep_,
                'shape': tok.shape_,
                'is_alpha': str(tok.is_alpha),
                'is_stop': str(tok.is_stop),
            }
            for tok in parsed
        ])

        # Labels are coerced to str so the target sequences are uniformly typed.
        y.append([str(assign_label(tok)) for tok in parsed])

    return X, y

# Build the per-row token feature sequences (X) and the matching rule-based
# label sequences (y) from the commentary table.
X, y = prepare_data_for_crf(data)


In [61]:
# Split data into training and testing sets. X and y hold one sequence per
# commentary row, so whole rows (not individual tokens) are assigned to either
# side; 20% are held out and random_state is fixed so the split is repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [62]:

# Train CRF model
from sklearn_crfsuite import CRF

# L-BFGS optimisation with c1 / c2 as the L1 / L2 regularisation coefficients
# and at most 100 iterations. all_possible_transitions=True also generates
# transition features for label pairs that never occur in the training data.
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

In [63]:
# Evaluate the CRF Model: predict label sequences for the held-out rows and
# print a flattened (token-level) precision / recall / F1 report.
# NOTE(review): y_test is produced by the same rule-based assign_label() that
# generated the training labels, so these scores measure how well the CRF
# reproduces those rules, not agreement with hand-annotated data.
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, digits=3))


                precision    recall  f1-score   support

        Action      1.000     1.000     1.000         1
       Batsman      0.907     0.817     0.860        60
        Bowler      1.000     0.846     0.917        13
      Delivery      1.000     1.000     1.000        33
Field_Position      1.000     1.000     1.000         8
         Other      0.976     0.992     0.984       661
       Outcome      1.000     0.667     0.800         9

      accuracy                          0.973       785
     macro avg      0.983     0.903     0.937       785
  weighted avg      0.973     0.973     0.972       785



In [64]:
# Test on new commentary
new_commentary = "Smith bowls a bouncer, and Kohli hooks it for a six."
new_commentary_doc = nlp(new_commentary)
new_features = extract_features(new_commentary_doc)

# The CRF was fitted on eight features per token (text, lemma, pos, tag, dep,
# shape, is_alpha, is_stop) with the two flags stored as strings. Supplying
# only a subset at prediction time withholds the lemma/tag/dep/shape evidence
# the model learned from, so mirror the training representation exactly:
# keep every key produced by extract_features and stringify boolean values.
X_new = [[
    {
        name: str(value) if isinstance(value, bool) else value
        for name, value in token_feature.items()
    }
    for token_feature in new_features
]]

# Predict labels using the trained CRF model
y_new_pred = crf.predict(X_new)
print("Predicted Labels:", y_new_pred)

Predicted Labels: [['Other' 'Other' 'Other' 'Other' 'Other' 'Other' 'Other' 'Other' 'Other'
  'Other' 'Other' 'Other' 'Other']]


In [65]:
# Step 12: Save and Load the Model (Optional)
import pickle
# Serialise the fitted CRF to crf_model.pkl in the current working directory.
with open('crf_model.pkl', 'wb') as f:
    pickle.dump(crf, f)
# Load the file back into a separate variable (crf_loaded).
# Unpickling can execute arbitrary code, so only load pickle files you trust.
with open('crf_model.pkl', 'rb') as f:
    crf_loaded = pickle.load(f)

In [68]:
import spacy

# (Re)load the spaCy pipeline used for this entity-recognition demo
nlp = spacy.load("en_core_web_sm")

# Example sentence in which a player's name is tagged with the wrong entity type
doc = nlp("Virat Kohli scored a century for Royal Challengers Bangalore.")

# Names that should be treated as people even when the model tags them ORG
known_players = ("Virat Kohli", "MS Dhoni", "Sachin Tendulkar")

for ent in doc.ents:
    mislabelled_player = ent.label_ == "ORG" and ent.text in known_players
    if not mislabelled_player:
        # Entity is left as the model labelled it; just report it.
        print(f"Entity: {ent.text}, Label: {ent.label_}")
    else:
        # Only the intended correction is reported here; the entity label
        # on `doc` itself is not modified.
        print(f"Correcting entity: {ent.text} from {ent.label_} to PERSON")


Correcting entity: Virat Kohli from ORG to PERSON
Entity: a century, Label: DATE
Entity: Royal Challengers Bangalore, Label: ORG


In [70]:

# Parse a second example sentence and list the entity type attached to each
# token; tokens that are not part of any entity print an empty type.
doc = nlp("Virat Kohli plays for Royal Challengers Bangalore in IPL 2021.")

for token in doc:
    print(f"Token: {token.text}, Entity Type: {token.ent_type_}")


Token: Virat, Entity Type: ORG
Token: Kohli, Entity Type: ORG
Token: plays, Entity Type: 
Token: for, Entity Type: 
Token: Royal, Entity Type: ORG
Token: Challengers, Entity Type: ORG
Token: Bangalore, Entity Type: ORG
Token: in, Entity Type: 
Token: IPL, Entity Type: 
Token: 2021, Entity Type: DATE
Token: ., Entity Type: 
