# Calculating recall and precision

## Workflow

This analysis follows the workflow shown below:


![workflow](../images/workflow.png)

# Instructions

To run this code, set up a folder structure as follows:
- main_project_folder
    - data
        - data_file 
    - script
        - jupyternotebook
    
       
# Data format

- csv format

- three columns
    - column 1 : words (optional)
    - column 2 : predicted part of speech (column header/name = prediction)
    - column 3 : true part of speech (column header/name = truth)
    
  


In [6]:
# load the pandas package
import pandas as pd

In [7]:
# Read the tagged-word spreadsheet into a pandas DataFrame.
# The relative path matches the folder layout in the instructions above
# (CSV in data/, notebook in the sibling script/ folder); it assumes the
# notebook is run from script/.
csv_data = pd.read_csv("../data/test.csv")

In [8]:
# Print the loaded table to check that the 'prediction' and 'truth'
# columns used by the cells below were read as expected.
print(csv_data)

             pot prediction truth
0        Kudliwa          V     V
1        Umgodla          N     N
2       Ovulweko        REL   REL
3      USkhosana          N     N
4    banoMasango       POSS   ADV
..           ...        ...   ...
220   akhululuma          V     V
221    iinkulumo          N     N
222      zomuntu       POSS  POSS
223        osele        REL   REL
224       asutha        REL   REL

[225 rows x 3 columns]


In [9]:
# Gather every part of speech tag that occurs in either label column
# (truth first, then prediction) so that tags which only ever appear as
# predictions are scored too.
all_pot = pd.concat([csv_data[column] for column in ('truth', 'prediction')])
unique_pot = all_pot.unique()

# Calculating recall

In [10]:
# function for calculating recall
def calculate_recall(truth_tag, prediction_tag, pos_tag):
    """
    Compute recall for a single part of speech tag.

    Recall is TP / (TP + FN): of all tokens whose true tag is ``pos_tag``,
    the fraction that the tagger also labelled ``pos_tag``.

    Parameters:
    truth_tag (iterable): The ground truth POS tags, one per token.
    prediction_tag (iterable): The predicted POS tags, aligned with
        ``truth_tag`` position by position.
    pos_tag (str): The part of speech tag to calculate recall for.

    Returns:
    float: The recall for ``pos_tag``, or 0.0 when the tag never occurs
    in ``truth_tag`` (nothing to recall).
    """
    true_positives = 0
    false_negatives = 0

    # Only tokens whose true tag is pos_tag contribute to recall.
    for actual, predicted in zip(truth_tag, prediction_tag):
        if actual == pos_tag:
            if predicted == pos_tag:
                true_positives += 1
            else:
                false_negatives += 1

    relevant = true_positives + false_negatives
    # Guard against dividing by zero when the tag is absent from the truth.
    return true_positives / relevant if relevant else 0.0

In [11]:
# Report the recall of every tag found in the data, one line per tag
for pot in unique_pot:
    recall = calculate_recall(csv_data['truth'], csv_data['prediction'], pot)
    print(f'{pot}: ', recall)

V:  0.5
N:  0.8444444444444444
REL:  0.7142857142857143
ADV:  0.6111111111111112
POSS:  0.9
CDEM:  1.0
ADJ:  0.5
PRO:  1.0
COP:  0.6666666666666666
CONJ:  0.375
PREP:  0.0
INT:  0.0
IDEO:  0.0


# Calculating precision

In [12]:
# function for calculating precision
def calculate_precision(truth_tag, prediction_tag, pos_tag):
    """
    Compute precision for a single part of speech tag.

    Precision is TP / (TP + FP): of all tokens the tagger labelled
    ``pos_tag``, the fraction whose true tag really is ``pos_tag``.

    Parameters:
    truth_tag (iterable): The ground truth POS tags, one per token.
    prediction_tag (iterable): The predicted POS tags, aligned with
        ``truth_tag`` position by position.
    pos_tag (str): The part of speech tag to calculate precision for.

    Returns:
    float: The precision for ``pos_tag``, or 0.0 when the tagger never
    predicted that tag.
    """
    true_positives = 0
    false_positives = 0

    # Only tokens predicted as pos_tag contribute to precision.
    for actual, predicted in zip(truth_tag, prediction_tag):
        if predicted == pos_tag:
            if actual == pos_tag:
                true_positives += 1
            else:
                false_positives += 1

    predicted_total = true_positives + false_positives
    # Guard against dividing by zero when the tag was never predicted.
    return true_positives / predicted_total if predicted_total else 0.0

In [13]:
# Report the precision of every tag found in the data, one line per tag
for pot in unique_pot:
    precision = calculate_precision(csv_data['truth'], csv_data['prediction'], pot)
    print(f'{pot}: ', precision)

V:  0.75
N:  0.5277777777777778
REL:  0.625
ADV:  0.8461538461538461
POSS:  0.5142857142857142
CDEM:  0.8333333333333334
ADJ:  0.6666666666666666
PRO:  1.0
COP:  1.0
CONJ:  1.0
PREP:  0.0
INT:  0.0
IDEO:  0.0


In [24]:
# Support-weighted precision and recall over all POS tags.
#
# Each tag's precision/recall is weighted by its support (the share of
# tokens whose TRUE tag is that tag) and the weighted values are summed.

# Pull out the two label columns first: `classes` below is derived from
# `truth`, so it must already exist when the cell runs in a fresh kernel.
truth = csv_data['truth']
prediction = csv_data['prediction']

# Get the unique POS tags (classes) from truth; tags that only occur as
# predictions have zero support and would contribute nothing to the sums.
classes = sorted(set(truth))

# Initialize accumulators for the weighted precision and recall
weighted_precision = 0
weighted_recall = 0
total_instances = len(truth)

# Loop through each POS tag (class) and add its weighted contribution
for pot in classes:
    # Iterate the columns pairwise by position rather than with truth[i],
    # which is a label lookup and only works with a default 0..n-1 index.

    # True Positives (TP): truth and prediction are both this class
    tp = sum(1 for t, p in zip(truth, prediction) if t == pot and p == pot)

    # False Positives (FP): predicted as this class, truth is another
    fp = sum(1 for t, p in zip(truth, prediction) if t != pot and p == pot)

    # False Negatives (FN): truth is this class, predicted as another
    fn = sum(1 for t, p in zip(truth, prediction) if t == pot and p != pot)

    # Recall for this class (handling divide by zero)
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    # Precision for this class (handling divide by zero)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0

    # Support = number of true instances of this class. Every such token
    # is either a TP or an FN, so no extra pass over the data is needed.
    support = tp + fn

    # Weight both metrics by the support (proportion of total instances)
    weighted_recall += recall * (support / total_instances)
    weighted_precision += precision * (support / total_instances)

# Output the weighted precision
print(f"Weighted Precision: {weighted_precision:.4f}")

# Output the weighted recall
print(f"Weighted Recall: {weighted_recall:.4f}")

Weighted Precision: 0.7103
Weighted Recall: 0.6711
