In [1]:
import numpy as np
import chardet
import re

def _read_words(path):
    """Return the lowercased word tokens found in the text file at ``path``."""
    # Read the raw bytes once: they feed both encoding detection and decoding,
    # so the file does not have to be opened a second time in text mode.
    with open(path, 'rb') as text_file:
        raw = text_file.read()

    # chardet reports an encoding of None when it cannot make a guess (for
    # example on an empty file); fall back to UTF-8 instead of relying on an
    # implicit, platform-dependent default.
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'

    # The detected encoding is only a guess, so undecodable bytes are replaced
    # rather than aborting the whole computation.
    text = raw.decode(encoding, errors='replace')

    # r'\w+' keeps runs of letters, digits and underscores, which drops
    # punctuation; lower() makes the comparison case-insensitive.
    return re.findall(r'\w+', text.lower())


def _word_edit_distance(ref_words, hyp_words):
    """Return the word-level Levenshtein distance between two token lists."""
    # previous[j] is the distance between the reference prefix handled so far
    # and hyp_words[:j]; only the previous row is needed to build the next one.
    previous = list(range(len(hyp_words) + 1))
    for i, ref_word in enumerate(ref_words, start=1):
        # Column 0: turning i reference words into nothing costs i deletions.
        current = [i]
        for j, hyp_word in enumerate(hyp_words, start=1):
            if ref_word == hyp_word:
                current.append(previous[j - 1])
            else:
                # Cheapest of deletion, insertion and substitution, plus one edit.
                current.append(1 + min(previous[j], current[j - 1], previous[j - 1]))
        previous = current
    return previous[-1]


def wer(reference, hypothesis):
    """
    Computes the Word Error Rate (WER) between two text files, ignoring case and punctuation.

    Words are the matches of the regular expression ``\\w+`` in the lowercased
    text, so punctuation acts as a separator (e.g. "don't" yields "don" and "t").
    The encoding of each file is guessed with chardet, falling back to UTF-8
    when no guess is available.

    Args:
    - reference (str): path to the reference text file
    - hypothesis (str): path to the hypothesis text file

    Returns:
    - wer (float): the word-level edit distance divided by the number of
      reference words. If the reference contains no words, returns 0.0 when
      the hypothesis is also empty and ``float('inf')`` otherwise.

    Raises:
    - OSError: if either file cannot be opened or read
    """
    ref_words = _read_words(reference)
    hyp_words = _read_words(hypothesis)

    # WER is normalized by the reference length, which is undefined for an
    # empty reference; return explicit values instead of dividing by zero.
    if not ref_words:
        return 0.0 if not hyp_words else float('inf')

    return _word_edit_distance(ref_words, hyp_words) / len(ref_words)


In [None]:
# Paths to the transcripts to compare; fill these in before running the cell.
reference_file = ""
hypothesis_file = ""

# With the empty placeholder paths, wer() would fail when opening the files,
# so only compute the score once both paths have been provided.
if reference_file and hypothesis_file:
    wer_score = wer(reference_file, hypothesis_file)
    print("WER:", wer_score)
else:
    print("Set reference_file and hypothesis_file to text file paths before running this cell.")
