In [1]:
import numpy as np
import pandas as pd 
import os 

In [2]:
# Location of the NYC wastewater SARS-CoV-2 export, given relative to the
# directory the notebook is run from.
data_path = '../dataset/SARS-CoV-2_concentrations_measured_in_NYC_Wastewater_20240403.csv'
# Load the CSV with pandas defaults: no index column is designated and no
# date parsing is requested, so the date columns are not converted here.
# NOTE(review): the rendered frame shows an 'Unnamed: 0' column, presumably an
# index written out by an earlier export -- confirm whether it should be read
# with index_col=0.
df = pd.read_csv(data_path)
# Bare last expression so the notebook renders the frame below the cell.
df

Unnamed: 0,Sample Date,Test date,WRRF Name,WRRF Abbreviation,Concentration SARS-CoV-2 gene target (N1 Copies/L),Per capita SARS-CoV-2 load (N1 copies per day per population),Annotation,"Population Served, estimated",Technology
0,08/31/2020,09/01/2020,26th Ward,26W,389.0,264000.0,Concentration below Method Limit of Quantifica...,290608,RT-qPCR
1,08/31/2020,09/01/2020,Bowery Bay,BB,1204.0,444000.0,,924695,RT-qPCR
2,08/31/2020,09/01/2020,Coney Island,CI,304.0,169000.0,Concentration below Method Limit of Quantifica...,682342,RT-qPCR
3,08/31/2020,09/01/2020,Hunts Point,HP,940.0,574000.0,,755948,RT-qPCR
4,08/31/2020,09/01/2020,Jamaica Bay,JA,632.0,233000.0,,748737,RT-qPCR
...,...,...,...,...,...,...,...,...,...
4643,02/27/2024,02/28/2024,Port Richmond,PR,98472.0,,,226167,dPCR
4644,02/27/2024,02/28/2024,Red Hook,RH,70464.0,50000000.0,,224029,dPCR
4645,02/27/2024,02/28/2024,Rockaway,RK,28848.0,,,120539,dPCR
4646,02/27/2024,02/28/2024,Tallman Island,TI,67608.0,42100000.0,,449907,dPCR


In [17]:
import re
def generate_abbreviation(name):
    """Build the abbreviation that the naming rule predicts for ``name``.

    Every whitespace-separated word contributes one piece:

    * a word that starts with digits contributes that leading run of digits
      (``'26th'`` -> ``'26'``);
    * any other word contributes its first character, upper-cased
      (``'Ward'`` -> ``'W'``).

    Parameters
    ----------
    name : str
        Full facility name, e.g. ``'26th Ward'``.

    Returns
    -------
    str
        The concatenated pieces, e.g. ``'26W'``. An empty string is returned
        when ``name`` is not a string (``None``, or the float ``NaN`` that
        pandas uses for missing cells) or when it contains no words.
    """
    # Missing cells arrive as NaN/None when this is used with Series.apply;
    # they have no .split(), so report "no abbreviation" instead of raising.
    if not isinstance(name, str):
        return ''

    pieces = []
    for word in name.split():
        # re.match anchors at the start of the word, so a single call both
        # tests for and captures the leading digits.
        leading_digits = re.match(r'\d+', word)
        if leading_digits:
            pieces.append(leading_digits.group())
        else:
            pieces.append(word[0].upper())
    return ''.join(pieces)

def check_abbreviation(full_name, abbreviation):
    """Tell whether ``abbreviation`` is the one derived from ``full_name``.

    The derived form is whatever ``generate_abbreviation(full_name)`` returns;
    the result of comparing it to ``abbreviation`` with ``==`` is returned.
    """
    return generate_abbreviation(full_name) == abbreviation

# Abbreviation that the naming rule predicts for each facility name.
df['Expected Abbreviation'] = df['WRRF Name'].apply(generate_abbreviation)

# Flag rows whose recorded abbreviation matches the predicted one. This is the
# same equality test check_abbreviation performs, done column-wise against the
# column computed above, so each name is not abbreviated a second time and the
# result is always a boolean Series (a row-wise apply on an empty frame is not).
df['Is Correct Abbreviation'] = df['Expected Abbreviation'] == df['WRRF Abbreviation']

correct_count = df['Is Correct Abbreviation'].sum()
total_count = len(df)

# Mismatching rows, narrowed to the columns needed to inspect them. A single
# .loc with the negated mask replaces the `== False` comparison and the
# chained df[...][...] selection.
incorrect_examples = df.loc[
    ~df['Is Correct Abbreviation'],
    ['WRRF Name', 'WRRF Abbreviation', 'Expected Abbreviation'],
]
# Number of mismatching rows per facility.
error_counts = incorrect_examples.groupby('WRRF Name').size()

print(f"Count of incorrect abbreviations by WRRF Name: {error_counts}")
print(f"Total entries: {total_count}")
print(f"Correct abbreviations: {correct_count}")
print(f"Correct abbreviation percentage: {correct_count / total_count * 100:.2f}%")
print("Examples of incorrect abbreviations:")
print(incorrect_examples)

# Persist the frame, including the two columns added above, without the index.
output_path = 'abbreviation_output.csv'
df.to_csv(output_path, index=False)

Count of incorrect abbreviations by WRRF Name: WRRF Name
Jamaica Bay    332
Rockaway       332
dtype: int64
Total entries: 4648
Correct abbreviations: 3984
Correct abbreviation percentage: 85.71%
Examples of incorrect abbreviations:
        WRRF Name WRRF Abbreviation Expected Abbreviation
4     Jamaica Bay                JA                    JB
11       Rockaway                RK                     R
18    Jamaica Bay                JA                    JB
25       Rockaway                RK                     R
32    Jamaica Bay                JA                    JB
...           ...               ...                   ...
4617     Rockaway                RK                     R
4624  Jamaica Bay                JA                    JB
4631     Rockaway                RK                     R
4638  Jamaica Bay                JA                    JB
4645     Rockaway                RK                     R

[664 rows x 3 columns]
