Library Imports

In [2]:
import csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Deficiency Logic (Part 1)

In [3]:
# code for reading a CSV file
def readf(file):
    """Read a UTF-8 CSV file and return all of its rows.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV record; each entry is a list of the
        record's fields as strings. The first record is returned like any
        other row (no header handling is done here).

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
    """
    # `with` closes the handle even if parsing fails; newline="" lets the csv
    # module handle line endings itself so quoted fields containing line
    # breaks are read back unchanged.
    with open(file, encoding="utf-8", newline="") as f:
        return list(csv.reader(f))

In [4]:
# code for formatting deficiency_code
def padding(defcode, width=5):
    """Return a deficiency code as a string left-padded with zeros.

    The original implementation prepended a single "0", which only produced a
    full-width code for 4-character inputs; shorter inputs stayed too short.
    Padding to a fixed width gives the same result for 4- and 5-character
    codes and also handles shorter ones.

    Args:
        defcode: The code, as a string or anything `str()` can convert
            (e.g. an int).
        width: Target length of the padded code. Defaults to 5.

    Returns:
        The code as a string, zero-padded on the left to at least `width`
        characters. Longer inputs are returned unchanged.
    """
    return str(defcode).zfill(width)

In [5]:
# Running totals per (padded) deficiency code, plus the derived mean.
scores = {}   # code -> sum of numeric severity scores
count = {}    # code -> number of annotated rows
average = {}  # code -> mean severity score, rounded to 3 decimals

# Severity label -> ordinal score, so that labels can be averaged.
conversion = {"Not a deficiency": 0, "Low": 1, "Medium": 2, "High": 3}
data = readf("psc_severity_train.csv")

for row in data[1:]:  # skip the first (header) row
    defcode = padding(row[1])
    label = row[4]
    if label == '':
        # Rows without an annotation contribute nothing to the totals.
        continue
    count[defcode] = count.get(defcode, 0) + 1
    scores[defcode] = scores.get(defcode, 0) + conversion[label]

for defcode, total in scores.items():
    average[defcode] = round(total / count[defcode], 3)

In [None]:
# charting
# Plot every individual annotation severity for one deficiency code and
# overlay that code's mean severity (taken from `average`) as a dashed line.
input_code = input("Enter code: ")
# Normalise the user's input once so it matches the padded keys of `average`.
selected_code = padding(input_code.strip())

x = []
y = []

for row in data[1:]:
    # Pad the CSV value as well, so both sides of the comparison use the same
    # format (the aggregation cell pads row[1] before using it as a key).
    if padding(row[1]) != selected_code:
        continue
    # Blank annotations have no entry in `conversion`; skip them exactly as
    # the aggregation cell does instead of failing with a KeyError.
    if row[4] == '':
        continue
    x.append(row[0])
    y.append(conversion[row[4]])

if selected_code not in average:
    # Unknown code (or no annotated rows): report it rather than raising.
    print(f"No annotated rows found for deficiency code {selected_code}")
else:
    # Create the plot
    fig, ax = plt.subplots()
    ax.plot(x, y, marker='o', linestyle='', color='b', label='All')
    ax.axhline(y=average[selected_code], color='r', linestyle='--', linewidth=2, label='Mean')

    ax.set_xticks([])
    # One tick per severity level, including 0 ("Not a deficiency"), which
    # can occur in y but previously had no tick label.
    ax.set_yticks(list(conversion.values()))
    ax.set_yticklabels(list(conversion.keys()))

    # Add labels and title
    ax.set_xlabel('Annotations')
    ax.set_ylabel('Severity')
    ax.set_title(selected_code)

    # Show the legend
    ax.legend()

    # Display the chart
    plt.show()

In [6]:
df = pd.read_csv('psc_severity_train.csv')

# Replace severity labels with their ordinal scores
# (Not a deficiency=0, Low=1, Medium=2, High=3); labels that are not keys of
# `conversion` become NaN.
df['annotation_severity'] = df['annotation_severity'].map(conversion)

# Numeric severities grouped by deficiency code, shared by both summaries.
severity_by_code = df.groupby("deficiency_code")["annotation_severity"]

# Summary statistics for each deficiency code.
describedf = severity_by_code.describe()
print(describedf.head(10))

# Mean and standard deviation per code, the inputs to the consensus rule.
consesusmetrics = severity_by_code.agg(['mean', 'std'])
# 75th percentile of the per-code standard deviations, used as the
# disagreement threshold.
stdev_75p = consesusmetrics['std'].quantile(0.75)

# The consensus severity logic as a function
def consensus_sev(mean, stdev, threshold):
    """Collapse one deficiency code's annotation scores into a single level.

    Levels follow the numeric scale used in this notebook:
    0 = Not a deficiency, 1 = Low, 2 = Medium, 3 = High.

    Args:
        mean: Mean numeric severity of the code's annotations.
        stdev: Standard deviation of those severities.
        threshold: Disagreement cut-off; a `stdev` strictly below it is
            treated as low disagreement.

    Returns:
        The consensus level as a Python int in both branches (the
        high-disagreement branch previously returned a numpy float), or NaN
        when `mean` is NaN so missing data is never reported as a level.
    """
    # A NaN mean compares False against every bound and would otherwise fall
    # through to "High" in the low-disagreement branch.
    if np.isnan(mean):
        return float('nan')

    if stdev < threshold:  # low disagreement: bucket the mean
        if mean <= 0:
            level = 0  # Not a deficiency
        elif mean <= 1:
            level = 1  # Low
        elif mean <= 2:
            level = 2  # Medium
        else:
            level = 3  # High
    else:
        # High disagreement (or NaN stdev, which fails the comparison above):
        # always round up.
        level = np.ceil(mean)

    # NOTE(review): for means on the 0-3 scale the buckets above give the same
    # result as ceil(mean), so `threshold` currently does not change the
    # outcome. If low disagreement was meant to round to the nearest level,
    # the bucket bounds need revisiting - confirm the intended rule.
    return int(level)

# Row-wise adapter for the consensus rule, so it can be passed to apply()
def processbyrow(row):
    """Return the consensus severity for one row of per-code metrics.

    Args:
        row: A mapping with 'mean' and 'std' entries for one deficiency code.

    Returns:
        Whatever `consensus_sev` returns for that mean and standard
        deviation, using the module-level `stdev_75p` as the threshold.
    """
    code_mean = row['mean']
    code_std = row['std']
    return consensus_sev(code_mean, code_std, stdev_75p)

# Evaluate the consensus rule once per deficiency code (one row per code).
consensus_by_code = consesusmetrics.apply(processbyrow, axis=1)
consesusmetrics['consensus_severity'] = consensus_by_code

# Numeric level -> severity label (inverse of the label -> score mapping).
reverse_conversion = {0: "Not a deficiency", 1: "Low", 2: "Medium", 3: "High"}

# Overwrite each row's annotation with its code's consensus level, then turn
# the numeric level back into its text label.
df['annotation_severity'] = (
    df['deficiency_code']
    .map(consesusmetrics['consensus_severity'])
    .map(reverse_conversion)
)

# Output
print(df.head(10))

                 count      mean       std  min  25%  50%  75%  max
deficiency_code                                                    
1101              55.0  1.327273  0.771111  0.0  1.0  1.0  2.0  3.0
1102              40.0  1.900000  1.081310  0.0  1.0  2.0  3.0  3.0
1104              29.0  1.241379  0.576639  1.0  1.0  1.0  1.0  3.0
1105              21.0  2.000000  0.774597  1.0  1.0  2.0  3.0  3.0
1106              24.0  1.416667  0.717282  1.0  1.0  1.0  2.0  3.0
1107              29.0  1.379310  0.621852  1.0  1.0  1.0  2.0  3.0
1108              46.0  1.608696  0.682423  1.0  1.0  1.5  2.0  3.0
1113              35.0  1.314286  0.582663  1.0  1.0  1.0  1.5  3.0
1117              47.0  1.361702  0.605250  1.0  1.0  1.0  2.0  3.0
1119              25.0  1.240000  0.435890  1.0  1.0  1.0  1.0  2.0
   PscInspectionId  deficiency_code  annotation_id username  \
0          1702496             1104       42180251   mihail   
1          1702496             1104       42532116     mar

# Model (Part 2)