In [1]:
import numpy as np
import pandas as pd

# Read the raw diagnoses table from the CSV file into `df`, then display it.
df = pd.read_csv(filepath_or_buffer="diagnoses.csv")
df

Unnamed: 0,diagnosis_id,diagnosis_code,description
0,1,D005,COVID-19
1,2,D004,Hypertension
2,3,D002,Flu
3,4,D004,COVID-19
4,5,D002,COVID-19
...,...,...,...
121,61,D005,Asthma
122,117,D003,Asthma
123,19,D002,Diabetes
124,5,D002,COVID-19


In [2]:
# Remove rows that repeat an earlier row in every column (the index is not
# part of the comparison). The explicit .copy() makes the result an
# independent frame, so the `df.loc[:, ...] = ...` assignments in later cells
# do not write into an object pandas may flag with SettingWithCopyWarning.
df = df.drop_duplicates().copy()
df

Unnamed: 0,diagnosis_id,diagnosis_code,description
0,1,D005,COVID-19
1,2,D004,Hypertension
2,3,D002,Flu
3,4,D004,COVID-19
4,5,D002,COVID-19
...,...,...,...
115,116,D005,Asthma
116,117,D003,Asthma
117,118,D005,Flu
118,119,D003,Flu


In [3]:
from sklearn.impute import SimpleImputer

# Fill missing diagnosis codes with the most frequent code in the column.
# The imputer is fitted and applied in two explicit steps on the same
# single-column frame.
code_imputer = SimpleImputer(strategy="most_frequent")
code_imputer.fit(df[["diagnosis_code"]])
df.loc[:, "diagnosis_code"] = code_imputer.transform(df[["diagnosis_code"]])
df

Unnamed: 0,diagnosis_id,diagnosis_code,description
0,1,D005,COVID-19
1,2,D004,Hypertension
2,3,D002,Flu
3,4,D004,COVID-19
4,5,D002,COVID-19
...,...,...,...
115,116,D005,Asthma
116,117,D003,Asthma
117,118,D005,Flu
118,119,D003,Flu


In [4]:
# Fill missing descriptions with the most frequent description in the column.
# `fit` returns the fitted imputer, which is then applied to the same
# single-column frame.
desc_imputer = SimpleImputer(strategy="most_frequent").fit(df[["description"]])
df.loc[:, "description"] = desc_imputer.transform(df[["description"]])
df

Unnamed: 0,diagnosis_id,diagnosis_code,description
0,1,D005,COVID-19
1,2,D004,Hypertension
2,3,D002,Flu
3,4,D004,COVID-19
4,5,D002,COVID-19
...,...,...,...
115,116,D005,Asthma
116,117,D003,Asthma
117,118,D005,Flu
118,119,D003,Flu


In [5]:
# Keep only rows whose diagnosis code is not the literal "XXX".
# The explicit .copy() makes the filtered result an independent frame, so the
# `df.loc[:, ...] = ...` assignments in later cells do not write into an
# object pandas may flag with SettingWithCopyWarning.
# NOTE(review): the most-frequent imputation of diagnosis_code runs before
# this filter, so "XXX" values take part in that frequency count -- confirm
# this ordering is intended.
df = df.loc[df["diagnosis_code"] != "XXX"].copy()
df

Unnamed: 0,diagnosis_id,diagnosis_code,description
0,1,D005,COVID-19
1,2,D004,Hypertension
2,3,D002,Flu
3,4,D004,COVID-19
4,5,D002,COVID-19
...,...,...,...
115,116,D005,Asthma
116,117,D003,Asthma
117,118,D005,Flu
118,119,D003,Flu


In [6]:
# Normalize description text: trim surrounding whitespace, then title-case
# each value (e.g. "COVID-19" becomes "Covid-19").
df.loc[:, "description"] = (
    df["description"]
    .str.strip()
    .str.title()
)
df

Unnamed: 0,diagnosis_id,diagnosis_code,description
0,1,D005,Covid-19
1,2,D004,Hypertension
2,3,D002,Flu
3,4,D004,Covid-19
4,5,D002,Covid-19
...,...,...,...
115,116,D005,Asthma
116,117,D003,Asthma
117,118,D005,Flu
118,119,D003,Flu


In [7]:
# Expand the abbreviation "Unk" to "Unknown".
# A plain substring replace would also rewrite the "Unk" inside an existing
# "Unknown" (producing "Unknownnown") and would corrupt the column if this
# cell were run twice. The word-boundary pattern only matches "Unk" as a
# standalone token, which makes the cell idempotent.
df.loc[:, "description"] = df["description"].str.replace(
    r"\bUnk\b", "Unknown", regex=True
)
df

Unnamed: 0,diagnosis_id,diagnosis_code,description
0,1,D005,Covid-19
1,2,D004,Hypertension
2,3,D002,Flu
3,4,D004,Covid-19
4,5,D002,Covid-19
...,...,...,...
115,116,D005,Asthma
116,117,D003,Asthma
117,118,D005,Flu
118,119,D003,Flu


In [8]:
def _most_frequent_code(codes):
    """Return the most frequent value in `codes`, or NaN if there is none.

    `Series.mode()` is computed once; when several values tie, the first
    entry of the mode result is used. An empty mode result (for example a
    group containing only missing values) yields NaN.
    """
    modes = codes.mode()
    if modes.empty:
        return np.nan
    return modes.iloc[0]


# For every description, pick the single most frequent diagnosis code.
# The result is a Series indexed by description.
code_mapping = df.groupby("description")["diagnosis_code"].agg(_most_frequent_code)

In [9]:
# Convert the description -> code Series into a plain dict for use with
# Series.map.
code_map_dict = code_mapping.to_dict(into=dict)

In [10]:
# Show the chosen code for each description (label and Series separated by
# a single space).
print("Diagnosis Code Consistency Mapping:", code_mapping, sep=" ")

Diagnosis Code Consistency Mapping: description
Asthma          D001
Covid-19        D004
Diabetes        D002
Flu             D005
Hypertension    D001
Unknown         D002
Name: diagnosis_code, dtype: object


In [11]:
# Overwrite every row's diagnosis code with the code mapped from its
# description, so each description carries exactly one code.
df.loc[:, "diagnosis_code"] = df["description"].map(
    code_map_dict, na_action=None
)

In [12]:
# Sanity check: list the distinct codes remaining under each description.
print(
    df.groupby(by="description")["diagnosis_code"].unique()
)

description
Asthma          [D001]
Covid-19        [D004]
Diabetes        [D002]
Flu             [D005]
Hypertension    [D001]
Unknown         [D002]
Name: diagnosis_code, dtype: object


In [13]:
# Cast each column to its final dtype: integer ids, string codes and
# string descriptions.
df = df.astype(
    dict(diagnosis_id="int", diagnosis_code="str", description="str")
)

In [17]:
# Print the cleaned frame as plain text under a heading line.
print("Cleaned Dataset:\n", df, sep=" ")

Cleaned Dataset:
      diagnosis_id diagnosis_code   description
0               1           D004      Covid-19
1               2           D001  Hypertension
2               3           D005           Flu
3               4           D004      Covid-19
4               5           D004      Covid-19
..            ...            ...           ...
115           116           D001        Asthma
116           117           D001        Asthma
117           118           D005           Flu
118           119           D005           Flu
119           120           D004      Covid-19

[118 rows x 3 columns]


In [15]:
# Write the cleaned frame to CSV, omitting the index column.
cleaned_diagnoses = "diagnoses_cleaned.csv"
df.to_csv(path_or_buf=cleaned_diagnoses, index=False)