In [9]:
#load modules etc
import pandas as pd


In [10]:
# load symptoms data from the raw CSV (relative path: resolved against the kernel's working directory)
symptoms_df = pd.read_csv("data/raw/Animal Symptoms.csv")

In [11]:
# load diagnostic data from the raw CSV (holds the Disease_Prediction labels inspected below)
diagnosis_df = pd.read_csv("data/raw/Disease Prediction.csv")

In [12]:
# inspect size of both datasets: number of rows in the symptoms table
# (the diagnosis table is counted in the following cell)
print(symptoms_df.shape[0])

431


In [14]:
# number of rows in the diagnosis table
print(diagnosis_df.shape[0])

431


Both datasets contain 431 rows

In [15]:
# inspect data types: column names, non-null counts and dtype of every symptoms column
symptoms_df.info()

<class 'pandas.DataFrame'>
RangeIndex: 431 entries, 0 to 430
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Animal_Type        431 non-null    str    
 1   Breed              431 non-null    str    
 2   Age                431 non-null    int64  
 3   Gender             431 non-null    str    
 4   Weight             431 non-null    float64
 5   Symptom_1          431 non-null    str    
 6   Symptom_2          431 non-null    str    
 7   Symptom_3          431 non-null    str    
 8   Symptom_4          431 non-null    str    
 9   Duration           431 non-null    str    
 10  Appetite_Loss      431 non-null    str    
 11  Vomiting           431 non-null    str    
 12  Diarrhea           431 non-null    str    
 13  Coughing           431 non-null    str    
 14  Labored_Breathing  431 non-null    str    
 15  Lameness           431 non-null    str    
 16  Skin_Lesions       431 non-null    st

Body_Temperature is stored as a string (str) - it should be a float

In [16]:
# inspect prediction dataframe: number of rows per disease label, most frequent first

diagnosis_df["Disease_Prediction"].value_counts()

Disease_Prediction
Bovine Tuberculosis                15
Bovine Respiratory Disease         14
Equine Influenza                   13
Canine Parvovirus                  12
Caprine Arthritis Encephalitis     12
                                   ..
Goat Pox                            1
Porcine Epidemic Diarrhea Virus     1
Canine Infectious Hepatitis         1
Feline Panleukopenia Virus          1
Porcine Circovirus Disease          1
Name: count, Length: 139, dtype: int64

139 different predictions

In [17]:
# keep the per-label counts in a variable so the next cell can filter them
disease_counts = diagnosis_df["Disease_Prediction"].value_counts()
disease_counts

Disease_Prediction
Bovine Tuberculosis                15
Bovine Respiratory Disease         14
Equine Influenza                   13
Canine Parvovirus                  12
Caprine Arthritis Encephalitis     12
                                   ..
Goat Pox                            1
Porcine Epidemic Diarrhea Virus     1
Canine Infectious Hepatitis         1
Feline Panleukopenia Virus          1
Porcine Circovirus Disease          1
Name: count, Length: 139, dtype: int64

In [20]:
# disease labels that appear exactly once in the diagnosis data
single_cases = disease_counts.loc[disease_counts.eq(1)]

# how many labels have a single example
print(single_cases.size)

single_cases

57


Disease_Prediction
Strangles                                        1
Tick-Borne Disease                               1
Arthritis                                        1
Heartworm Disease                                1
Equine Viral Arteritis                           1
Conjunctivitis                                   1
Equine Piroplasmosis                             1
Chronic Bronchitis                               1
Equine Pneumonia                                 1
Laminitis                                        1
Cryptosporidiosis                                1
Bordetella Infection                             1
Inflammatory Bowel Disease                       1
Degenerative Joint Disease                       1
Bovine Leukemia Virus                            1
Allergic Rhinitis                                1
Bovine Coccidiosis                               1
Feline Renal Disease                             1
Bovine Pneumonia                                 1
Canine Flu  

### Target Label Quality

The disease labels contain multiple inconsistencies, including:
- The same disease represented by different names
- Species-specific and generic labels for the same condition
- Pathogen names mixed with clinical disease names
- Minor formatting differences (hyphens, capitalisation)

These issues will require label standardisation during preprocessing to reduce class fragmentation and improve the models' ability to learn.

In [21]:
# check for missing values: count of null entries in each symptoms column
symptoms_df.isnull().sum()

Animal_Type          0
Breed                0
Age                  0
Gender               0
Weight               0
Symptom_1            0
Symptom_2            0
Symptom_3            0
Symptom_4            0
Duration             0
Appetite_Loss        0
Vomiting             0
Diarrhea             0
Coughing             0
Labored_Breathing    0
Lameness             0
Skin_Lesions         0
Nasal_Discharge      0
Eye_Discharge        0
Body_Temperature     0
Heart_Rate           0
dtype: int64

No missing values

In [22]:
# summary statistics for the Age column
symptoms_df["Age"].describe()

count    431.000000
mean       5.044084
std        2.553685
min        1.000000
25%        3.000000
50%        5.000000
75%        6.000000
max       16.000000
Name: Age, dtype: float64

In [23]:
# summary statistics for the Weight column (all animal types pooled together)
symptoms_df["Weight"].describe()

count    431.000000
mean     214.441067
std      259.594305
min        1.000000
25%        8.000000
50%       80.000000
75%      500.000000
max      850.000000
Name: Weight, dtype: float64

In [24]:
# Body_Temperature is text rather than numeric, so describe() reports count/unique/top/freq
# NOTE(review): the most frequent value prints as "39.2Â°C" - looks like a mis-decoded degree sign;
# confirm the CSV encoding before stripping the unit and converting to float
symptoms_df["Body_Temperature"].describe()

count        431
unique        19
top       39.2Â°C
freq          57
Name: Body_Temperature, dtype: object

In [25]:
# summary statistics for the Heart_Rate column
symptoms_df["Heart_Rate"].describe()

count    431.000000
mean     105.582367
std       27.892936
min       70.000000
25%       80.000000
50%       92.000000
75%      130.000000
max      165.000000
Name: Heart_Rate, dtype: float64

In [26]:
# inspect categorical values: number of records per animal type

symptoms_df["Animal_Type"].value_counts()

Animal_Type
Dog       75
Cat       72
Cow       68
Horse     66
Sheep     39
Goat      39
Pig       38
Rabbit    34
Name: count, dtype: int64

In [27]:
# number of records per gender
symptoms_df["Gender"].value_counts()

Gender
Male      225
Female    206
Name: count, dtype: int64

In [28]:
# number of records per breed
symptoms_df["Breed"].value_counts()

Breed
Jersey               13
Arabian              12
Clydesdale           12
Thoroughbred         11
Maine Coon           11
                     ..
Red Poll              1
English Lop           1
Belted Galloway       1
Yorkshire Terrier     1
Chester White         1
Name: count, Length: 120, dtype: int64

In [29]:
# inspect the symptom flag columns to confirm each one only holds binary values (yes/no, 0/1)

symptom_cols = (
    "Appetite_Loss Vomiting Diarrhea Coughing Labored_Breathing "
    "Lameness Skin_Lesions Nasal_Discharge Eye_Discharge"
).split()

# print the column name followed by its distinct values, one column at a time
for col in symptom_cols:
    print(f"\n{col}:", symptoms_df[col].unique(), sep="\n")


Appetite_Loss:
<StringArray>
['Yes', 'No']
Length: 2, dtype: str

Vomiting:
<StringArray>
['Yes', 'No']
Length: 2, dtype: str

Diarrhea:
<StringArray>
['No', 'Yes']
Length: 2, dtype: str

Coughing:
<StringArray>
['No', 'Yes']
Length: 2, dtype: str

Labored_Breathing:
<StringArray>
['No', 'Yes']
Length: 2, dtype: str

Lameness:
<StringArray>
['No', 'Yes']
Length: 2, dtype: str

Skin_Lesions:
<StringArray>
['No', 'Yes']
Length: 2, dtype: str

Nasal_Discharge:
<StringArray>
['No', 'Yes']
Length: 2, dtype: str

Eye_Discharge:
<StringArray>
['No', 'Yes']
Length: 2, dtype: str


All symptom columns are in an appropriate format for binary encoding

In [30]:
# dataset assumption check: symptoms and diagnoses are paired by row position,
# so both frames must have the same number of rows. Fail loudly on a mismatch
# instead of only displaying False, so a Run All cannot continue on misaligned data.
assert len(symptoms_df) == len(diagnosis_df), (
    f"row count mismatch: {len(symptoms_df)} symptom rows vs "
    f"{len(diagnosis_df)} diagnosis rows"
)

# still display the result of the check as the cell output
len(symptoms_df) == len(diagnosis_df)

True

The dataset does not include a unique case identifier. Here we are going to assume row-wise correspondence between symptom records and diagnoses.