In [38]:
#Import required packages
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler


### Data Import

In [39]:
#Load the cleaned CSV and key every row by PatientID; set_index moves the
#column into the index, so no separate column removal is needed
data_df = pd.read_csv("canine_cushings_disease_clean.csv").set_index('PatientID')

#Preview the first rows of the loaded frame
data_df.head()

Unnamed: 0_level_0,Diagnosis,Breed,Age,Sex,Neutered,WeightChange,Weight,Hospitalised,Vomiting,Diarrhoea,...,Neurological,Polydipsia,Polyuria,Polyphagia,IncreasedALKP,IncreasedATL/GGT/AST,Hypertensive,LowBloodSugar,Hypothyroid,CruciateSurgery
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,Jack Russell Terrier,12,M,True,No change,13.0,False,False,False,...,False,True,True,False,True,True,False,False,False,False
2,1,Bichon Frise,12,F,True,No change,9.3,False,False,False,...,False,True,True,False,True,True,False,False,False,False
3,1,West Highland White Terrier,13,F,True,No change,9.82,False,False,False,...,False,True,True,True,True,True,False,False,False,False
4,1,Cocker Spaniel,12,F,True,No change,11.3,True,False,False,...,False,True,True,False,,,False,False,False,False
5,1,Crossbreed,9,F,False,No change,9.3,False,False,True,...,False,True,True,False,True,True,False,False,False,False


### Data Cleaning

In [40]:
#Check the data type pandas inferred for every column
data_df.dtypes

Diagnosis                  int64
Breed                     object
Age                        int64
Sex                       object
Neutered                    bool
WeightChange              object
Weight                   float64
Hospitalised                bool
Vomiting                    bool
Diarrhoea                   bool
Hepatomegaly                bool
ThinDrySkin                 bool
Alopecia                    bool
Pruritus                    bool
MuscleWastageWeakness       bool
Lethargy                    bool
Panting                     bool
Neurological                bool
Polydipsia                  bool
Polyuria                    bool
Polyphagia                  bool
IncreasedALKP             object
IncreasedATL/GGT/AST      object
Hypertensive                bool
LowBloodSugar               bool
Hypothyroid                 bool
CruciateSurgery             bool
dtype: object

### Breed Binning

In [41]:
#Bin rare breeds: every breed with fewer than BREED_MIN_COUNT records is
#relabelled "Others" so that it does not get its own sparse dummy column
BREED_MIN_COUNT = 15

#Count records per breed once and reuse the result for the cutoff filter
breed_counts = data_df['Breed'].value_counts()
breeds_to_replace = breed_counts[breed_counts < BREED_MIN_COUNT].index.tolist()

#Relabel all rare breeds in a single vectorised pass instead of issuing one
#replace() call per breed; rows whose breed is not in the list are untouched
is_rare_breed = data_df['Breed'].isin(breeds_to_replace)
data_df['Breed'] = data_df['Breed'].mask(is_rare_breed, "Others")

#Check to make sure binning was successful
data_df['Breed'].value_counts()

Crossbreed                       250
Others                           249
Jack Russell Terrier              94
Staffordshire Bull Terrier        74
Bichon Frise                      70
West Highland White Terrier       63
Labrador Retriever                55
Yorkshire Terrier                 52
Border Terrier                    50
Schnauzer                         41
Cocker Spaniel                    31
Shih Tzu                          26
Cavalier King Charles Spaniel     22
Collie                            21
Boxer                             17
Poodle                            16
Springer Spaniel                  15
Chihuahua                         15
Name: Breed, dtype: int64

### Encoding Features with NA values

In [42]:
#Check categories in the ALKP variable (value_counts leaves missing values out by default)
data_df['IncreasedALKP'].value_counts()

True     581
False     76
Name: IncreasedALKP, dtype: int64

In [43]:
#Treat a missing ALKP result as its own category by filling the gaps with
#the string label 'N/A'
data_df['IncreasedALKP'] = data_df['IncreasedALKP'].fillna('N/A')

#One-hot encode the ALKP categories into one indicator column each
alkp_df = pd.get_dummies(data_df.loc[:, 'IncreasedALKP'])

#Keep the generated column labels so they can be renamed afterwards
alkp_cols = alkp_df.columns.to_numpy()
alkp_cols

array([False, True, 'N/A'], dtype=object)

In [44]:
#Give the ALKP dummy columns descriptive names. The mapping is keyed by the
#actual category labels rather than by column position, so a category that
#happens to be absent from the data cannot shift the names onto the wrong
#columns; labels missing from the frame are simply ignored by rename().
alkp_col_names = {False: 'ALKPFalse', True: 'ALKPTrue', 'N/A': 'ALKPN/A'}
alkp_df = alkp_df.rename(columns=alkp_col_names)

#Check df
alkp_df.head()

Unnamed: 0_level_0,ALKPFalse,ALKPTrue,ALKPN/A
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,1,0
2,0,1,0
3,0,1,0
4,0,0,1
5,0,1,0


In [45]:
#Check categories in the ATL/GGT/AST variable (value_counts leaves missing values out by default)
data_df['IncreasedATL/GGT/AST'].value_counts()

True     420
False    143
Name: IncreasedATL/GGT/AST, dtype: int64

In [46]:
#Treat a missing ATL/GGT/AST result as its own category by filling the gaps
#with the string label 'N/A'
data_df['IncreasedATL/GGT/AST'] = data_df['IncreasedATL/GGT/AST'].fillna('N/A')

#One-hot encode the ATL/GGT/AST categories into one indicator column each
atl_df = pd.get_dummies(data_df.loc[:, 'IncreasedATL/GGT/AST'])

#Keep the generated column labels so they can be renamed afterwards
atl_cols = atl_df.columns.to_numpy()
atl_cols


array([False, True, 'N/A'], dtype=object)

In [47]:
#Give the ATL/GGT/AST dummy columns descriptive names. The mapping is keyed by
#the actual category labels rather than by column position, so a category
#that happens to be absent from the data cannot shift the names onto the
#wrong columns; labels missing from the frame are simply ignored by rename().
#NOTE(review): 'ATLPN/A' looks like a copy-paste of the ALKP name (probably
#meant 'ATLN/A'); it is kept unchanged because it becomes a feature name of
#the trained model - confirm nothing downstream relies on it before renaming.
atl_col_names = {False: 'ATLFalse', True: 'ATLTrue', 'N/A': 'ATLPN/A'}
atl_df = atl_df.rename(columns=atl_col_names)

#Check df
atl_df.head()

Unnamed: 0_level_0,ATLFalse,ATLTrue,ATLPN/A
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,1,0
2,0,1,0
3,0,1,0
4,0,0,1
5,0,1,0


### Scaling and Encoding Feature Values

In [49]:
#Create list of numerical column names
num_columns = ['Age', 'Weight']

#Keep the fitted scaler in a variable instead of discarding it, so the exact
#same transform (same mean/variance) can later be applied to new records
#NOTE(review): the scaler is fitted on every row before the train/test split
#further down, so test rows influence the scaling statistics - consider
#fitting on the training rows only.
num_scaler = StandardScaler()

#Standard Scale the numerical values
scaled_num_cols = num_scaler.fit_transform(data_df[num_columns])

#Check scaled data
scaled_num_cols

array([[ 0.70799614, -0.32233216],
       [ 0.70799614, -0.64873217],
       [ 1.05526421, -0.60285974],
       ...,
       [ 0.70799614,  0.55983001],
       [ 1.74980035, -0.63991055],
       [ 1.40253228,  0.07464082]])

In [50]:
#Wrap the scaled array in a DataFrame that carries the original column names
#and reuses the PatientID index of data_df, so it lines up row-for-row with
#the other feature frames
scaled_num_cols_df = pd.DataFrame(
    data=scaled_num_cols,
    index=data_df.index,
    columns=num_columns,
)

#Preview the scaled numerical features
scaled_num_cols_df.head()

Unnamed: 0_level_0,Age,Weight
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.707996,-0.322332
2,0.707996,-0.648732
3,1.055264,-0.60286
4,0.707996,-0.4723
5,-0.333808,-0.648732


In [51]:
#Collect every boolean column and append the Diagnosis target as the last
#column; assign() builds a new frame rather than writing into a frame that
#was derived from data_df
bool_df = data_df.select_dtypes(include='bool').assign(Diagnosis=data_df['Diagnosis'])

#Preview the boolean features plus target
bool_df.head()

Unnamed: 0_level_0,Neutered,Hospitalised,Vomiting,Diarrhoea,Hepatomegaly,ThinDrySkin,Alopecia,Pruritus,MuscleWastageWeakness,Lethargy,Panting,Neurological,Polydipsia,Polyuria,Polyphagia,Hypertensive,LowBloodSugar,Hypothyroid,CruciateSurgery,Diagnosis
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,True,False,False,False,False,True,False,False,False,False,False,False,True,True,False,False,False,False,False,1
2,True,False,False,False,False,False,False,False,False,False,True,False,True,True,False,False,False,False,False,1
3,True,False,False,False,True,True,False,False,False,False,False,False,True,True,True,False,False,False,False,1
4,True,True,False,False,True,False,False,False,False,False,True,False,True,True,False,False,False,False,False,1
5,False,False,False,True,True,False,False,False,False,True,False,False,True,True,False,False,False,False,False,1


In [52]:
#Encode all boolean variables with 1 = True, 0 = False.
#An explicit cast is used instead of replace({True: 1, False: 0}): replace
#depends on pandas silently downcasting the result (deprecated in pandas 2.2),
#and its boolean keys also compare equal to the integers 1 and 0.
#The integer Diagnosis column is left unchanged by the cast.
bool_df = bool_df.astype('int64')

#Check data types
bool_df.dtypes

Neutered                 int64
Hospitalised             int64
Vomiting                 int64
Diarrhoea                int64
Hepatomegaly             int64
ThinDrySkin              int64
Alopecia                 int64
Pruritus                 int64
MuscleWastageWeakness    int64
Lethargy                 int64
Panting                  int64
Neurological             int64
Polydipsia               int64
Polyuria                 int64
Polyphagia               int64
Hypertensive             int64
LowBloodSugar            int64
Hypothyroid              int64
CruciateSurgery          int64
Diagnosis                int64
dtype: object

In [54]:
#Gather the text-valued categorical columns that still need one-hot encoding
object_columns = ['Breed', 'Sex', 'WeightChange']
object_df = data_df.loc[:, object_columns]

#Preview the categorical columns
object_df.head()

Unnamed: 0_level_0,Breed,Sex,WeightChange
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Jack Russell Terrier,M,No change
2,Bichon Frise,F,No change
3,West Highland White Terrier,F,No change
4,Cocker Spaniel,F,No change
5,Crossbreed,F,No change


In [55]:
#One-hot encode Sex: one indicator column per category found in the data
sex_df = pd.get_dummies(data=object_df.loc[:, 'Sex'])

#Preview the Sex indicator columns
sex_df.head()

Unnamed: 0_level_0,F,M
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,1
2,1,0
3,1,0
4,1,0
5,1,0


In [56]:
#One-hot encode WeightChange: one indicator column per category found in the data
weightchange_df = pd.get_dummies(data=object_df.loc[:, 'WeightChange'])

#Preview the WeightChange indicator columns
weightchange_df.head()

Unnamed: 0_level_0,No change,Weight gain,Weight loss
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,1,0,0


In [57]:
#One-hot encode the (already binned) Breed column: one indicator column per
#remaining breed label, including the "Others" bin
breed_df = pd.get_dummies(data=object_df.loc[:, 'Breed'])

#Preview the Breed indicator columns
breed_df.head()

Unnamed: 0_level_0,Bichon Frise,Border Terrier,Boxer,Cavalier King Charles Spaniel,Chihuahua,Cocker Spaniel,Collie,Crossbreed,Jack Russell Terrier,Labrador Retriever,Others,Poodle,Schnauzer,Shih Tzu,Springer Spaniel,Staffordshire Bull Terrier,West Highland White Terrier,Yorkshire Terrier
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [58]:
#Concatenate all prepared feature frames side by side. They share the
#PatientID index, and bool_df goes last so that its Diagnosis column ends up
#as the final column of the combined frame.
feature_frames = [
    scaled_num_cols_df,
    sex_df,
    weightchange_df,
    breed_df,
    alkp_df,
    atl_df,
    bool_df,
]
scaled_df = pd.concat(feature_frames, axis='columns')

#Preview the combined frame
scaled_df.head()

Unnamed: 0_level_0,Age,Weight,F,M,No change,Weight gain,Weight loss,Bichon Frise,Border Terrier,Boxer,...,Panting,Neurological,Polydipsia,Polyuria,Polyphagia,Hypertensive,LowBloodSugar,Hypothyroid,CruciateSurgery,Diagnosis
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.707996,-0.322332,0,1,1,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
2,0.707996,-0.648732,1,0,1,0,0,1,0,0,...,1,0,1,1,0,0,0,0,0,1
3,1.055264,-0.60286,1,0,1,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
4,0.707996,-0.4723,1,0,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,1
5,-0.333808,-0.648732,1,0,1,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1


### Drop NA Values

In [59]:
#NaN summary: number of missing values in each column of the combined frame
scaled_df.isna().sum()

Age                                0
Weight                           127
F                                  0
M                                  0
No change                          0
Weight gain                        0
Weight loss                        0
Bichon Frise                       0
Border Terrier                     0
Boxer                              0
Cavalier King Charles Spaniel      0
Chihuahua                          0
Cocker Spaniel                     0
Collie                             0
Crossbreed                         0
Jack Russell Terrier               0
Labrador Retriever                 0
Others                             0
Poodle                             0
Schnauzer                          0
Shih Tzu                           0
Springer Spaniel                   0
Staffordshire Bull Terrier         0
West Highland White Terrier        0
Yorkshire Terrier                  0
ALKPFalse                          0
ALKPTrue                           0
A

In [60]:
#Keep only the rows that have a recorded Weight (the column the NaN summary
#above shows as containing missing values)
has_weight = scaled_df['Weight'].notna()
scaled_df = scaled_df[has_weight]

#Confirm that no missing values remain
scaled_df.isna().sum()

Age                              0
Weight                           0
F                                0
M                                0
No change                        0
Weight gain                      0
Weight loss                      0
Bichon Frise                     0
Border Terrier                   0
Boxer                            0
Cavalier King Charles Spaniel    0
Chihuahua                        0
Cocker Spaniel                   0
Collie                           0
Crossbreed                       0
Jack Russell Terrier             0
Labrador Retriever               0
Others                           0
Poodle                           0
Schnauzer                        0
Shih Tzu                         0
Springer Spaniel                 0
Staffordshire Bull Terrier       0
West Highland White Terrier      0
Yorkshire Terrier                0
ALKPFalse                        0
ALKPTrue                         0
ALKPN/A                          0
ATLFalse            

In [61]:
#Preview the frame again now that rows with a missing Weight have been removed
scaled_df.head()

Unnamed: 0_level_0,Age,Weight,F,M,No change,Weight gain,Weight loss,Bichon Frise,Border Terrier,Boxer,...,Panting,Neurological,Polydipsia,Polyuria,Polyphagia,Hypertensive,LowBloodSugar,Hypothyroid,CruciateSurgery,Diagnosis
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.707996,-0.322332,0,1,1,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
2,0.707996,-0.648732,1,0,1,0,0,1,0,0,...,1,0,1,1,0,0,0,0,0,1
3,1.055264,-0.60286,1,0,1,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
4,0.707996,-0.4723,1,0,1,0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,1
5,-0.333808,-0.648732,1,0,1,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1


In [62]:
#Number of entries remaining, with the non-null count and dtype of every column
scaled_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1034 entries, 1 to 1161
Data columns (total 51 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Age                            1034 non-null   float64
 1   Weight                         1034 non-null   float64
 2   F                              1034 non-null   uint8  
 3   M                              1034 non-null   uint8  
 4   No change                      1034 non-null   uint8  
 5   Weight gain                    1034 non-null   uint8  
 6   Weight loss                    1034 non-null   uint8  
 7   Bichon Frise                   1034 non-null   uint8  
 8   Border Terrier                 1034 non-null   uint8  
 9   Boxer                          1034 non-null   uint8  
 10  Cavalier King Charles Spaniel  1034 non-null   uint8  
 11  Chihuahua                      1034 non-null   uint8  
 12  Cocker Spaniel                 1034 non-null   u

### Prepare the data

In [63]:
#Save the dependent variable as a one-column DataFrame. It is selected by
#name rather than by position, so it stays correct even if the column order
#of scaled_df changes; the double brackets keep it a DataFrame because later
#cells index it with Y_test['Diagnosis'].
Y = scaled_df[['Diagnosis']]

#Check the first rows of the target
Y[0:5]

Unnamed: 0_level_0,Diagnosis
PatientID,Unnamed: 1_level_1
1,1
2,1
3,1
4,1
5,1


In [64]:
#Save the independent variables: every column except the Diagnosis target.
#The target is dropped by name rather than by position, so the feature set
#stays correct even if the column order of scaled_df changes.
X = scaled_df.drop(columns='Diagnosis')

#Check df
X.head()

Unnamed: 0_level_0,Age,Weight,F,M,No change,Weight gain,Weight loss,Bichon Frise,Border Terrier,Boxer,...,Lethargy,Panting,Neurological,Polydipsia,Polyuria,Polyphagia,Hypertensive,LowBloodSugar,Hypothyroid,CruciateSurgery
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.707996,-0.322332,0,1,1,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
2,0.707996,-0.648732,1,0,1,0,0,1,0,0,...,0,1,0,1,1,0,0,0,0,0
3,1.055264,-0.60286,1,0,1,0,0,0,0,0,...,0,0,0,1,1,1,0,0,0,0
4,0.707996,-0.4723,1,0,1,0,0,0,0,0,...,0,1,0,1,1,0,0,0,0,0
5,-0.333808,-0.648732,1,0,1,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,0


In [65]:
#Hold out 20% of the rows for testing; the fixed random_state makes the
#split reproducible between runs
split_parts = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts

### Initiate and Train the Model

In [66]:
#Initialise the logistic regression model, limited to 200 solver iterations
LR_model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=42)

#Fit training data to the model. Y_train is a one-column DataFrame, so it is
#flattened to a 1-D array first; passing the column vector directly makes
#scikit-learn emit a DataConversionWarning from column_or_1d.
LR_model.fit(X_train, np.ravel(Y_train))

#Mean accuracy of the model on the training and the test rows
training_score = LR_model.score(X_train, np.ravel(Y_train))
test_score = LR_model.score(X_test, np.ravel(Y_test))

print(f"Training Data Score: {training_score}")
print(f"Test Data Score: {test_score}")

Training Data Score: 0.7061668681983071
Test Data Score: 0.7777777777777778


  y = column_or_1d(y, warn=True)


In [67]:
#Class labels the fitted model assigns to the held-out rows
test_predictions = LR_model.predict(X_test)

#Pair every prediction with its true label; the resulting frame takes its
#index from the 'Actual' Series
predict_dict = dict(Prediction=test_predictions, Actual=Y_test["Diagnosis"])
predict_df = pd.DataFrame(data=predict_dict)

#Preview predicted vs. actual values
predict_df.head()

Unnamed: 0_level_0,Prediction,Actual
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1
340,1,1
521,1,1
728,0,0
163,1,1
620,0,0


In [68]:
#Calculate the prediction accuracy: the share of test rows whose predicted label matches the actual Diagnosis
accuracy_score(Y_test['Diagnosis'], test_predictions)

0.7777777777777778

In [69]:
#File that receives the serialised model (relative to the working directory)
model_filename = 'trained_model.pkl'

#Serialise the fitted classifier with pickle; the with-block closes the file
#even if dumping fails. Only LR_model is written to the file.
with open(model_filename, 'wb') as model_file:
    pickle.dump(LR_model, model_file)

In [70]:
#Produce the classification report. The labels are passed explicitly so that
#each display name is tied to its class value (0 -> 'No Disease (0)',
#1 -> 'Disease (1)') instead of relying on whichever labels happen to occur
#in the test data.
target_names = ['No Disease (0)', 'Disease (1)']
print("Classification Report:")
print(classification_report(Y_test, test_predictions, labels=[0, 1], target_names=target_names))

Classification Report:
                precision    recall  f1-score   support

No Disease (0)       0.79      0.76      0.78       106
   Disease (1)       0.76      0.79      0.78       101

      accuracy                           0.78       207
     macro avg       0.78      0.78      0.78       207
  weighted avg       0.78      0.78      0.78       207

