In [1]:
## this code cell is also marked as 'narrative'
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic

import duckdb, sqlalchemy

from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.dummy import DummyClassifier

In [2]:
# Load the labelled dog records, discard the 'Unnamed: 0' column that comes
# back from the CSV (presumably an index written by an earlier to_csv -- TODO
# confirm), and parse the two date columns into datetimes.
clean_new_dogs_df = (
    pd.read_csv('new_dog_data.csv')
    .drop(columns=['Unnamed: 0'])
    .assign(**{
        'Birthdate': lambda frame: pd.to_datetime(frame['Birthdate']),
        'Current Status Date': lambda frame: pd.to_datetime(frame['Current Status Date']),
    })
)
clean_new_dogs_df.head()

Unnamed: 0,Dog Name,Tattoo,Gender,Breed,Color,Birthdate,Dam,Sire,Current Status Date,Status,Pass_Fail
0,Almond,1A322,Female,LR,Y,2022-08-26,Demi - 5D319 - LRYF,Clay - 5C319 - LRYM,2022-10-19,Released,False
1,William,10WW22,Male,LR,Y,2022-08-20,Liza - 5L318 - LRBF,Everett - 1E419 - LRYM,2022-10-21,Released,False
2,Wind,4WW22,Female,LR,B,2022-08-20,Liza - 5L318 - LRBF,Everett - 1E419 - LRYM,2022-10-21,Other School,True
3,Whispy,2WW22,Female,LR,Y,2022-08-20,Liza - 5L318 - LRBF,Everett - 1E419 - LRYM,2022-10-21,Other School,True
4,Vega (Tilly),2VV22,Female,LR,Y,2022-08-06,Bianca - P - 5B317 - LRYF,Clay - 5C319 - LRYM,2022-09-30,Released,False


In [3]:
# Load the dogs that do not have a Pass_Fail outcome yet and discard the
# 'Unnamed: 0' column that comes back from the CSV.
# NOTE(review): unlike clean_new_dogs_df, the date columns are left as read
# from the file here (no pd.to_datetime) -- confirm that is intended.
new_dogs_to_predict_df = (
    pd.read_csv('new_predict_dog_data.csv')
    .drop(columns=['Unnamed: 0'])
)
new_dogs_to_predict_df.head()

Unnamed: 0,Dog Name,Tattoo,Gender,Breed,Color,Birthdate,Dam,Sire,Current Status Date,Status
0,Pearl,7P322,Female,LR,Y,2022-10-31,Orchid - 6O417 - LRYF,Elton - 4EE20 - LRBM,2022-10-31,Too Young to Test
1,Paisley,3P322,Female,LR,B,2022-10-31,Orchid - 6O417 - LRYF,Elton - 4EE20 - LRBM,2022-10-31,Too Young to Test
2,Prairie,1P322,Female,LR,B,2022-10-31,Orchid - 6O417 - LRYF,Elton - 4EE20 - LRBM,2022-10-31,Too Young to Test
3,Pembroke,5P322,Male,LR,Y,2022-10-31,Orchid - 6O417 - LRYF,Elton - 4EE20 - LRBM,2022-10-31,Too Young to Test
4,Petal,2P322,Female,LR,B,2022-10-31,Orchid - 6O417 - LRYF,Elton - 4EE20 - LRBM,2022-10-31,Too Young to Test


# Models

We started playing around with different models that could potentially be used to show patterns or relationships between variables that address our research question.

We were leaning towards a logistic regression model, since our outcome (pass/fail) is binary. This requires creating dummy (indicator) variables for our categorical inputs of interest, so we started doing so for gender and then sire.

In [5]:
# Indicator encoding of Gender with the first level dropped, leaving a single
# 'Male' column.
# dtype is pinned to uint8 (the pandas < 2.0 default) because pandas >= 2.0
# returns booleans by default, which would display True/False instead of 0/1.
gender_dummy = pd.get_dummies(clean_new_dogs_df['Gender'], drop_first=True, dtype='uint8')
#X = pd.concat([X, gender_dummy], axis=1)
gender_dummy

Unnamed: 0,Male
0,0
1,1
2,0
3,0
4,0
...,...
3129,0
3130,1
3131,0
3132,0


In [6]:
#phase 4
# Indicator encoding of Gender keeping every level ('Female' and 'Male').
# This rebinds gender_dummy, replacing the drop_first version built earlier.
# dtype is pinned to uint8 (the pandas < 2.0 default) because pandas >= 2.0
# returns booleans by default; the pin keeps the columns numeric 0/1 for the
# logit fit that uses this frame as its design matrix.
gender_dummy = pd.get_dummies(clean_new_dogs_df['Gender'], dtype='uint8')
#X = pd.concat([X, gender_dummy], axis=1)
gender_dummy

Unnamed: 0,Female,Male
0,1,0
1,0,1
2,1,0
3,1,0
4,1,0
...,...,...
3129,1,0
3130,0,1
3131,1,0
3132,1,0


In [7]:
#https://stackoverflow.com/questions/20703733/logit-regression-and-singular-matrix-error-in-python
# Logit of Pass_Fail on the gender indicator columns. No constant column is
# added: the design matrix is exactly gender_dummy, so each coefficient is
# the log-odds of passing for that gender level.
# Despite the *train names, both arrays are built from the full
# clean_new_dogs_df frame; no split is applied in this cell.
ytrain = clean_new_dogs_df['Pass_Fail']
Xtrain = gender_dummy
results = sm.Logit(endog=ytrain, exog=Xtrain).fit()
results.summary2()

Optimization terminated successfully.
         Current function value: 0.689946
         Iterations 4


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.001
Dependent Variable:,Pass_Fail,AIC:,4328.5816
Date:,2022-11-21 19:33,BIC:,4340.6817
No. Observations:,3134,Log-Likelihood:,-2162.3
Df Model:,1,LL-Null:,-2163.6
Df Residuals:,3132,LLR p-value:,0.10842
Converged:,1.0000,Scale:,1.0
No. Iterations:,4.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Female,-0.2102,0.0521,-4.0340,0.0001,-0.3123,-0.1081
Male,-0.0950,0.0494,-1.9230,0.0545,-0.1917,0.0018


In [8]:
#phase 4
# Indicator encoding of Breed keeping every level.
# dtype is pinned to uint8 (the pandas < 2.0 default) because pandas >= 2.0
# returns booleans by default; the pin keeps the columns numeric 0/1 for the
# logit fit that uses this frame as its design matrix.
breed_dummy = pd.get_dummies(clean_new_dogs_df['Breed'], dtype='uint8')
#X = pd.concat([X, gender_dummy], axis=1)
breed_dummy

Unnamed: 0,GS,LR
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
3129,0,1
3130,0,1
3131,0,1
3132,0,1


In [9]:
#https://stackoverflow.com/questions/20703733/logit-regression-and-singular-matrix-error-in-python
# Logit of Pass_Fail on the breed indicator columns. No constant column is
# added: the design matrix is exactly breed_dummy.
# This rebinds Xtrain, ytrain and results, replacing the gender fit.
ytrain = clean_new_dogs_df['Pass_Fail']
Xtrain = breed_dummy
results = sm.Logit(endog=ytrain, exog=Xtrain).fit()
results.summary2()

Optimization terminated successfully.
         Current function value: 0.690354
         Iterations 3


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.0
Dependent Variable:,Pass_Fail,AIC:,4331.1361
Date:,2022-11-21 19:33,BIC:,4343.2363
No. Observations:,3134,Log-Likelihood:,-2163.6
Df Model:,1,LL-Null:,-2163.6
Df Residuals:,3132,LLR p-value:,0.88071
Converged:,1.0000,Scale:,1.0
No. Iterations:,3.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
GS,-0.1335,0.1129,-1.1823,0.2371,-0.3549,0.0878
LR,-0.1514,0.0378,-4.0079,0.0001,-0.2254,-0.0774


In [10]:
#pass_new_dogs_df = clean_new_dogs_df.loc[clean_new_dogs_df['Pass_Fail']==1]
#pass_new_dogs_df.head()
#pass_new_dogs_df['Dam'].value_counts()

Below is the code for finding the accuracy of the `sm.Logit` model. There's a screenshot in your notes app of what the output should be.

In [11]:
# Accuracy of a fitted logit on dogs_test, using the dam_dummy columns as
# features.
# NOTE(review): results_logit, dogs_test and dam_dummy are not defined in any
# visible cell, so this raises NameError on a fresh kernel -- restore the
# cells that build them before relying on this.
yhat = results_logit.predict(dogs_test[dam_dummy.columns])
# Built-in round() turns each predicted probability into a 0/1 label.
prediction = [round(prob) for prob in yhat]

accuracy_score(dogs_test['Pass_Fail'], prediction)
#logit_train_accuracy = metrics.accuracy_score(dogs_train['Pass_Fail'], training_predictions)
#logit_test_accuracy = metrics.accuracy_score(dogs_test['Pass_Fail'], testing_predictions)

NameError: name 'results_logit' is not defined

In [13]:
# Same accuracy calculation on dogs_test with the dam_dummy columns, stored
# under *_2 names.
# NOTE(review): this calls results_logit, the same model name as the yhat
# cell; results_logit, dogs_test and dam_dummy are not defined in any visible
# cell, so this raises NameError on a fresh kernel.
yhat_2 = results_logit.predict(dogs_test[dam_dummy.columns])
# Built-in round() turns each predicted probability into a 0/1 label.
prediction_2 = [round(prob) for prob in yhat_2]

accuracy_score(dogs_test['Pass_Fail'], prediction_2)

NameError: name 'results_logit' is not defined

Below is the code for finding the accuracy of the `sm.Logit` models for the sire hypotheses. There's a screenshot in your notes app of what the output should be.

In [None]:
# Accuracy of results_logit2 on sire_test, using the sire_dummy columns as
# features.
# NOTE(review): results_logit2, sire_test and sire_dummy are not defined in
# any visible cell -- confirm where they come from.
yhat_2_sire = results_logit2.predict(sire_test[sire_dummy.columns])
# Built-in round() turns each predicted probability into a 0/1 label.
prediction_2_sire = [round(prob) for prob in yhat_2_sire]

accuracy_score(sire_test['Pass_Fail'], prediction_2_sire)

In [None]:
# Accuracy of results_logit on sire_test, using the sire_dummy columns as
# features. Rebinds the yhat / prediction names used by the dam cell.
# NOTE(review): results_logit is the same name the dam cells predict with;
# confirm it has been refit on the sire columns before this runs.
# results_logit, sire_test and sire_dummy are not defined in any visible cell.
yhat = results_logit.predict(sire_test[sire_dummy.columns])
# Built-in round() turns each predicted probability into a 0/1 label.
prediction = [round(prob) for prob in yhat]

accuracy_score(sire_test['Pass_Fail'], prediction)
#logit_train_accuracy = metrics.accuracy_score(dogs_train['Pass_Fail'], training_predictions)
#logit_test_accuracy = metrics.accuracy_score(dogs_test['Pass_Fail'], testing_predictions)

Code from the "No longer important" section

In [14]:
# Keep only the rows where Pass_Fail equals 1 (True), then count how many of
# those dogs each dam produced.
# The former mid-cell `.head()` call was removed: a notebook only displays the
# last expression of a cell, so its result was computed and thrown away.
pass_new_dogs_df = clean_new_dogs_df.loc[clean_new_dogs_df['Pass_Fail']==1]
pass_new_dogs_df['Dam'].value_counts()

PBBSophie - PBBSophie - LRYF *NOP*      22
Rasha - 2R314 - GSB&TF *NOP*            20
Delphi - 7D413 - LRYF                   17
Sugar - 2S413 - LRYF *NOP*              17
Dakota - S - 4D417 - LRBF *NOP*         17
                                        ..
Oats - 7OO17 - LRBF                      1
Camille - 5C417 - LRBF                   1
Elise TSE - ELISE TSE - GSB&TF *NOP*     1
Ona - 6O19 - LRBF                        1
Lace - 8L18 - LRBF                       1
Name: Dam, Length: 239, dtype: int64

I'm leaving the code below here so you remember how to use `predict_proba`.

In [None]:
# Column 1 of predict_proba: the probability assigned to the second class.
# NOTE(review): regression and X are not defined in any visible cell;
# presumably a fitted scikit-learn classifier and its feature matrix --
# confirm before running.
passfail_prediction = regression.predict_proba(X)[:,1]
# .assign returns a new frame. The previous plain `=` only aliased
# clean_new_dogs_df, so adding the column silently modified the original
# frame as well.
clean_new_dogs_copy_df = clean_new_dogs_df.assign(passfail_prediction=passfail_prediction)
# Lowest predicted probability first.
clean_new_dogs_copy_df.sort_values(by=['passfail_prediction'], ascending=True)

Phase II

1. Do we need to make more csv files in our cleaning process, or are the files we currently have sufficient? We could add additional ones before and after dropping certain columns or rows, or after creating dataframes for dummy variables, but we weren't sure whether that is necessary.
2. What other summary statistics can we add if both our variables are categorical? For example, is it necessary to do a heatmap?
3. Should we change the index when cleaning a dataset? We did not for now and we explained the logic why, but we wanted your input.
4. What are your thoughts on our attempt at a regression thus far?
5. We know the data description part is a draft for this phase. Do you think we need to add more detail when we do the final version, or is our current work sufficient?