In [1]:
import pandas
# Read the tab-separated dog illness records into a DataFrame.
dataset = pandas.read_csv("doggy-illness.csv", sep="\t")

# Leave the frame as the cell's final expression so the notebook renders it.
dataset

Unnamed: 0,male,attended_training,age,body_fat_percentage,core_temperature,ate_at_tonys_steakhouse,needed_intensive_care,protein_content_of_last_meal
0,0,1,6.9,38,38.423169,0,0,7.66
1,0,1,5.4,32,39.015998,0,0,13.36
2,1,1,5.4,12,39.148341,0,0,12.90
3,1,0,4.8,23,39.060049,0,0,13.45
4,1,0,4.8,15,38.655439,0,0,10.53
...,...,...,...,...,...,...,...,...
93,0,0,4.5,38,37.939942,0,0,7.35
94,1,0,1.8,11,38.790426,1,1,12.18
95,0,0,6.6,20,39.489962,0,0,15.84
96,0,0,6.9,32,38.575742,1,1,9.79


In [3]:
import graphing

# Box-and-whisker plots: core temperature split by each 0/1 column.
for flag_column in ("male", "attended_training", "ate_at_tonys_steakhouse"):
    graphing.box_and_whisker(dataset, flag_column, "core_temperature", show=True)

# Scatter plots: core temperature against each of these numeric columns.
for numeric_column in ("body_fat_percentage", "protein_content_of_last_meal"):
    graphing.scatter_2D(dataset, numeric_column, "core_temperature", show=True)

# The final plot is not passed `show`; it is left as the cell's last expression.
graphing.scatter_2D(dataset, "age", "core_temperature")

In [5]:
import statsmodels.formula.api as smf
import graphing

# Fit a separate one-feature linear regression of core temperature on each
# candidate feature, print its R-squared, and plot the data with the fitted line.
for feature in ["male", "age", "protein_content_of_last_meal", "body_fat_percentage"]:
    # Perform linear regression. This method takes care of
    # the entire fitting procedure for us.
    formula = "core_temperature ~ " + feature
    simple_model = smf.ols(formula=formula, data=dataset).fit()

    print(feature)
    print("R-squared:", simple_model.rsquared)

    # Look the coefficients up by label instead of by integer key: the fitted
    # parameters are labelled "Intercept" and <feature>, and integer keys on a
    # string-labelled Series are deprecated as positional lookups in pandas.
    intercept = simple_model.params["Intercept"]
    slope = simple_model.params[feature]

    graphing.scatter_2D(dataset, label_x=feature,
                                 label_y="core_temperature",
                                 title=feature,
                                 trendline=lambda x: slope * x + intercept,
                                 show=True)

male
R-sqaure: 0.0999007443071992


age
R-sqaure: 0.2648116081342463


protein_content_of_last_meal
R-sqaure: 0.9155158150005709


body_fat_percentage
R-sqaure: 0.00020809002637822704


In [17]:
# Compare a fitted age-only model against a "naive" model that ignores age and
# always predicts the mean core temperature.
formula = "core_temperature ~ age"
age_trained_model = smf.ols(formula=formula, data=dataset).fit()
age_naive_model = smf.ols(formula=formula, data=dataset).fit()

# Override the naive model's coefficients: intercept = mean of the label,
# slope = 0. Coefficients are addressed by label ("Intercept", "age") because
# integer keys on a string-labelled Series are deprecated as positional
# lookups in pandas.
# NOTE(review): this relies on assignments to `.params` writing through to the
# fitted results. The recorded output (naive R-squared: 0.0) shows it did in
# the environment this was run in -- confirm after any pandas/statsmodels upgrade.
age_naive_model.params["Intercept"] = dataset['core_temperature'].mean()
age_naive_model.params["age"] = 0

print("naive R-squared:", age_naive_model.rsquared)
print("trained R-squared:", age_trained_model.rsquared)

graphing.scatter_2D(dataset, label_x="age",
                   label_y="core_temperature",
                   title="Naive model",
                   trendline=lambda x: age_naive_model.params["age"]*x + age_naive_model.params["Intercept"],
                   show=True)

# Not passed `show`; left as the cell's last expression.
graphing.scatter_2D(dataset, label_x="age",
                   label_y="core_temperature",
                   title="Trained model",
                   trendline=lambda x: age_trained_model.params["age"]*x + age_trained_model.params["Intercept"])

naive R-squared: 0.0
trained R-squared: 0.2648116081342463


In [38]:
# Regress core temperature on two features at once: age and male.
age_male_ols = smf.ols(formula="core_temperature ~ age + male", data=dataset)
model = age_male_ols.fit()

print("R-squared:", model.rsquared)

R-squared: 0.3148512699768009


In [39]:
import numpy as np
# Show a graph of the result
# this needs to be 3D, because we now have three variables in play: two features and one label

def predict(age, male):
    '''
    Return the model's prediction for a single age / male combination.

    The two values are wrapped in a one-row dataframe with columns
    `age` and `male`, which is handed to `model.predict`; that call's
    result is returned unchanged.
    '''
    single_row = pandas.DataFrame({"age": [age], "male": [male]})
    return model.predict(single_row)

# Build the surface: x runs between the smallest and largest age in the data,
# y takes the two values 0 and 1, and z comes from calling `predict`.
age_extent = np.array([min(dataset["age"]), max(dataset["age"])])
male_levels = np.array([0, 1])

fig = graphing.surface(
    x_values=age_extent,
    y_values=male_levels,
    calc_z=predict,
    axis_title_x="Age",
    axis_title_y="Male",
    axis_title_z="Core temperature",
)

# Overlay the observed datapoints as markers, then display the figure.
fig.add_scatter3d(
    x=dataset["age"],
    y=dataset["male"],
    z=dataset["core_temperature"],
    mode="markers",
)
fig.show()

In [40]:
# Render the regression summary table for `model` (coefficients, standard
# errors, R-squared and related diagnostics) as this cell's output.
model.summary()

0,1,2,3
Dep. Variable:,core_temperature,R-squared:,0.315
Model:,OLS,Adj. R-squared:,0.3
Method:,Least Squares,F-statistic:,21.83
Date:,"Tue, 14 Jun 2022",Prob (F-statistic):,1.58e-08
Time:,12:53:40,Log-Likelihood:,-85.295
No. Observations:,98,AIC:,176.6
Df Residuals:,95,BIC:,184.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,37.9793,0.135,282.094,0.000,37.712,38.247
age,0.1406,0.026,5.459,0.000,0.089,0.192
male,0.3182,0.121,2.634,0.010,0.078,0.558

0,1,2,3
Omnibus:,21.61,Durbin-Watson:,2.369
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5.227
Skew:,0.121,Prob(JB):,0.0733
Kurtosis:,1.895,Cond. No.,12.9


In [31]:
# Render the regression summary table for `age_trained_model` as this cell's
# output, so its coefficients and R-squared can be read off directly.
age_trained_model.summary()

0,1,2,3
Dep. Variable:,core_temperature,R-squared:,0.265
Model:,OLS,Adj. R-squared:,0.257
Method:,Least Squares,F-statistic:,34.58
Date:,"Tue, 14 Jun 2022",Prob (F-statistic):,5.94e-08
Time:,12:31:16,Log-Likelihood:,-88.749
No. Observations:,98,AIC:,181.5
Df Residuals:,96,BIC:,186.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,38.0879,0.132,288.373,0.000,37.826,38.350
age,0.1533,0.026,5.880,0.000,0.102,0.205

0,1,2,3
Omnibus:,43.487,Durbin-Watson:,2.492
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6.605
Skew:,0.087,Prob(JB):,0.0368
Kurtosis:,1.74,Cond. No.,11.3
