**Aim: compare the mean of the dependent variable for each category against the mean of the dependent variable for the reference category**

In [1]:
# 4th notebook 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the student health records and preview the first rows
student_health = pd.read_csv(filepath_or_buffer="./datasets/student_health.csv")
student_health.head(n=5)

Unnamed: 0,Grade,Gender,Height_cm,Weight_kg
0,First,Male,105,21
1,First,Female,126,25
2,First,Male,126,25
3,First,Male,112,20
4,First,Female,133,32


In [3]:
# Mean height and weight per grade. `numeric_only=True` keeps the text column
# (Gender) out of the aggregation: pandas >= 2.0 raises TypeError instead of
# silently dropping non-numeric columns.
grade_mean = student_health.groupby(by="Grade").mean(numeric_only=True)

grade_mean

Unnamed: 0_level_0,Height_cm,Weight_kg
Grade,Unnamed: 1_level_1,Unnamed: 2_level_1
First,122.357143,26.183673
Second,126.168317,33.148515
Third,134.775701,36.070093


In [4]:
# First grade is used as the reference level later on.

# Manual difference in mean weight: Second grade minus First grade
grade_mean.loc["Second", "Weight_kg"] - grade_mean.loc["First", "Weight_kg"]

6.96484138209739

In [5]:
# Manual difference in mean weight: Third grade minus First grade
grade_mean.loc["Third", "Weight_kg"] - grade_mean.loc["First", "Weight_kg"]

9.88641998855617

***
**Linear regression to check the effect of grade on student weight**

In [6]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
import numpy as np

In [7]:
# Regress weight (the target) on grade; the formula interface dummy-encodes
# Grade by default.
mod = ols(formula="Weight_kg ~ Grade", data=student_health)
res = mod.fit()

# The Grade[T.*] rows in the coef section give the same values as the manual
# mean differences for second and third grade.
res.summary()

0,1,2,3
Dep. Variable:,Weight_kg,R-squared:,0.465
Model:,OLS,Adj. R-squared:,0.463
Method:,Least Squares,F-statistic:,178.3
Date:,"Wed, 06 Jan 2021",Prob (F-statistic):,1.89e-56
Time:,15:45:40,Log-Likelihood:,-1186.2
No. Observations:,413,AIC:,2378.0
Df Residuals:,410,BIC:,2391.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,26.1837,0.434,60.382,0.000,25.331,27.036
Grade[T.Second],6.9648,0.609,11.443,0.000,5.768,8.161
Grade[T.Third],9.8864,0.524,18.882,0.000,8.857,10.916

0,1,2,3
Omnibus:,23.848,Durbin-Watson:,2.854
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25.558
Skew:,0.582,Prob(JB):,2.82e-06
Kurtosis:,2.641,Cond. No.,4.51


***
**Dummy Encoding done explicitly**

In [8]:
# Remove the columns not needed for the grade/weight comparison. Rebinding
# (rather than inplace=True) together with errors="ignore" lets this cell be
# re-run without a KeyError once the columns are already gone.
student_health = student_health.drop(columns=["Gender", "Height_cm"], errors="ignore")

student_health.head()

Unnamed: 0,Grade,Weight_kg
0,First,21
1,First,25
2,First,25
3,First,20
4,First,32


In [9]:
from sklearn.preprocessing import LabelEncoder

# Replace the grade labels with integer codes
grade_encoder = LabelEncoder()
student_health["Grade"] = grade_encoder.fit_transform(student_health["Grade"])

student_health.sample(n=5)

Unnamed: 0,Grade,Weight_kg
153,1,33
360,2,46
361,2,31
128,1,30
404,2,35


In [10]:
# check ordering: the label at position i in classes_ is the one encoded as integer i
grade_encoder.classes_

array(['First', 'Second', 'Third'], dtype=object)

In [11]:
# dummy encoding(treatment encoding) with patsy
from patsy.contrasts import Treatment

In [12]:
# Integer codes standing in for the unique categories of the column to encode
levels = [0, 1, 2]

# Treatment (dummy) coding with the first category as the reference level
contrast_without_intercept_0 = (
    Treatment(reference=0).code_without_intercept(levels)
)

print(contrast_without_intercept_0.matrix)

[[0. 0.]
 [1. 0.]
 [0. 1.]]


In [13]:
# Integer codes standing in for the unique categories of the column to encode
levels = [0, 1, 2]

# Reference is still the first category; coding "with intercept" yields the
# same matrix as one-hot encoding
contrast_with_intercept = (
    Treatment(reference=0).code_with_intercept(levels)
)

print(contrast_with_intercept.matrix)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [14]:
health_data_dummy = student_health  # same object as student_health, not a copy

# Dummy-code Grade: pick the contrast-matrix row for each integer grade code
health_data_contrast = contrast_without_intercept_0.matrix[health_data_dummy["Grade"], :]

# Peek at a slice of the dummy-coded rows
health_data_contrast[90:105]

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [15]:
# DataFrame holding the dummy-coded grade columns. The builtin `int` replaces
# the `np.int` alias, which was deprecated in NumPy 1.20 and removed in 1.24
# (it raises AttributeError on current NumPy).
student_health_contrast_df = pd.DataFrame(
    health_data_contrast, columns=["grade_2", "grade_3"], dtype=int
)

student_health_contrast_df.sample(6)

Unnamed: 0,grade_2,grade_3
29,0,0
371,0,1
318,0,1
167,1,0
195,1,0
254,0,1


In [16]:
# Place the dummy columns next to the original columns
health_data_dummy = pd.concat(
    objs=[health_data_dummy, student_health_contrast_df],
    axis="columns",
)

health_data_dummy.head()

Unnamed: 0,Grade,Weight_kg,grade_2,grade_3
0,0,21,0,0
1,0,25,0,0
2,0,25,0,0
3,0,20,0,0
4,0,32,0,0


In [17]:
# Drop the original Grade column, since only the dummy-encoded columns are
# used from here on. Rebinding with errors="ignore" (instead of inplace=True)
# keeps the cell re-runnable: a second run no longer raises KeyError.
health_data_dummy = health_data_dummy.drop(columns=["Grade"], errors="ignore")

health_data_dummy.sample(5)

Unnamed: 0,Weight_kg,grade_2,grade_3
392,31,0,1
37,25,0,0
313,38,0,1
327,31,0,1
18,25,0,0


In [18]:
# Features: every column except the target. Target: Weight_kg.
X = health_data_dummy.drop(columns="Weight_kg")
y = health_data_dummy["Weight_kg"]

In [19]:
# Linear regression with statsmodels on the explicitly dummy-encoded data

# Add the intercept column to the dummy-encoded predictors
X_with_constant = sm.add_constant(X)

mod = sm.OLS(endog=y, exog=X_with_constant)
res = mod.fit()

# Same coefficients as the earlier formula-based fit that dummy-encoded by default
res.summary()

0,1,2,3
Dep. Variable:,Weight_kg,R-squared:,0.465
Model:,OLS,Adj. R-squared:,0.463
Method:,Least Squares,F-statistic:,178.3
Date:,"Wed, 06 Jan 2021",Prob (F-statistic):,1.89e-56
Time:,15:45:42,Log-Likelihood:,-1186.2
No. Observations:,413,AIC:,2378.0
Df Residuals:,410,BIC:,2391.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,26.1837,0.434,60.382,0.000,25.331,27.036
grade_2,6.9648,0.609,11.443,0.000,5.768,8.161
grade_3,9.8864,0.524,18.882,0.000,8.857,10.916

0,1,2,3
Omnibus:,23.848,Durbin-Watson:,2.854
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25.558
Skew:,0.582,Prob(JB):,2.82e-06
Kurtosis:,2.641,Cond. No.,4.51


In [20]:
# lin reg using linear model
from sklearn.linear_model import LinearRegression

linear_model_dummy = LinearRegression(
    fit_intercept=True,  # fits an intercept; spelled out although it is the default
)

In [21]:
linear_model_dummy.fit(X=X, y=y)

# The score is the same quantity as R-squared in the statsmodels summary
print("Training score: ", linear_model_dummy.score(X=X, y=y))

Training score:  0.465215496110326


In [22]:
# Coefficients and intercept agree with the statsmodels fit; the intercept is
# the mean of the reference level
(linear_model_dummy.coef_, linear_model_dummy.intercept_)

(array([6.96484138, 9.88641999]), 26.18367346938776)

***
**One-hot encoding (all three grade columns, no intercept) to compare its coefficients with treatment coding**

In [23]:
health_data_ohe = student_health  # same object as student_health, not a copy

# The with-intercept contrast matrix serves as the one-hot lookup table
contrast_with_intercept.matrix

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [24]:
# One-hot code Grade: pick the contrast-matrix row for each integer grade code
health_data_contrast = contrast_with_intercept.matrix[health_data_ohe["Grade"], :]

health_data_contrast[90:105]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [25]:
# Wrap the one-hot rows in a DataFrame with one column per grade. The builtin
# `int` replaces the `np.int` alias, which was deprecated in NumPy 1.20 and
# removed in 1.24 (it raises AttributeError on current NumPy).
health_data_contrast = pd.DataFrame(
    health_data_contrast,
    columns=["grade_1", "grade_2", "grade_3"],
    dtype=int,
)
health_data_contrast.sample(5)

Unnamed: 0,grade_1,grade_2,grade_3
250,0,0,1
400,0,0,1
5,1,0,0
296,0,0,1
91,1,0,0


In [26]:
# Place the one-hot columns next to the original columns
health_data_ohe = pd.concat(
    objs=[health_data_ohe, health_data_contrast],
    axis="columns",
)

health_data_ohe.sample(n=5)

Unnamed: 0,Grade,Weight_kg,grade_1,grade_2,grade_3
87,0,33,1,0,0
272,2,39,0,0,1
88,0,21,1,0,0
401,2,31,0,0,1
110,1,30,0,1,0


In [28]:
# Drop the integer-coded Grade column. Rebinding with errors="ignore" (instead
# of inplace=True) keeps the cell re-runnable: a second run no longer raises
# KeyError.
health_data_ohe = health_data_ohe.drop(columns=["Grade"], errors="ignore")

In [33]:
# Features: every column except the target. Target: Weight_kg.
X = health_data_ohe.drop(columns="Weight_kg")
y = health_data_ohe["Weight_kg"]

In [34]:
# No intercept: the three one-hot columns already sum to 1 in every row, so
# adding an intercept increases k with a redundant term and makes the model
# unstable
linear_model_ohe = LinearRegression(
    fit_intercept=False,
)

In [35]:
linear_model_ohe.fit(X=X, y=y)

print("Training score: ", linear_model_ohe.score(X=X, y=y))

Training score:  0.46521549611032575


In [37]:
# diff from treatment encoding but since no interceot, intercept is 0
linear_model_ohe.coef_, linear_model_ohe.intercept_

(array([26.18367347, 33.14851485, 36.07009346]), 0.0)