In [27]:
from pyforest import *
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.preprocessing import LabelEncoder
from patsy.contrasts import Treatment
from sklearn.linear_model import LinearRegression

In [2]:
# Toy data set for the contrast-coding walkthrough: one tuple per student,
# in the order (Grade, Gender, Height, Weight).
student_records = [
    ('First', 'Male', 100, 21),
    ('First', 'Male', 130, 12),
    ('First', 'Female', 120, 34),
    ('Second', 'Female', 122, 44),
    ('Third', 'Female', 111, 23),
    ('Second', 'Female', 105, 23),
]
students_data = pd.DataFrame(
    student_records,
    columns=['Grade', 'Gender', 'Height', 'Weight'],
)

In [3]:
# Show the raw records before any encoding is applied.
students_data

Unnamed: 0,Grade,Gender,Height,Weight
0,First,Male,100,21
1,First,Male,130,12
2,First,Female,120,34
3,Second,Female,122,44
4,Third,Female,111,23
5,Second,Female,105,23


In [4]:
# Per-grade averages of the numeric columns (Height, Weight).
# `numeric_only=True` keeps the string column `Gender` out of the aggregation:
# pandas >= 2.0 no longer drops non-numeric columns silently and raises a
# TypeError when asked to average strings.
grade_mean = students_data.groupby(by='Grade').mean(numeric_only=True)
grade_mean

Unnamed: 0_level_0,Height,Weight
Grade,Unnamed: 1_level_1,Unnamed: 2_level_1
First,116.666667,22.333333
Second,113.5,33.5
Third,111.0,23.0


In [5]:
# Gap between the mean Weight of the 'Second' and 'First' grades; this is the
# quantity the Grade[T.Second] coefficient reproduces in the regression below.
grade_mean.loc['Second', 'Weight'] - grade_mean.loc['First', 'Weight']

11.166666666666668

In [6]:
# Formula interface: Grade is still a string column here, so it is expanded
# into treatment-coded dummies automatically (see the Grade[T.*] terms in the
# summary).
model = ols(formula='Weight ~ Grade', data=students_data)

In [7]:
# Fit the formula-based model and show the full regression report.
res = model.fit()
res.summary()



0,1,2,3
Dep. Variable:,Weight,R-squared:,0.258
Model:,OLS,Adj. R-squared:,-0.237
Method:,Least Squares,F-statistic:,0.5213
Date:,"Sun, 19 Apr 2020",Prob (F-statistic):,0.639
Time:,14:54:25,Log-Likelihood:,-21.566
No. Observations:,6,AIC:,49.13
Df Residuals:,3,BIC:,48.51
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,22.3333,7.189,3.106,0.053,-0.546,45.213
Grade[T.Second],11.1667,11.367,0.982,0.398,-25.009,47.342
Grade[T.Third],0.6667,14.378,0.046,0.966,-45.092,46.425

0,1,2,3
Omnibus:,,Durbin-Watson:,1.692
Prob(Omnibus):,,Jarque-Bera (JB):,0.573
Skew:,0.118,Prob(JB):,0.751
Kurtosis:,1.504,Cond. No.,3.4


In [8]:
# Learn the label -> integer mapping first, then apply it to the column.
# NOTE(review): this overwrites the original Grade labels in place, so
# re-running the earlier `ols('Weight ~ Grade', ...)` cell after this one
# would see integer codes instead of grade labels.
encoder = LabelEncoder().fit(students_data['Grade'])
students_data['Grade'] = encoder.transform(students_data['Grade'])

In [9]:
# Grade now holds the integer codes produced by the encoder.
students_data

Unnamed: 0,Grade,Gender,Height,Weight
0,0,Male,100,21
1,0,Male,130,12
2,0,Female,120,34
3,1,Female,122,44
4,2,Female,111,23
5,1,Female,105,23


In [10]:
# Labels seen during fitting; a label's position in this array is the integer
# code now stored in the Grade column.
encoder.classes_

array(['First', 'Second', 'Third'], dtype=object)

In [11]:
# Factor levels for the contrast coding, derived from the encoded Grade column
# rather than hard-coded. The original cell computed `unique()` but discarded
# the result and typed the levels by hand. `.tolist()` yields plain Python
# ints and `sorted` fixes the order, giving [0, 1, 2] for this data.
levels = sorted(students_data['Grade'].unique().tolist())

In [12]:
# Treatment (dummy) coding with level 0 as the reference category: the
# reference level gets an all-zero row and every other level gets its own
# indicator column.
treatment_coding = Treatment(reference=0)
contrast_without_intercept_0 = treatment_coding.code_without_intercept(levels)

In [13]:
# One row per level, one column per non-reference level.
print(contrast_without_intercept_0.matrix)

[[0. 0.]
 [1. 0.]
 [0. 1.]]


In [14]:
# Build the with-intercept variant of the treatment coding for the same levels.
contrast_with_intercept = Treatment(reference=0).code_with_intercept(levels)
# NOTE(review): this prints the *without*-intercept matrix built in an earlier
# cell, not `contrast_with_intercept` assigned on the line above, which is
# never displayed. Possibly a copy/paste slip; confirm which matrix was meant.
print(contrast_without_intercept_0)

ContrastMatrix(array([[0., 0.],
                      [1., 0.],
                      [0., 1.]]), ['[T.1]', '[T.2]'])


In [15]:
# Work on an independent copy: a plain assignment would only alias
# `students_data`, so any later in-place change to one frame would silently
# show up in the other.
health_dummy_data = students_data.copy()

In [16]:
# Row i of the contrast matrix is the coding for level i, so indexing with the
# encoded Grade values yields one coded row per student.
grade_codes = health_dummy_data['Grade'].to_numpy()
health_data_contrast = contrast_without_intercept_0.matrix[grade_codes]

In [17]:
# One coded row per student, in the same order as the data frame.
health_data_contrast

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.]])

In [18]:
# Wrap the coded rows in a frame with one indicator column per non-reference
# grade. The builtin `int` replaces `np.int`, which was only an alias for it,
# was deprecated in NumPy 1.20 and removed in 1.24 (AttributeError there).
# The resulting dtype is unchanged.
students_health_contrast_df = pd.DataFrame(
    health_data_contrast,
    columns=['grade_2', 'grade_3'],
    dtype=int,
)

In [19]:
# Integer indicator columns for grades 2 and 3; grade 1 is the all-zero reference.
students_health_contrast_df

Unnamed: 0,grade_2,grade_3
0,0,0
1,0,0
2,0,0
3,1,0
4,0,1
5,1,0


In [20]:
# Attach the indicator columns to the student records. Building from
# `students_data` (which `health_dummy_data` was taken from) instead of
# concatenating onto `health_dummy_data` itself keeps this cell idempotent:
# re-running it no longer appends a second pair of grade_2/grade_3 columns.
health_dummy_data = pd.concat([students_data, students_health_contrast_df], axis=1)

In [21]:
# Student records with the two dummy columns appended.
health_dummy_data

Unnamed: 0,Grade,Gender,Height,Weight,grade_2,grade_3
0,0,Male,100,21,0,0
1,0,Male,130,12,0,0
2,0,Female,120,34,0,0
3,1,Female,122,44,1,0
4,2,Female,111,23,0,1
5,1,Female,105,23,1,0


In [22]:
# Design matrix (the two grade indicators) and response (Weight).
dummy_columns = ['grade_2', 'grade_3']
X = health_dummy_data[dummy_columns]
Y = health_dummy_data.loc[:, 'Weight']

In [26]:
# Add an explicit constant column so the fit includes an intercept term
# (reported as `const` in the summary).
X_with_constant = sm.add_constant(X)

# Same regression as the formula-based fit, now on the hand-built dummies.
model = sm.OLS(endog=Y, exog=X_with_constant)
res = model.fit()

res.summary()



0,1,2,3
Dep. Variable:,Weight,R-squared:,0.258
Model:,OLS,Adj. R-squared:,-0.237
Method:,Least Squares,F-statistic:,0.5213
Date:,"Sun, 19 Apr 2020",Prob (F-statistic):,0.639
Time:,14:57:05,Log-Likelihood:,-21.566
No. Observations:,6,AIC:,49.13
Df Residuals:,3,BIC:,48.51
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,22.3333,7.189,3.106,0.053,-0.546,45.213
grade_2,11.1667,11.367,0.982,0.398,-25.009,47.342
grade_3,0.6667,14.378,0.046,0.966,-45.092,46.425

0,1,2,3
Omnibus:,,Durbin-Watson:,1.692
Prob(Omnibus):,,Jarque-Bera (JB):,0.573
Skew:,0.118,Prob(JB):,0.751
Kurtosis:,1.504,Cond. No.,3.4


In [28]:
# scikit-learn fit on the same dummy-coded design. fit_intercept=True lets the
# estimator handle the intercept, so X is passed without a constant column.
regressor = LinearRegression(fit_intercept=True)
linear_model = regressor.fit(X, Y)

In [29]:
# R^2 on the training data; agrees with the R-squared reported by statsmodels.
linear_model.score(X,Y)

0.25791013028449894

In [30]:
# Slopes for grade_2 and grade_3, in the column order of X.
linear_model.coef_

array([11.16666667,  0.66666667])

In [31]:
# Fitted intercept; equals the mean Weight of the reference grade.
linear_model.intercept_

22.333333333333336