In [139]:
from pyforest import *
import statsmodels.api as sm
from statsmodels.formula.api import ols
from patsy.contrasts import ContrastMatrix
from category_encoders import BackwardDifferenceEncoder, HelmertEncoder, PolynomialEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

In [4]:
iris_data = pd.read_csv('iris_data.csv')
iris_target = pd.read_csv('iris_target.csv')

In [5]:
iris_data.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

In [6]:
iris_data = pd.concat([iris_data, iris_target],axis=1)

In [69]:
iris_data.rename(columns = {'0' : 'Species'}, inplace=True)

In [71]:
for i in range(len(iris_data)):
    if iris_data.loc[i, 'Species'] == 0:
        iris_data.loc[i, 'Species'] = 'Iris-setosa'
    elif iris_data.loc[i, 'Species'] == 1:
        iris_data.loc[i, 'Species'] = 'Iris-versicolor'
    else:
        iris_data.loc[i, 'Species'] = 'Iris-virginica'

In [72]:
iris_data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [74]:
iris_data.drop(columns = ['sepal_length', 'sepal_width', 'petal_width'], inplace=True)

In [75]:
iris_data.sample(5)

Unnamed: 0,petal_length,Species
148,5.4,Iris-virginica
135,6.1,Iris-virginica
33,1.4,Iris-setosa
76,4.8,Iris-versicolor
24,1.9,Iris-setosa


In [76]:
iris_data.describe()

Unnamed: 0,petal_length
count,150.0
mean,3.758667
std,1.76442
min,1.0
25%,1.6
50%,4.35
75%,5.1
max,6.9


In [77]:
iris_species_mean = iris_data.groupby(by = 'Species').mean()

iris_species_mean

Unnamed: 0_level_0,petal_length
Species,Unnamed: 1_level_1
Iris-setosa,1.464
Iris-versicolor,4.26
Iris-virginica,5.552


In [80]:
mod = ols('petal_length ~ Species', data = iris_data)

res = mod.fit()

res.summary()

0,1,2,3
Dep. Variable:,petal_length,R-squared:,0.941
Model:,OLS,Adj. R-squared:,0.941
Method:,Least Squares,F-statistic:,1179.0
Date:,"Sun, 19 Apr 2020",Prob (F-statistic):,3.05e-91
Time:,17:16:28,Log-Likelihood:,-84.84
No. Observations:,150,AIC:,175.7
Df Residuals:,147,BIC:,184.7
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.4640,0.061,24.057,0.000,1.344,1.584
Species[T.Iris-versicolor],2.7960,0.086,32.488,0.000,2.626,2.966
Species[T.Iris-virginica],4.0880,0.086,47.500,0.000,3.918,4.258

0,1,2,3
Omnibus:,4.393,Durbin-Watson:,2.0
Prob(Omnibus):,0.111,Jarque-Bera (JB):,5.37
Skew:,0.121,Prob(JB):,0.0682
Kurtosis:,3.895,Cond. No.,3.73


In [58]:
mags

['ghaff']

In [82]:
# def _name_levels(prefix, levels)

In [140]:
# Backward difference coding
# encoder = BackwardDifferenceEncoder(cols = ['Species'])
# encoder = HelmertEncoder(cols = ['Species'])
encoder = PolynomialEncoder(cols = ['Species'])

In [141]:
species_encoded = encoder.fit_transform(iris_data)

In [142]:
species_encoded.head()

Unnamed: 0,intercept,petal_length,Species_0,Species_1
0,1,1.4,-0.707107,0.408248
1,1,1.4,-0.707107,0.408248
2,1,1.3,-0.707107,0.408248
3,1,1.5,-0.707107,0.408248
4,1,1.4,-0.707107,0.408248


In [143]:
encoded_iris = pd.concat([iris_data['Species'], species_encoded], axis=1)

In [144]:
encoded_iris.sample(4)

Unnamed: 0,Species,intercept,petal_length,Species_0,Species_1
61,Iris-versicolor,1,4.2,-5.5511150000000004e-17,-0.816497
145,Iris-virginica,1,5.2,0.7071068,0.408248
79,Iris-versicolor,1,3.5,-5.5511150000000004e-17,-0.816497
35,Iris-setosa,1,1.2,-0.7071068,0.408248


In [145]:
X = encoded_iris.drop(columns=['Species', 'petal_length'])
Y = encoded_iris['petal_length']

In [146]:
model = LinearRegression(fit_intercept=False)
# model = GradientBoostingRegressor(max_depth=100)
model.fit(X,Y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [147]:
model.score(X,Y)

0.9413189735606261

In [118]:
# Helmert Coding
# Compares mean of the dependent variable over all categories

In [148]:
# generating evenly spaced
_, bin_edges = np.histogram(iris_data['petal_length'], 3)

In [149]:
bin_edges

array([1.        , 2.96666667, 4.93333333, 6.9       ])

In [150]:
pl_cat = np.digitize(iris_data['petal_length'], bin_edges, True)

In [151]:
pl_cat

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [152]:
iris_data['pl_cat'] = pl_cat

In [153]:
iris_data.sample(4)

Unnamed: 0,petal_length,Species,pl_cat
25,1.6,Iris-setosa,1
52,4.9,Iris-versicolor,2
133,5.1,Iris-virginica,3
111,5.3,Iris-virginica,3
