In [2]:
import pandas as pd

In [4]:
df = pd.read_csv('../../../datasets/bike.csv')

In [5]:
df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
df_train, df_test = train_test_split(df, train_size=0.7, random_state=123)

In [8]:
from statsmodels.formula.api import ols

In [11]:
model = ols(formula='registered ~ temp', data=df_train).fit()
model.summary()

0,1,2,3
Dep. Variable:,registered,R-squared:,0.106
Model:,OLS,Adj. R-squared:,0.106
Method:,Least Squares,F-statistic:,902.3
Date:,"Wed, 30 Nov 2022",Prob (F-statistic):,1.92e-187
Time:,09:46:35,Log-Likelihood:,-48650.0
No. Observations:,7620,AIC:,97300.0
Df Residuals:,7618,BIC:,97320.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,27.5151,4.559,6.036,0.000,18.579,36.452
temp,6.3391,0.211,30.038,0.000,5.925,6.753

0,1,2,3
Omnibus:,2097.525,Durbin-Watson:,2.022
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5337.402
Skew:,1.502,Prob(JB):,0.0
Kurtosis:,5.79,Cond. No.,60.1


In [12]:
df_train, df_test = train_test_split(df, train_size=0.7, random_state=123)

In [22]:
model = ols(formula='casual ~ atemp', data=df_train).fit()
pred = model.predict(df_test)
pred.head(2)

6495    31.499001
7050    12.626390
dtype: float64

In [15]:
from sklearn.metrics import mean_squared_error

In [23]:
mean_squared_error(y_true=df_test['casual'], y_pred=pred) ** 0.5

44.46237010271433

In [30]:
df_s2 = df.loc[df['season'] == 2,]
df_s4 = df.loc[df['season'] == 4,]

df_s2_train, df_s2_test = train_test_split(df_s2, train_size=0.7, random_state=123)
df_s4_train, df_s4_test = train_test_split(df_s4, train_size=0.7, random_state=123)

model_s2 = ols(formula='casual ~ atemp', data=df_s2_train).fit()
pred_s2 = model_s2.predict(df_s2_test)
model_s4 = ols(formula='casual ~ atemp', data=df_s4_train).fit()
pred_s4 = model_s4.predict(df_s4_test)

rmse_s2 = mean_squared_error(y_true=df_s2_test['casual'], y_pred=pred_s2) ** 0.5
rmse_s4 = mean_squared_error(y_true=df_s4_test['casual'], y_pred=pred_s4) ** 0.5

abs(rmse_s2 - rmse_s4).round(1)

8.6

In [3]:
df = pd.read_csv('../../../datasets/diamonds.csv')

In [4]:
df.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31


In [5]:
df_sub = pd.concat([df.loc[:, 'carat'], df.loc[:, 'depth':'z']], axis=1)
df_sub.head(2)

Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326,3.95,3.98,2.43
1,0.21,59.8,61.0,326,3.89,3.84,2.31


In [6]:
from patsy import dmatrices

In [10]:
y, X = dmatrices(formula_like='price ~ carat + depth + table + x + y + z', data=df_sub, return_type='dataframe')

In [13]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [16]:
df_vif = pd.DataFrame()
df_vif['colname'] = X.columns
df_vif['VIF'] = [vif(X.values, i) for i in range(X.shape[1])]
df_vif

Unnamed: 0,colname,VIF
0,Intercept,4821.69635
1,carat,21.602712
2,depth,1.49659
3,table,1.143225
4,x,56.187704
5,y,20.454295
6,z,23.530049


In [21]:
df_dia = pd.DataFrame({'carat': [1], 'depth': [60], 'table': [55]})
df_dia

Unnamed: 0,carat,depth,table
0,1,60,55


In [22]:
from statsmodels.formula.api import ols

In [72]:
model = ols('price ~ carat + depth', data=df).fit()

In [37]:
model.predict(df_dia).round()

0    5681.0
dtype: float64

In [38]:
df.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31


In [65]:
df_sub = df.loc[:, ['price', 'carat', 'color', 'depth']]
df_sub.head(2)

Unnamed: 0,price,carat,color,depth
0,326,0.23,E,61.5
1,326,0.21,E,59.8


In [66]:
df_dummy = pd.get_dummies(df_sub, columns=['color'], drop_first=True)
df_dummy.head(2)

Unnamed: 0,price,carat,depth,color_E,color_F,color_G,color_H,color_I,color_J
0,326,0.23,61.5,1,0,0,0,0,0
1,326,0.21,59.8,1,0,0,0,0,0


In [74]:
formula = 'price ~ ' + '+'.join(df_dummy.columns[1:])
formula
# formula = 'price ~ depth'
model = ols(formula=formula, data=df_dummy).fit()
# model.predict(df_pred)
# df_dummy.head(2)

In [77]:
df_test = df_dummy.iloc[[0],]
df_test['carat'] = 1
df_test['depth'] = 50
df_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['carat'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['depth'] = 50


Unnamed: 0,price,carat,depth,color_E,color_F,color_G,color_H,color_I,color_J
0,326,1,50,1,0,0,0,0,0


In [79]:
model.predict(df_test).round()

0    6885.0
dtype: float64

In [80]:
df = pd.read_csv('../../../datasets/diabetes.csv')

In [81]:
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
