In [76]:
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.weightstats import *

In [77]:
def two_sided_z_test(sample_mean, null_mean, sigma, n):
    """Return ``(z, p_value)`` for a two-sided one-sample z-test.

    Parameters
    ----------
    sample_mean : float
        Observed sample mean.
    null_mean : float
        Mean assumed under the null hypothesis.
    sigma : float
        Standard deviation, treated as known.
    n : int
        Sample size.

    Returns
    -------
    tuple of float
        The z-statistic and the two-sided p-value under the standard normal.
    """
    z_stat = (sample_mean - null_mean) / (sigma / np.sqrt(n))
    # Use |z| so the p-value is valid for either sign of the deviation
    # (2 * (1 - cdf(z)) exceeds 1 when z < 0), and sf() rather than
    # 1 - cdf() to avoid cancellation in the far tail.
    return z_stat, 2 * stats.norm.sf(abs(z_stat))


# NOTE(review): the roles of 9.57 / 9.5 / 0.4 / 160 are read off the z-statistic
# formula; confirm them against the exercise statement.
z, p_value = two_sided_z_test(sample_mean=9.57, null_mean=9.5, sigma=0.4, n=160)
density = stats.norm.cdf(z)  # kept so any later cell reading `density` still works

print(f"p-value: {p_value:.4f}")

p-value: 0.0269


In [78]:
# Read the tab-separated file into a DataFrame and preview its first rows.
# `data.head()` stays the last expression so the notebook renders the preview.
data = pd.read_csv('diamonds.txt', sep='\t')
data.head()

Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326,3.95,3.98,2.43
1,0.21,59.8,61.0,326,3.89,3.84,2.31
2,0.23,56.9,65.0,327,4.05,4.07,2.31
3,0.29,62.4,58.0,334,4.2,4.23,2.63
4,0.31,63.3,58.0,335,4.34,4.35,2.75


In [79]:
# Features are every column except 'price'; 'price' itself is the target.
X = data.drop(columns='price')
y = data['price']

In [80]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [81]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting are chained.
lr_model = LinearRegression().fit(X_train, y_train)
# Predictions on the held-out rows, used later for the error comparison.
y_pred_lr = lr_model.predict(X_test)

In [82]:
import warnings
warnings.filterwarnings('ignore')

In [83]:
from sklearn.ensemble import RandomForestRegressor

# Seeded forest so repeated runs give the same fitted model; fit() returns the
# estimator itself, which allows chaining.
rf_model = RandomForestRegressor(random_state=1).fit(X_train, y_train)
# Predictions on the held-out rows, used later for the error comparison.
y_pred_rf = rf_model.predict(X_test)

In [84]:
# Per-observation absolute prediction error on the held-out set, one series
# per model (index-aligned with y_test).
lr_model_abs_diff = (y_test - y_pred_lr).abs()
rf_model_abs_diff = (y_test - y_pred_rf).abs()

In [85]:
# Related-samples t-test on the two models' absolute errors, which are computed
# on the same test rows. Sample `a` is the forest, so a negative statistic means
# the forest's mean absolute error is the smaller one.
# Kept as the cell's last expression so the result is displayed.
stats.ttest_rel(a=rf_model_abs_diff, b=lr_model_abs_diff)

Ttest_relResult(statistic=-13.017729783878696, pvalue=1.655174575138418e-38)

In [86]:
# Differences are taken as linear-regression error minus random-forest error,
# so a strictly positive interval means the linear model's mean absolute error
# is the larger one. Note the subtraction order is the reverse of the argument
# order in the ttest_rel cell.
abs_error_gap = lr_model_abs_diff - rf_model_abs_diff
# alpha=0.05 is spelled out to match the "95%" label in the printed message.
conf_int = DescrStatsW(abs_error_gap).tconfint_mean(alpha=0.05)
print(f"95% confidence interval: ({conf_int[0]}, {conf_int[1]})")

95% confidence interval: (74.2872453259553, 100.62452098634381)
