In [361]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.formula.api as smf
from sklearn.preprocessing import MinMaxScaler
import re
import plotly.express as px

## Odczytanie danych i wstępna ocena

In [362]:
# A multi-character separator is interpreted as a regular expression, which only
# the python parser supports. Naming the engine explicitly keeps the parsing
# identical while avoiding the ParserWarning pandas emits when it has to fall
# back from the C engine on its own.
df = pd.read_csv('messy_data.csv', sep=', ', engine='python')
df.info()
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   carat        150 non-null    float64
 1   clarity      200 non-null    object 
 2   color        200 non-null    object 
 3   cut          200 non-null    object 
 4   x dimension  197 non-null    float64
 5   y dimension  136 non-null    float64
 6   z dimension  152 non-null    float64
 7   depth        133 non-null    float64
 8   table        168 non-null    object 
 9   price        196 non-null    float64
dtypes: float64(6), object(4)
memory usage: 15.8+ KB






Unnamed: 0,carat,clarity,color,cut,x dimension,y dimension,z dimension,depth,table,price
0,0.5,IF,D,Ideal,5.1,5.15,3.2,61.5,,3000.0
1,0.7,vvs2,E,premium,5.7,,3.52,62.0,59.0,4500.0
2,,Si2,h,Good,4.3,4.31,,62.3,56.0,700.0
3,1.2,if,d,ideal,,6.82,4.2,61.7,58.0,10000.0
4,0.9,I1,J,Fair,6.0,,3.7,61.7,,2400.0


## Duplikaty

In [363]:
# Exact full-row duplicates: the per-row flag and the total count.
display(df.duplicated())
display(f"Duplications: {df.duplicated().sum()}")

# Rows that agree on every physical measurement (all members of a group are kept).
duplicates = df.duplicated(
    subset=["x dimension", "y dimension", "z dimension", "depth", "table"],
    keep=False,
)
display(df.loc[duplicates])

# Rows that agree on carat and cut; the mask is kept for manual, per-carat inspection.
duplicates = df.duplicated(subset=["carat", "cut"], keep=False)

# Hard-coded row pairs shown side by side.
for pair in ([73, 137], [0, 6]):
    display(df.loc[pair])

0      False
1      False
2      False
3      False
4      False
       ...  
195    False
196    False
197    False
198    False
199    False
Length: 200, dtype: bool

'Duplications: 0'

Unnamed: 0,carat,clarity,color,cut,x dimension,y dimension,z dimension,depth,table,price
143,,IF,I,Premium,6.5,6.52,4.03,62.6,,7700.0
185,,VVS1,H,Good,6.5,6.52,4.03,62.6,,7900.0


Unnamed: 0,carat,clarity,color,cut,x dimension,y dimension,z dimension,depth,table,price
73,1.2,VVS1,e,Premium,6.8,,4.2,62.7,58,10200.0
137,1.2,vvs1,G,Premium,6.8,,4.2,,58,10100.0


Unnamed: 0,carat,clarity,color,cut,x dimension,y dimension,z dimension,depth,table,price
0,0.5,IF,D,Ideal,5.1,5.15,3.2,61.5,,3000.0
6,0.5,if,D,Ideal,5.1,,3.2,61.5,57.0,3100.0


## Mapowanie niewłaściwych wartości do NaN i ustandaryzowanie wartości kategorycznych

In [364]:
# One message per column with its number of missing values.
for col, n_missing in df.isna().sum().items():
    display(f'{col} number of null values: {n_missing}')

'carat number of null values: 50'

'clarity number of null values: 0'

'color number of null values: 0'

'cut number of null values: 0'

'x dimension number of null values: 3'

'y dimension number of null values: 64'

'z dimension number of null values: 48'

'depth number of null values: 67'

'table number of null values: 32'

'price number of null values: 4'

In [365]:
# Collapse case variants ('if' / 'IF', 'very Good' / 'Very good', ...) by upper-casing
# every label, showing the distinct values before and after.
for col in ['clarity', 'color', 'cut']:
    display(f'{col} unique values: {df[col].unique()}')
    uppercased = df[col].str.upper()
    df[col] = uppercased
    display(f'{col} unique values after uppercase transformation: {df[col].unique()}')

"clarity unique values: ['IF' 'vvs2' 'Si2' 'if' 'I1' 'Si1' 'Vvs1' 'SI2' 'vvs1' 'si2' 'si1' 'i1'\n 'VVS1' 'VVS2']"

"clarity unique values after uppercase transformation: ['IF' 'VVS2' 'SI2' 'I1' 'SI1' 'VVS1']"

"color unique values: ['D' 'E' 'h' 'd' 'J' 'G' 'f' 'I' 'F' 'Colorless' 'j' 'e' 'H' 'g'\n 'colorless']"

"color unique values after uppercase transformation: ['D' 'E' 'H' 'J' 'G' 'F' 'I' 'COLORLESS']"

"cut unique values: ['Ideal' 'premium' 'Good' 'ideal' 'Fair' 'very Good' 'good' 'Very good'\n 'very good' 'fair' 'Premium' 'Very Good']"

"cut unique values after uppercase transformation: ['IDEAL' 'PREMIUM' 'GOOD' 'FAIR' 'VERY GOOD']"

In [366]:
# Store the three label columns with the pandas 'category' dtype.
for col in ('clarity', 'color', 'cut'):
    df[col] = df[col].astype('category')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   carat        150 non-null    float64 
 1   clarity      200 non-null    category
 2   color        200 non-null    category
 3   cut          200 non-null    category
 4   x dimension  197 non-null    float64 
 5   y dimension  136 non-null    float64 
 6   z dimension  152 non-null    float64 
 7   depth        133 non-null    float64 
 8   table        168 non-null    object  
 9   price        196 non-null    float64 
dtypes: category(3), float64(6), object(1)
memory usage: 12.4+ KB


In [367]:
# List the distinct raw values of 'table', which was loaded with object dtype.
df['table'].unique()

array([nan, '59', '56', '58', '57', ',', '58,', '54', '57,', '60', '55'],
      dtype=object)

In [368]:
# Drop stray commas; regex=False makes the literal match explicit on every pandas version.
df['table'] = df['table'].str.replace(",", "", regex=False)
display(df['table'].unique())
# A value that was only a comma is now an empty string: treat it as missing.
# The replacement is scoped to 'table' instead of rewriting the whole frame in place.
df['table'] = df['table'].replace({"": np.nan})
display(df['table'].unique())
df['table'] = df['table'].astype('float64')
df.info()

array([nan, '59', '56', '58', '57', '', '54', '60', '55'], dtype=object)

array([nan, '59', '56', '58', '57', '54', '60', '55'], dtype=object)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   carat        150 non-null    float64 
 1   clarity      200 non-null    category
 2   color        200 non-null    category
 3   cut          200 non-null    category
 4   x dimension  197 non-null    float64 
 5   y dimension  136 non-null    float64 
 6   z dimension  152 non-null    float64 
 7   depth        133 non-null    float64 
 8   table        167 non-null    float64 
 9   price        196 non-null    float64 
dtypes: category(3), float64(7)
memory usage: 12.4 KB


## Uzupełnienie brakujących wartości medianą

In [369]:
# Fill every numeric column with its own median in a single assignment.
# Calling `df[col].fillna(..., inplace=True)` is chained assignment: it raises a
# FutureWarning on pandas 2.x and leaves `df` unchanged under Copy-on-Write.
numeric_cols = ['carat', 'x dimension', 'y dimension', 'z dimension', 'depth', 'table', 'price']
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   carat        200 non-null    float64 
 1   clarity      200 non-null    category
 2   color        200 non-null    category
 3   cut          200 non-null    category
 4   x dimension  200 non-null    float64 
 5   y dimension  200 non-null    float64 
 6   z dimension  200 non-null    float64 
 7   depth        200 non-null    float64 
 8   table        200 non-null    float64 
 9   price        200 non-null    float64 
dtypes: category(3), float64(7)
memory usage: 12.4 KB


In [370]:
# Spot-check the first rows after the missing values were filled.
df.head()

Unnamed: 0,carat,clarity,color,cut,x dimension,y dimension,z dimension,depth,table,price
0,0.5,IF,D,IDEAL,5.1,5.15,3.2,61.5,57.0,3000.0
1,0.7,VVS2,E,PREMIUM,5.7,5.85,3.52,62.0,59.0,4500.0
2,0.75,SI2,H,GOOD,4.3,4.31,3.715,62.3,56.0,700.0
3,1.2,IF,D,IDEAL,5.9,6.82,4.2,61.7,58.0,10000.0
4,0.9,I1,J,FAIR,6.0,5.85,3.7,61.7,57.0,2400.0


In [371]:
# Persist the cleaned frame; the row index is not written.
df.to_csv('data_cleaned.csv', index=False)

In [372]:
# Min-max scale the numeric predictors; price and the categorical columns are
# attached afterwards without scaling.
df_numeric_columns = df[['carat', 'x dimension', 'y dimension', 'z dimension', 'depth', 'table']].columns
df_numeric_values = df[df_numeric_columns].values
min_max_scaler = MinMaxScaler()
df_numeric_values_scaled = min_max_scaler.fit_transform(df_numeric_values)

# Whitespace is stripped from the column names so they can be used as bare
# terms in the regression formula.
x = pd.DataFrame(data=df_numeric_values_scaled, columns=df_numeric_columns).rename(
    columns=lambda name: re.sub(r"\s+", "", name)
)

remaining_data = {name: df[name].values for name in ('clarity', 'color', 'cut')}
remaining_data['price'] = df['price']
normalized_df = x.assign(**remaining_data)
display(normalized_df)

Unnamed: 0,carat,xdimension,ydimension,zdimension,depth,table,clarity,color,cut,price
0,0.214286,0.352941,0.371681,0.377990,0.166667,0.500000,IF,D,IDEAL,3000.0
1,0.357143,0.529412,0.578171,0.531100,0.444444,0.833333,VVS2,E,PREMIUM,4500.0
2,0.392857,0.117647,0.123894,0.624402,0.611111,0.333333,SI2,H,GOOD,700.0
3,0.714286,0.588235,0.864307,0.856459,0.277778,0.666667,IF,D,IDEAL,10000.0
4,0.500000,0.617647,0.578171,0.617225,0.277778,0.500000,I1,J,FAIR,2400.0
...,...,...,...,...,...,...,...,...,...,...
195,0.178571,0.294118,0.294985,0.624402,0.666667,0.833333,SI1,F,VERY GOOD,2300.0
196,0.750000,0.882353,0.578171,0.889952,0.611111,0.166667,VVS2,H,GOOD,10400.0
197,0.250000,0.411765,0.578171,0.416268,0.611111,0.500000,I1,I,PREMIUM,150000.0
198,0.464286,0.647059,0.660767,0.655502,0.611111,0.666667,SI2,E,FAIR,6300.0


In [373]:
# Full model: all scaled numeric predictors plus the three categoricals.
model = smf.ols(
    formula=(
        "price ~ carat + xdimension + ydimension + zdimension + depth + table"
        " + C(clarity) + C(color) + C(cut)"
    ),
    data=normalized_df,
).fit()

In [374]:
# Render the regression summary of the model fitted in the previous cell.
display(model.summary())

0,1,2,3
Dep. Variable:,price,R-squared:,0.177
Model:,OLS,Adj. R-squared:,0.074
Method:,Least Squares,F-statistic:,1.727
Date:,"Sun, 04 Feb 2024",Prob (F-statistic):,0.0282
Time:,00:28:48,Log-Likelihood:,-2381.6
No. Observations:,200,AIC:,4809.0
Df Residuals:,177,BIC:,4885.0
Df Model:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,896.8654,2.15e+04,0.042,0.967,-4.15e+04,4.33e+04
C(clarity)[T.IF],-6575.4307,1.15e+04,-0.573,0.568,-2.92e+04,1.61e+04
C(clarity)[T.SI1],-1.691e+04,1.05e+04,-1.612,0.109,-3.76e+04,3796.523
C(clarity)[T.SI2],-1831.4370,9582.916,-0.191,0.849,-2.07e+04,1.71e+04
C(clarity)[T.VVS1],-9336.6332,1.06e+04,-0.880,0.380,-3.03e+04,1.16e+04
C(clarity)[T.VVS2],-2666.8265,1.02e+04,-0.261,0.795,-2.29e+04,1.75e+04
C(color)[T.D],3.364e+04,1.75e+04,1.927,0.056,-816.681,6.81e+04
C(color)[T.E],-327.2464,1.68e+04,-0.019,0.984,-3.35e+04,3.29e+04
C(color)[T.F],4970.6895,1.62e+04,0.307,0.759,-2.69e+04,3.69e+04

0,1,2,3
Omnibus:,237.336,Durbin-Watson:,1.977
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7972.497
Skew:,4.996,Prob(JB):,0.0
Kurtosis:,32.272,Cond. No.,37.1


In [375]:
# Raw arrays of p-values, coefficients and standard errors, in the model's term order.
for label, estimates in (
    ("Model P Values:", model.pvalues),
    ("Model Coef:", model.params),
    ("Model Std Errs:", model.bse),
):
    print(label, estimates.values)

Model P Values: [0.966729   0.56770152 0.10881739 0.8486547  0.37988474 0.79477683
 0.05562121 0.98449352 0.75885974 0.39002661 0.93547615 0.596888
 0.98982171 0.04597147 0.28859211 0.32479167 0.72557767 0.74143086
 0.05799452 0.00249771 0.84253939 0.4326365  0.07892656]
Model Coef: [   896.86539568  -6575.43067383 -16912.04328655  -1831.43695671
  -9336.63318805  -2666.82650913  33638.45048851   -327.24642635
   4970.68946517  13590.69024123   1338.20368715   8580.05803878
    252.91186789 -18458.98665802 -10851.08558102  -9951.71129034
   3378.02353329  -9864.27941314  82898.0237829  -84646.37917569
  -5763.36941178  16969.0089416   25881.10411933]
Model Std Errs: [21471.43692143 11485.22378773 10493.54696105  9582.91573818
 10606.0664744  10237.12873341 17459.27466561 16813.59916916
 16167.29387782 15772.22105976 16506.32544735 16193.728019
 19797.44706059  9184.50177858 10194.44317256 10078.64538921
  9608.34898678 29848.66351013 43444.89124871 27594.50663
 28970.64581999 21575.821

In [376]:
# Pairwise correlations between price and the numeric columns of the cleaned frame.
px.imshow(
    df[['carat', 'x dimension', 'y dimension', 'z dimension', 'depth', 'table', 'price']].corr(),
    color_continuous_scale='Agsunset',
    title="Correlation heatmap for price and variables",
    text_auto=True,
)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [377]:
def determine_outlier_thresholds_iqr(dataframe, col_name, th1=0.25, th3=0.75):
    """Return the ``(lower_limit, upper_limit)`` outlier fences of one column.

    The fences lie 1.5 inter-quantile ranges below the ``th1`` quantile and
    above the ``th3`` quantile of ``dataframe[col_name]``.
    """
    quartile1 = dataframe[col_name].quantile(th1)
    quartile3 = dataframe[col_name].quantile(th3)
    iqr = quartile3 - quartile1
    upper_limit = quartile3 + 1.5 * iqr
    lower_limit = quartile1 - 1.5 * iqr
    return lower_limit, upper_limit


def check_outliers_iqr(dataframe, col_name, th1=0.25, th3=0.75):
    """Return True when ``dataframe[col_name]`` has a value outside its fences.

    ``th1`` / ``th3`` are forwarded to ``determine_outlier_thresholds_iqr`` so
    the check uses the same quantiles as the caller; the defaults keep the
    previous two-argument behaviour.
    """
    lower_limit, upper_limit = determine_outlier_thresholds_iqr(dataframe, col_name, th1, th3)
    column = dataframe[col_name]
    # Test the mask itself. Testing the truthiness of the selected rows would
    # depend on unrelated columns and would miss an outlier whose row holds
    # only zeros.
    return bool(((column > upper_limit) | (column < lower_limit)).any())


def replace_with_thresholds_iqr(dataframe, cols, th1=0.05, th3=0.95, replace=False):
    """Report IQR outliers per column and optionally clip them to the fences.

    For every name in ``cols`` the fences are computed from the ``th1`` /
    ``th3`` quantiles. With ``replace=True`` values beyond a fence are
    overwritten with that fence, modifying ``dataframe`` in place. A table is
    printed with, per column: whether outliers were present before, whether
    any remain, how many were found, and the fences. Returns None.
    """
    from tabulate import tabulate
    data = []
    for col_name in cols:
        lower_limit, upper_limit = determine_outlier_thresholds_iqr(dataframe, col_name, th1, th3)
        outlier_mask = (dataframe[col_name] > upper_limit) | (dataframe[col_name] < lower_limit)
        # Flag and count come from the same fences that are used for clipping.
        outliers = bool(outlier_mask.any())
        count = None
        if outliers:
            count = int(outlier_mask.sum())
            if replace:
                dataframe.loc[(dataframe[col_name] < lower_limit), col_name] = lower_limit
                dataframe.loc[(dataframe[col_name] > upper_limit), col_name] = upper_limit
        # Re-check after the optional clipping, again with the caller's quantiles.
        outliers_status = check_outliers_iqr(dataframe, col_name, th1, th3)
        data.append([outliers, outliers_status, count, col_name, lower_limit, upper_limit])
    table = tabulate(data, headers=['Outliers (Previously)', 'Outliers', 'Count', 'Column', 'Lower Limit', 'Upper Limit'], tablefmt='rst', numalign='right')
    print("Removing Outliers using IQR")
    print(table)

In [378]:
# Min-max scale the numeric predictors and attach the unscaled price.
df_numeric_columns = df[['carat', 'x dimension', 'y dimension', 'z dimension', 'depth', 'table']].columns
df_numeric_values = df[df_numeric_columns].values
min_max_scaler = MinMaxScaler()
df_numeric_values_scaled = min_max_scaler.fit_transform(df_numeric_values)
x = pd.DataFrame(data=df_numeric_values_scaled, columns=df_numeric_columns).assign(price=df['price'])
display(x)

# Clip every numeric column, price included, to its 1.5 * IQR fences (modifies x in place).
replace_with_thresholds_iqr(
    x,
    ['carat', 'x dimension', 'y dimension', 'z dimension', 'depth', 'table', 'price'],
    th1=0.25,
    th3=0.75,
    replace=True,
)

remaining_data = {name: df[name].values for name in ('clarity', 'color', 'cut')}
normalized_df = x.assign(**remaining_data)
# The CSV is written before the rename, so it keeps the original column names with spaces.
normalized_df.to_csv('data_normalized.csv', index=False)
normalized_df = normalized_df.rename(columns=lambda name: re.sub(r"\s+", "", name))
display(normalized_df)

Unnamed: 0,carat,x dimension,y dimension,z dimension,depth,table,price
0,0.214286,0.352941,0.371681,0.377990,0.166667,0.500000,3000.0
1,0.357143,0.529412,0.578171,0.531100,0.444444,0.833333,4500.0
2,0.392857,0.117647,0.123894,0.624402,0.611111,0.333333,700.0
3,0.714286,0.588235,0.864307,0.856459,0.277778,0.666667,10000.0
4,0.500000,0.617647,0.578171,0.617225,0.277778,0.500000,2400.0
...,...,...,...,...,...,...,...
195,0.178571,0.294118,0.294985,0.624402,0.666667,0.833333,2300.0
196,0.750000,0.882353,0.578171,0.889952,0.611111,0.166667,10400.0
197,0.250000,0.411765,0.578171,0.416268,0.611111,0.500000,150000.0
198,0.464286,0.647059,0.660767,0.655502,0.611111,0.666667,6300.0


Removing Outliers using IQR
Outliers (Previously)    Outliers      Count  Column         Lower Limit    Upper Limit
True                     False             2  carat            -0.111607       0.924107
False                    False                x dimension      -0.117647        1.29412
True                     False            16  y dimension       0.239676       0.929941
True                     False             2  z dimension       0.157297        1.09988
True                     False            35  depth             0.388889       0.833333
False                    False                table            -0.166667        1.16667
True                     False             5  price                -5350          15850


Unnamed: 0,carat,xdimension,ydimension,zdimension,depth,table,price,clarity,color,cut
0,0.214286,0.352941,0.371681,0.377990,0.388889,0.500000,3000.0,IF,D,IDEAL
1,0.357143,0.529412,0.578171,0.531100,0.444444,0.833333,4500.0,VVS2,E,PREMIUM
2,0.392857,0.117647,0.239676,0.624402,0.611111,0.333333,700.0,SI2,H,GOOD
3,0.714286,0.588235,0.864307,0.856459,0.388889,0.666667,10000.0,IF,D,IDEAL
4,0.500000,0.617647,0.578171,0.617225,0.388889,0.500000,2400.0,I1,J,FAIR
...,...,...,...,...,...,...,...,...,...,...
195,0.178571,0.294118,0.294985,0.624402,0.666667,0.833333,2300.0,SI1,F,VERY GOOD
196,0.750000,0.882353,0.578171,0.889952,0.611111,0.166667,10400.0,VVS2,H,GOOD
197,0.250000,0.411765,0.578171,0.416268,0.611111,0.500000,15850.0,I1,I,PREMIUM
198,0.464286,0.647059,0.660767,0.655502,0.611111,0.666667,6300.0,SI2,E,FAIR


In [379]:
# Pairwise correlations between price and the scaled, clipped numeric columns.
px.imshow(
    normalized_df[['carat', 'xdimension', 'ydimension', 'zdimension', 'depth', 'table', 'price']].corr(),
    color_continuous_scale='Agsunset',
    title="Correlation heatmap for price and variables",
    text_auto=True,
)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [380]:
# Same full specification as before, refitted on the clipped data.
model = smf.ols(
    formula=(
        "price ~ carat + xdimension + ydimension + zdimension + depth + table"
        " + C(clarity) + C(color) + C(cut)"
    ),
    data=normalized_df,
).fit()
display(model.summary())

0,1,2,3
Dep. Variable:,price,R-squared:,0.72
Model:,OLS,Adj. R-squared:,0.686
Method:,Least Squares,F-statistic:,20.72
Date:,"Sun, 04 Feb 2024",Prob (F-statistic):,6.22e-38
Time:,00:29:10,Log-Likelihood:,-1786.5
No. Observations:,200,AIC:,3619.0
Df Residuals:,177,BIC:,3695.0
Df Model:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-3812.7667,1212.340,-3.145,0.002,-6205.268,-1420.265
C(clarity)[T.IF],1400.8085,584.515,2.397,0.018,247.294,2554.323
C(clarity)[T.SI1],380.1398,534.816,0.711,0.478,-675.297,1435.577
C(clarity)[T.SI2],587.3026,488.614,1.202,0.231,-376.957,1551.562
C(clarity)[T.VVS1],1195.3675,540.906,2.210,0.028,127.913,2262.822
C(clarity)[T.VVS2],1525.8692,523.224,2.916,0.004,493.308,2558.430
C(color)[T.D],1392.5052,892.379,1.560,0.120,-368.566,3153.576
C(color)[T.E],174.0423,857.093,0.203,0.839,-1517.395,1865.479
C(color)[T.F],597.0206,823.697,0.725,0.470,-1028.510,2222.552

0,1,2,3
Omnibus:,182.878,Durbin-Watson:,1.915
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4075.027
Skew:,3.399,Prob(JB):,0.0
Kurtosis:,24.043,Cond. No.,36.5


In [381]:
# One-hot encode the categoricals, dropping the first level of each.
# dtype=int yields 0/1 integer indicators directly, so no positional
# `columns[7:]` cast is needed afterwards.
# NOTE: this rebinds normalized_df; re-running the cell without re-running the
# previous ones fails because the three source columns no longer exist.
normalized_df = pd.get_dummies(
    normalized_df,
    columns=['clarity', 'color', 'cut'],
    drop_first=True,
    dtype=int,
)
normalized_df.head()

Unnamed: 0,carat,xdimension,ydimension,zdimension,depth,table,price,clarity_IF,clarity_SI1,clarity_SI2,...,color_E,color_F,color_G,color_H,color_I,color_J,cut_GOOD,cut_IDEAL,cut_PREMIUM,cut_VERY GOOD
0,0.214286,0.352941,0.371681,0.37799,0.388889,0.5,3000.0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.357143,0.529412,0.578171,0.5311,0.444444,0.833333,4500.0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,0.392857,0.117647,0.239676,0.624402,0.611111,0.333333,700.0,0,0,1,...,0,0,0,1,0,0,1,0,0,0
3,0.714286,0.588235,0.864307,0.856459,0.388889,0.666667,10000.0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.5,0.617647,0.578171,0.617225,0.388889,0.5,2400.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [382]:
def forward_selection(X, initial_list=[],
                      threshold_in=0.01,
                      verbose=True,
                      target="price"):
    """Greedy forward feature selection driven by OLS p-values.

    Starting from ``initial_list``, each round fits one OLS model per remaining
    column of ``X`` (current terms plus that column) and adds the column whose
    coefficient has the smallest p-value, as long as it is below
    ``threshold_in``. Selection stops when no candidate qualifies.

    Parameters
    ----------
    X : DataFrame holding the response and all candidate columns. Column names
        are inserted verbatim into the formula, so they must be usable as
        formula terms.
    initial_list : names to treat as already selected. The response may be
        listed here (as older callers do); it is then returned with the result
        but never used as a predictor.
    threshold_in : a candidate is added only if its p-value is below this.
    verbose : print the current formula and each addition.
    target : name of the response column; it is never offered as a candidate.

    Returns
    -------
    list : ``initial_list`` followed by the added columns, in order of addition.
    """
    # initial_list is copied, so the shared default list is never mutated.
    included = list(initial_list)
    # Predictors already in the model: the initial features minus the response.
    terms = [name for name in included if name != target]
    while True:
        formula = "{} ~ {}".format(target, " + ".join(terms)) if terms else "{} ~".format(target)
        if verbose:
            print(formula)
        # Keep the frame's column order so ties are resolved deterministically.
        excluded = [name for name in X.columns if name not in included and name != target]
        if not excluded:
            break
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            candidate_formula = "{} ~ {}".format(target, " + ".join(terms + [new_column]))
            model = smf.ols(formula=candidate_formula, data=X).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        # Written as `not <` so that a NaN p-value also ends the search.
        if not best_pval < threshold_in:
            break
        best_feature = excluded[new_pval.argmin()]
        included.append(best_feature)
        terms.append(best_feature)
        if verbose:
            print('Add {} with p-value {}'.format(best_feature, best_pval))
    return included

In [383]:
# The dummy column with a space in its name is renamed for the selection run only.
forward_selection(
    normalized_df.rename(columns={"cut_VERY GOOD" : "cut_VERY_GOOD"}),
    initial_list=['price'],
    verbose=True,
)

price ~
Add xdimension with p-value 9.385686313009106e-45
price ~ xdimension
Add table with p-value 0.0006283848512421541
price ~ xdimension + table
Add carat with p-value 0.003746992838645761
price ~ xdimension + table + carat


['price', 'xdimension', 'table', 'carat']

In [384]:
# Refit using only the three predictors returned by the forward selection.
model = smf.ols(
    formula="price ~ xdimension + table + carat",
    data=normalized_df,
).fit()
display(model.summary())

0,1,2,3
Dep. Variable:,price,R-squared:,0.667
Model:,OLS,Adj. R-squared:,0.662
Method:,Least Squares,F-statistic:,130.9
Date:,"Sun, 04 Feb 2024",Prob (F-statistic):,1.4299999999999999e-46
Time:,00:29:19,Log-Likelihood:,-1803.9
No. Observations:,200,AIC:,3616.0
Df Residuals:,196,BIC:,3629.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2353.6137,467.132,-5.038,0.000,-3274.865,-1432.363
xdimension,8411.5607,1301.332,6.464,0.000,5845.151,1.1e+04
table,2366.3035,682.263,3.468,0.001,1020.785,3711.822
carat,4304.8164,1467.285,2.934,0.004,1411.123,7198.510

0,1,2,3
Omnibus:,151.072,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2273.3
Skew:,2.69,Prob(JB):,0.0
Kurtosis:,18.616,Cond. No.,17.8


In [385]:
# Single-term alternative: price against the square of the scaled x dimension.
model = smf.ols(
    formula="price ~ I(xdimension**2)",
    data=normalized_df,
).fit()
display(model.summary())

0,1,2,3
Dep. Variable:,price,R-squared:,0.669
Model:,OLS,Adj. R-squared:,0.667
Method:,Least Squares,F-statistic:,399.4
Date:,"Sun, 04 Feb 2024",Prob (F-statistic):,2.3e-49
Time:,00:29:22,Log-Likelihood:,-1803.5
No. Observations:,200,AIC:,3611.0
Df Residuals:,198,BIC:,3618.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1215.6503,257.688,4.718,0.000,707.486,1723.815
I(xdimension ** 2),1.098e+04,549.643,19.984,0.000,9900.330,1.21e+04

0,1,2,3
Omnibus:,167.058,Durbin-Watson:,2.015
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3641.921
Skew:,2.953,Prob(JB):,0.0
Kurtosis:,23.054,Cond. No.,4.51


In [386]:
# Export the encoded frame; the rename gives the 'cut_VERY GOOD' dummy a space-free column name.
normalized_df.rename(columns={"cut_VERY GOOD" : "cut_VERY_GOOD"}).to_csv('regression_data.csv', index=False)