In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the delivery-time dataset from a CSV file located in the working directory
df = pd.read_csv('delivery_time.csv')

In [5]:
# Size of the loaded frame as a (rows, columns) tuple
df.shape

(21, 2)

In [6]:
# Preview the first five rows with the original column names
df.head()

Unnamed: 0,Delivery Time,Sorting Time
0,21.0,10
1,13.5,4
2,19.75,6
3,24.0,9
4,29.0,10


In [8]:
# Rename the columns to snake_case so they are valid identifiers and can be used
# directly in model formulas such as 'delivery_time ~ sorting_time'.
# The result is reassigned instead of using inplace=True: the original frame object
# is not mutated behind earlier cells, and re-running the cell is harmless because
# rename() ignores labels that are no longer present.
df = df.rename(columns={'Delivery Time': 'delivery_time', 'Sorting Time': 'sorting_time'})

There are a total of 21 rows and two columns in this dataset.

In [9]:
# Take a look at the dataset after renaming the columns
df.head()

Unnamed: 0,delivery_time,sorting_time
0,21.0,10
1,13.5,4
2,19.75,6
3,24.0,9
4,29.0,10


We have two columns, 'Delivery Time' and 'Sorting Time', which were renamed above to `delivery_time` and `sorting_time`.

In [10]:
# Check the column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   delivery_time  21 non-null     float64
 1   sorting_time   21 non-null     int64  
dtypes: float64(1), int64(1)
memory usage: 464.0 bytes


We can see that the dtype of the 'Delivery Time' column is float, whereas the dtype of the 'Sorting Time' column is int.

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for both numeric columns
df.describe()

Unnamed: 0,delivery_time,sorting_time
count,21.0,21.0
mean,16.790952,6.190476
std,5.074901,2.542028
min,8.0,2.0
25%,13.5,4.0
50%,17.83,6.0
75%,19.75,8.0
max,29.0,10.0


* We can see mean value is 16.79 for 'Delivery Time' and 6.19 for 'Sorting Time'
* Standard deviation for 'Delivery Time' and 'Sorting Time' are 5.07 and 2.54 respectively
* Max for 'Delivery Time' and 'Sorting Time' are 29 and 10 respectively


In [12]:
# Count the missing values in each column
df.isnull().sum()

delivery_time    0
sorting_time     0
dtype: int64

There are no null values present.

In [13]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

0

There are also no duplicate rows in this dataset.

In [1]:
import statsmodels.formula.api as sm

In [14]:
# Baseline model: ordinary least squares of the untransformed delivery_time on sorting_time
model1 = sm.ols(formula='delivery_time ~ sorting_time', data=df).fit()

In [15]:
# Build the regression summary table for the baseline model and display it
result = model1.summary()
result

0,1,2,3
Dep. Variable:,delivery_time,R-squared:,0.682
Model:,OLS,Adj. R-squared:,0.666
Method:,Least Squares,F-statistic:,40.8
Date:,"Thu, 01 Jun 2023",Prob (F-statistic):,3.98e-06
Time:,10:53:34,Log-Likelihood:,-51.357
No. Observations:,21,AIC:,106.7
Df Residuals:,19,BIC:,108.8
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.5827,1.722,3.823,0.001,2.979,10.186
sorting_time,1.6490,0.258,6.387,0.000,1.109,2.189

0,1,2,3
Omnibus:,3.649,Durbin-Watson:,1.248
Prob(Omnibus):,0.161,Jarque-Bera (JB):,2.086
Skew:,0.75,Prob(JB):,0.352
Kurtosis:,3.367,Cond. No.,18.3


### 1. Square root transformation

In [16]:
# Square-root transform of the response, stored as a new column.
# NOTE: despite the "sqr" suffix, this column holds the square ROOT of delivery_time.
df['del_time_sqr'] = df['delivery_time'].pipe(np.sqrt)

In [17]:
# Preview the first five rows to confirm the square-root column was added
df.head()

Unnamed: 0,delivery_time,sorting_time,del_time_sqr
0,21.0,10,4.582576
1,13.5,4,3.674235
2,19.75,6,4.444097
3,24.0,9,4.898979
4,29.0,10,5.385165


In [18]:
# Model 2: ordinary least squares of the square-root-transformed delivery time on sorting_time
model2 = sm.ols(formula='del_time_sqr ~ sorting_time', data=df).fit()

In [19]:
# Build the regression summary table for the square-root model and display it
result2 = model2.summary()
result2

0,1,2,3
Dep. Variable:,del_time_sqr,R-squared:,0.704
Model:,OLS,Adj. R-squared:,0.688
Method:,Least Squares,F-statistic:,45.2
Date:,"Thu, 01 Jun 2023",Prob (F-statistic):,2e-06
Time:,10:56:31,Log-Likelihood:,-6.6646
No. Observations:,21,AIC:,17.33
Df Residuals:,19,BIC:,19.42
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.7727,0.205,13.527,0.000,2.344,3.202
sorting_time,0.2066,0.031,6.723,0.000,0.142,0.271

0,1,2,3
Omnibus:,2.228,Durbin-Watson:,1.258
Prob(Omnibus):,0.328,Jarque-Bera (JB):,1.195
Skew:,0.58,Prob(JB):,0.55
Kurtosis:,3.142,Cond. No.,18.3


### 2. Log transformation

In [20]:
# Natural-log transform of the response, stored as a new column
df['del_time_log'] = df['delivery_time'].pipe(np.log)

In [21]:
# Preview the first five rows to confirm the log column was added
df.head()

Unnamed: 0,delivery_time,sorting_time,del_time_sqr,del_time_log
0,21.0,10,4.582576,3.044522
1,13.5,4,3.674235,2.60269
2,19.75,6,4.444097,2.983153
3,24.0,9,4.898979,3.178054
4,29.0,10,5.385165,3.367296


In [22]:
# Model 3: ordinary least squares of the log-transformed delivery time on sorting_time
model3 = sm.ols(formula='del_time_log ~ sorting_time', data=df).fit()

In [23]:
# Build the regression summary table for the log model and display it
result3 = model3.summary()
result3

0,1,2,3
Dep. Variable:,del_time_log,R-squared:,0.711
Model:,OLS,Adj. R-squared:,0.696
Method:,Least Squares,F-statistic:,46.73
Date:,"Thu, 01 Jun 2023",Prob (F-statistic):,1.59e-06
Time:,10:58:10,Log-Likelihood:,7.792
No. Observations:,21,AIC:,-11.58
Df Residuals:,19,BIC:,-9.495
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.1214,0.103,20.601,0.000,1.906,2.337
sorting_time,0.1056,0.015,6.836,0.000,0.073,0.138

0,1,2,3
Omnibus:,1.238,Durbin-Watson:,1.325
Prob(Omnibus):,0.538,Jarque-Bera (JB):,0.544
Skew:,0.393,Prob(JB):,0.762
Kurtosis:,3.067,Cond. No.,18.3


### 3. Cube root transformation

In [24]:
# Cube-root transform of the response (power 1/3), stored as a new column
df['delivery_time_croot'] = df['delivery_time'].pow(1/3)

In [25]:
# Preview the first five rows to confirm the cube-root column was added
df.head()

Unnamed: 0,delivery_time,sorting_time,del_time_sqr,del_time_log,delivery_time_croot
0,21.0,10,4.582576,3.044522,2.758924
1,13.5,4,3.674235,2.60269,2.381102
2,19.75,6,4.444097,2.983153,2.70306
3,24.0,9,4.898979,3.178054,2.884499
4,29.0,10,5.385165,3.367296,3.072317


In [26]:
# Model 4: ordinary least squares of the cube-root-transformed delivery time on sorting_time
model4 = sm.ols(formula='delivery_time_croot ~ sorting_time', data=df).fit()

In [27]:
# Build the regression summary table for the cube-root model
result4 = model4.summary()

In [28]:
# Display the summary table for the cube-root model
result4

0,1,2,3
Dep. Variable:,delivery_time_croot,R-squared:,0.708
Model:,OLS,Adj. R-squared:,0.693
Method:,Least Squares,F-statistic:,46.08
Date:,"Thu, 01 Jun 2023",Prob (F-statistic):,1.75e-06
Time:,11:00:24,Log-Likelihood:,11.651
No. Observations:,21,AIC:,-19.3
Df Residuals:,19,BIC:,-17.21
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.9951,0.086,23.283,0.000,1.816,2.174
sorting_time,0.0872,0.013,6.788,0.000,0.060,0.114

0,1,2,3
Omnibus:,1.873,Durbin-Watson:,1.274
Prob(Omnibus):,0.392,Jarque-Bera (JB):,0.966
Skew:,0.523,Prob(JB):,0.617
Kurtosis:,3.106,Cond. No.,18.3


### 4. Divide by max value

In [29]:
# Scale the response by its maximum so the largest value becomes 1.0.
# This is a linear rescale, so only the units of the response change.
df['delivery_time_div'] = df['delivery_time'].div(df['delivery_time'].max())

In [30]:
# Preview the first five rows to confirm the max-scaled column was added
df.head()

Unnamed: 0,delivery_time,sorting_time,del_time_sqr,del_time_log,delivery_time_croot,delivery_time_div
0,21.0,10,4.582576,3.044522,2.758924,0.724138
1,13.5,4,3.674235,2.60269,2.381102,0.465517
2,19.75,6,4.444097,2.983153,2.70306,0.681034
3,24.0,9,4.898979,3.178054,2.884499,0.827586
4,29.0,10,5.385165,3.367296,3.072317,1.0


In [31]:
# Model 5: ordinary least squares of the max-scaled delivery time on sorting_time
model5 = sm.ols(formula='delivery_time_div ~ sorting_time', data=df).fit()

In [32]:
# Build the regression summary table for the max-scaled model and display it
result5 = model5.summary()
result5

0,1,2,3
Dep. Variable:,delivery_time_div,R-squared:,0.682
Model:,OLS,Adj. R-squared:,0.666
Method:,Least Squares,F-statistic:,40.8
Date:,"Thu, 01 Jun 2023",Prob (F-statistic):,3.98e-06
Time:,11:04:07,Log-Likelihood:,19.356
No. Observations:,21,AIC:,-34.71
Df Residuals:,19,BIC:,-32.62
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.2270,0.059,3.823,0.001,0.103,0.351
sorting_time,0.0569,0.009,6.387,0.000,0.038,0.075

0,1,2,3
Omnibus:,3.649,Durbin-Watson:,1.248
Prob(Omnibus):,0.161,Jarque-Bera (JB):,2.086
Skew:,0.75,Prob(JB):,0.352
Kurtosis:,3.367,Cond. No.,18.3


### 5. 4th root transformation

In [33]:
# Fourth-root transform of the response (power 1/4), stored as a new column
df['delivery_time_fourth'] = df['delivery_time'].pow(1/4)

In [34]:
# Preview the first five rows to confirm the fourth-root column was added
df.head()

Unnamed: 0,delivery_time,sorting_time,del_time_sqr,del_time_log,delivery_time_croot,delivery_time_div,delivery_time_fourth
0,21.0,10,4.582576,3.044522,2.758924,0.724138,2.140695
1,13.5,4,3.674235,2.60269,2.381102,0.465517,1.916829
2,19.75,6,4.444097,2.983153,2.70306,0.681034,2.108103
3,24.0,9,4.898979,3.178054,2.884499,0.827586,2.213364
4,29.0,10,5.385165,3.367296,3.072317,1.0,2.320596


In [35]:
# Model 6: ordinary least squares of the fourth-root-transformed delivery time on sorting_time
model6 = sm.ols(formula='delivery_time_fourth ~ sorting_time', data=df).fit()

In [36]:
# Build the regression summary table for the fourth-root model and display it
result6 = model6.summary()
result6

0,1,2,3
Dep. Variable:,delivery_time_fourth,R-squared:,0.709
Model:,OLS,Adj. R-squared:,0.694
Method:,Least Squares,F-statistic:,46.38
Date:,"Thu, 01 Jun 2023",Prob (F-statistic):,1.68e-06
Time:,11:06:50,Log-Likelihood:,22.544
No. Observations:,21,AIC:,-41.09
Df Residuals:,19,BIC:,-39.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.6846,0.051,33.026,0.000,1.578,1.791
sorting_time,0.0521,0.008,6.810,0.000,0.036,0.068

0,1,2,3
Omnibus:,1.706,Durbin-Watson:,1.284
Prob(Omnibus):,0.426,Jarque-Bera (JB):,0.856
Skew:,0.492,Prob(JB):,0.652
Kurtosis:,3.093,Cond. No.,18.3


In [37]:
# Collect R-squared, adjusted R-squared and mse_total for every fitted model into one
# comparison table, one row per model (model1 = baseline, model2..model6 = transforms).
# NOTE: mse_total follows the scale of each (transformed) response rather than the
# quality of the fit -- for model1 it equals the variance of delivery_time (5.07 ** 2) --
# so it should not be used to rank models whose responses are on different scales.
metrics_df = pd.DataFrame(
    data=[
        [fit.rsquared, fit.rsquared_adj, fit.mse_total]
        for fit in (model1, model2, model3, model4, model5, model6)
    ],
    columns=['rsquared', 'rsquared_adj', 'mse_total'],
    index=[f'model{i}' for i in range(1, 7)],
)

In [38]:
# Display the model comparison table
metrics_df

Unnamed: 0,rsquared,rsquared_adj,mse_total
model1,0.682271,0.665549,25.754619
model2,0.70405,0.688474,0.391879
model3,0.710948,0.695735,0.101263
model4,0.70804,0.692674,0.069424
model5,0.682271,0.665549,0.030624
model6,0.709401,0.694107,0.024716


In [39]:
# Show the row(s) of the comparison table whose R-squared equals the maximum
metrics_df.loc[lambda table: table['rsquared'] == table['rsquared'].max()]

Unnamed: 0,rsquared,rsquared_adj,mse_total
model3,0.710948,0.695735,0.101263


## Conclusion:
* I tried several different transformations, and the highest R-squared (about 0.711) was found with the log transformation (model3); the cube root transformation (model4) came in slightly lower at about 0.708.