# Linear Regression - Diamond Price Predictor 

#### Linear Regression and Testing Assumptions

In [46]:
# Loading libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import linear_model

# import LinearRegression
from sklearn.linear_model import LinearRegression

# import metrics
from sklearn.metrics import mean_squared_error, r2_score

# import train_test_split
from sklearn.model_selection import train_test_split

# import statsmodels
import statsmodels.api as sm

In [47]:
# Read the diamonds dataset from disk
df = pd.read_csv(
    'diamonds.csv',
    index_col=0,  # the first CSV column carries the row labels, so use it as the index
)

In [48]:
# Preview the first rows to sanity-check the column names and parsed values
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [49]:
# Dataset dimensions as (number of rows, number of columns)
df.shape

(53940, 10)

In [50]:
# Per-column dtype and non-null count, plus the frame's memory footprint
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53940 entries, 1 to 53940
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.5+ MB


**Inference**
* The data set has object (categorical), float, and integer variables
* No missing values

In [51]:
# Summary statistics for the numeric columns
# NOTE(review): the minimum of x, y and z is 0.0 in the output below, which looks like
# missing/invalid measurements rather than real dimensions - confirm before modelling
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [52]:
# Correlation of each numeric column with price, sorted in ascending order.
# The numeric columns are selected explicitly because cut/color/clarity hold strings
# and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older versions
# silently dropped them, which is what the output below reflects).
df.select_dtypes(include='number').corr()['price'].sort_values()

depth   -0.010647
table    0.127134
z        0.861249
y        0.865421
x        0.884435
carat    0.921591
price    1.000000
Name: price, dtype: float64

In [53]:
# Count the missing entries in every column
df.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [54]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted)
df.duplicated(keep='first').sum()

146

In [55]:
# Display the repeated rows themselves (every occurrence after the first)
# NOTE(review): these rows are only inspected here; none of the cells visible in this
# notebook section drops them before modelling - confirm that is intentional
df.loc[df.duplicated(keep='first')]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1006,0.79,Ideal,G,SI1,62.3,57.0,2898,5.90,5.85,3.66
1007,0.79,Ideal,G,SI1,62.3,57.0,2898,5.90,5.85,3.66
1008,0.79,Ideal,G,SI1,62.3,57.0,2898,5.90,5.85,3.66
1009,0.79,Ideal,G,SI1,62.3,57.0,2898,5.90,5.85,3.66
2026,1.52,Good,E,I1,57.3,58.0,3105,7.53,7.42,4.28
...,...,...,...,...,...,...,...,...,...,...
47970,0.52,Ideal,D,VS2,61.8,55.0,1919,5.19,5.16,3.20
49327,0.51,Ideal,F,VVS2,61.2,56.0,2093,5.17,5.19,3.17
49558,0.71,Good,F,SI2,64.1,60.0,2130,0.00,0.00,0.00
50080,0.51,Ideal,F,VVS2,61.2,56.0,2203,5.19,5.17,3.17


# Splitting Our Dataset

In [56]:
# Separate the target (price) from the predictors (every other column)
y = df['price']
X = df.drop(columns=['price'])

In [57]:
# One-hot encode the string columns (cut, color, clarity); drop_first=True removes one
# level per feature so the dummies are not perfectly collinear with the intercept.
# dtype=int pins the dummies to 0/1 integers: pandas >= 2.0 defaults to bool, and a
# mixed float/bool frame is converted to an object array that statsmodels' OLS rejects.
X = pd.get_dummies(X, drop_first=True, dtype=int)
X.head()

Unnamed: 0,carat,depth,table,x,y,z,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
1,0.23,61.5,55.0,3.95,3.98,2.43,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0.21,59.8,61.0,3.89,3.84,2.31,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
3,0.23,56.9,65.0,4.05,4.07,2.31,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0.29,62.4,58.0,4.2,4.23,2.63,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
5,0.31,63.3,58.0,4.34,4.35,2.75,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [58]:
# Put an intercept column ('const') in front of the predictors; statsmodels' OLS does
# not add one on its own. has_constant='skip' leaves X untouched if a constant column
# is already present, so re-running this cell is harmless.
X = sm.add_constant(X, prepend=True, has_constant='skip')

In [59]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

# Build the scikit-learn linear regression and fit it on the training portion
model = LinearRegression().fit(X_train, y_train)

# Predicted prices for the held-out rows
y_pred = model.predict(X_test)

# NOTE: for a regressor, .score() is the coefficient of determination (R^2), not a
# classification accuracy - it is the same quantity as r2_score below
print('Accuracy of the model is: ', model.score(X_test, y_test))

# R-squared computed explicitly from the test-set predictions
r2 = r2_score(y_test, y_pred)
print(f"R-squared score: {r2:.2f}")

Accuracy of the model is:  0.9210442589001284
R-squared score: 0.92


In [60]:
# Fit the same training data with statsmodels OLS to obtain the full regression summary.
# NOTE: this rebinds `model` from the scikit-learn estimator fitted earlier to a
# statsmodels results object.
model = (
    sm.OLS(endog=y_train, exog=X_train)
    .fit()
)
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.919
Model:                            OLS   Adj. R-squared:                  0.919
Method:                 Least Squares   F-statistic:                 2.141e+04
Date:                Tue, 25 Apr 2023   Prob (F-statistic):               0.00
Time:                        17:21:34   Log-Likelihood:            -3.6480e+05
No. Observations:               43152   AIC:                         7.297e+05
Df Residuals:                   43128   BIC:                         7.299e+05
Df Model:                          23                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const          1796.5083    507.712      3.538

# Checking Linear Regression Assumptions
We will be checking the following Linear Regression assumptions:

* No Multicollinearity

* Linearity of variables

* Independence of error terms

* Normality of error terms

* No Heteroscedasticity