<h1> Data Preprocessing

In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer # for missing value removal
from sklearn.preprocessing import StandardScaler #for numerical data
from sklearn.preprocessing import OrdinalEncoder #for categorical data
from sklearn.pipeline import Pipeline #for creating pipeline
from sklearn.compose import ColumnTransformer #for composing the piplines
from sklearn.model_selection import train_test_split


In [3]:
data = pd.read_csv(r"D:/Projects/MLOps/gitClones/firstproject/datasets/Diamonds Prices2022.csv")

In [4]:
data

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53938,53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74
53939,53940,0.75,Ideal,D,SI2,62.2,55.0,2757,5.83,5.87,3.64
53940,53941,0.71,Premium,E,SI1,60.5,55.0,2756,5.79,5.74,3.49
53941,53942,0.71,Premium,F,SI1,59.8,62.0,2756,5.74,5.73,3.43


In [5]:
data.columns

Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',
       'price', 'x', 'y', 'z'],
      dtype='object')

In [6]:
data_origin = data.copy()

In [7]:
data.shape

(53943, 11)

In [8]:
class data_preprocessing:

     def columnSegregation(self, data):
          # numericalColumns = data.columns[data.dtypes != 'object'] # to access this, execute this command data[numericalColumns]
          # categoricalColumns = data.columns[data.dtypes == 'object']
          numericalColumns = data.select_dtypes(exclude="object") #to access this, execute this command numericalColumns
          categoricalColumns = data.select_dtypes(include="object")
          return numericalColumns, categoricalColumns
     

     def removeUnnamedColumns(self, data):

          Unnamed_cols = data.iloc[:, data.columns.str.contains("^Unnamed")]
          data.drop(Unnamed_cols, axis=1, inplace=True)
          
     
     def removeDuplicates(self, data):
          dups_cal = data.duplicated().sum()
          if dups_cal != 0:
               print("-"*40)
               print(f"{dups_cal} duplicate(s) present!!!")
               
               print("-"*40)
               print("Proceeding to remove the duplicate(s)")
               print("-"*40)
               
               a = len(data_origin)
               print("No. of rows before cleaning: ", a)
               
               print("-"*40)
               data.drop_duplicates(inplace=True)
               print("Verifying if all Duplicates are deleted")
               print("-"*40)

               b = len(data)
               
               print("No. of rows after cleaning: ", b)
               
               if a-b == dups_cal:
                    print(f"{a-b} Duplicate(s) are deleted")
                    print(f"The dimension of the data is {data.shape}")
               else:
                    print(f"Error!!! {a-b} Duplicate(s) remain")
          else:
               print("No Duplicates in the Data")
               

In [9]:
data_prepro = data_preprocessing()
data_prepro.removeUnnamedColumns(data)
data_prepro.removeDuplicates(data)

----------------------------------------
149 duplicate(s) present!!!
----------------------------------------
Proceeding to remove the duplicate(s)
----------------------------------------
No. of rows before cleaning:  53943
----------------------------------------
Verifying if all Duplicates are deleted
----------------------------------------
No. of rows after cleaning:  53794
149 Duplicate(s) are deleted
The dimension of the data is (53794, 10)


In [10]:
x = data.drop(labels="price", axis=1)
y = data["price"]

In [11]:
x

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74


In [12]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53794 entries, 0 to 53939
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53794 non-null  float64
 1   cut      53794 non-null  object 
 2   color    53794 non-null  object 
 3   clarity  53794 non-null  object 
 4   depth    53794 non-null  float64
 5   table    53794 non-null  float64
 6   x        53794 non-null  float64
 7   y        53794 non-null  float64
 8   z        53794 non-null  float64
dtypes: float64(6), object(3)
memory usage: 4.1+ MB


In [13]:
y

0         326
1         326
2         327
3         334
4         335
         ... 
53935    2757
53936    2757
53937    2757
53938    2757
53939    2757
Name: price, Length: 53794, dtype: int64

In [14]:
# segregating the data based on dtypes
data_prepro = data_preprocessing()
num_col, cat_col = data_prepro.columnSegregation(x)

In [15]:
num_col

Unnamed: 0,carat,depth,table,x,y,z
0,0.23,61.5,55.0,3.95,3.98,2.43
1,0.21,59.8,61.0,3.89,3.84,2.31
2,0.23,56.9,65.0,4.05,4.07,2.31
3,0.29,62.4,58.0,4.20,4.23,2.63
4,0.31,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...
53935,0.72,60.8,57.0,5.75,5.76,3.50
53936,0.72,63.1,55.0,5.69,5.75,3.61
53937,0.70,62.8,60.0,5.66,5.68,3.56
53938,0.86,61.0,58.0,6.15,6.12,3.74


In [16]:
num_col.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53794 entries, 0 to 53939
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   carat   53794 non-null  float64
 1   depth   53794 non-null  float64
 2   table   53794 non-null  float64
 3   x       53794 non-null  float64
 4   y       53794 non-null  float64
 5   z       53794 non-null  float64
dtypes: float64(6)
memory usage: 2.9 MB


In [17]:
cat_col

Unnamed: 0,cut,color,clarity
0,Ideal,E,SI2
1,Premium,E,SI1
2,Good,E,VS1
3,Premium,I,VS2
4,Good,J,SI2
...,...,...,...
53935,Ideal,D,SI1
53936,Good,D,SI1
53937,Very Good,D,SI1
53938,Premium,H,SI2


In [18]:
cat_col.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53794 entries, 0 to 53939
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   cut      53794 non-null  object
 1   color    53794 non-null  object
 2   clarity  53794 non-null  object
dtypes: object(3)
memory usage: 1.6+ MB


In [19]:
cat_col["cut"].value_counts()
print("-"*40)
cat_col["color"].value_counts()
print("-"*40)
cat_col["clarity"].value_counts()

----------------------------------------
----------------------------------------


clarity
SI1     13032
VS2     12229
SI2      9150
VS1      8156
VVS2     5056
VVS1     3647
IF       1784
I1        740
Name: count, dtype: int64

In [20]:
cut_map = [
     "Ideal",
     "Premium",
     "Very Good",
     "Good",
     "Fair",
]

color_map = [
     "D",
     "E",
     "F",
     "G",
     "H",
     "I",
     "J",
]

clarity_map = [
     "IF",
     "VVS1",
     "VVS2",
     "VS1",
     "VS2",
     "SI1",
     "SI2",
     "I1",
]


In [21]:
num_pipeline = Pipeline(
     steps=[
          ("imputer", SimpleImputer(),),
          ("scaler", StandardScaler()),
     ]
)

In [22]:
cat_pipeline = Pipeline(
     steps=[
          ("imputer", SimpleImputer(strategy="most_frequent")),
          ("ordinalencoder", OrdinalEncoder(categories=[cut_map, color_map, clarity_map]))
     ]
)

In [23]:
num_col.columns

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [24]:
cat_col.columns

Index(['cut', 'color', 'clarity'], dtype='object')

In [25]:
preprocessor = ColumnTransformer(
     [
          ("num_pipeline", num_pipeline, num_col.columns),
          ("cat_pipeline", cat_pipeline, cat_col.columns),
     ]     
)

In [26]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)

In [27]:
x_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
45307,0.3,Ideal,D,SI1,61.7,57.0,4.28,4.31,2.65
15892,1.01,Ideal,G,VS2,61.6,56.0,6.46,6.43,3.97
22182,1.01,Premium,D,VVS2,62.4,60.0,6.31,6.36,3.95
48310,0.25,Ideal,F,VS2,61.6,55.0,4.06,4.08,2.51
32731,0.31,Premium,F,VS2,62.3,59.0,4.34,4.29,2.69


In [28]:
preprocessor.fit_transform(x_train)

array([[-1.04897737, -0.03672311, -0.20489093, ...,  0.        ,
         0.        ,  5.        ],
       [ 0.4399331 , -0.10665728, -0.65256226, ...,  0.        ,
         3.        ,  4.        ],
       [ 0.4399331 ,  0.45281615,  1.13812309, ...,  1.        ,
         0.        ,  2.        ],
       ...,
       [-1.00703623,  0.03321107, -1.14500074, ...,  0.        ,
         6.        ,  5.        ],
       [ 0.20925683,  0.73255287,  0.69045175, ...,  1.        ,
         6.        ,  5.        ],
       [ 0.4399331 ,  0.66261869,  0.24278041, ...,  1.        ,
         3.        ,  3.        ]])

In [29]:
preprocessor.transform(x_test)

array([[-0.189184  ,  2.20117064, -1.54790494, ...,  4.        ,
         6.        ,  4.        ],
       [ 0.20925683, -0.52626236,  0.69045175, ...,  2.        ,
         1.        ,  6.        ],
       [-0.48277198,  0.24301361, -0.65256226, ...,  0.        ,
         3.        ,  3.        ],
       ...,
       [ 0.06246284, -0.03672311, -0.20489093, ...,  0.        ,
         4.        ,  1.        ],
       [-0.189184  ,  0.66261869,  1.13812309, ...,  1.        ,
         1.        ,  1.        ],
       [ 0.56575652,  0.45281615, -0.20489093, ...,  1.        ,
         0.        ,  4.        ]])

In [30]:
x_train = pd.DataFrame(preprocessor.fit_transform(x_train), columns=preprocessor.get_feature_names_out())
x_test = pd.DataFrame(preprocessor.transform(x_test), columns=preprocessor.get_feature_names_out())

In [31]:
x_train

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-1.048977,-0.036723,-0.204891,-1.291914,-1.268254,-1.277513,0.0,0.0,5.0
1,0.439933,-0.106657,-0.652562,0.643521,0.614842,0.615184,0.0,3.0,4.0
2,0.439933,0.452816,1.138123,0.510349,0.552664,0.586507,1.0,0.0,2.0
3,-1.153830,-0.106657,-1.100234,-1.487233,-1.472552,-1.478254,0.0,2.0,4.0
4,-1.028007,0.382882,0.690452,-1.238645,-1.286019,-1.220159,1.0,2.0,4.0
...,...,...,...,...,...,...,...,...,...
37650,0.418963,-0.945867,0.242780,0.687912,0.641490,0.543491,1.0,4.0,4.0
37651,-0.356949,-0.036723,-1.547905,-0.191024,-0.157938,-0.173440,0.0,3.0,5.0
37652,-1.007036,0.033211,-1.145001,-1.194254,-1.170546,-1.177143,0.0,6.0,5.0
37653,0.209257,0.732553,0.690452,0.350542,0.259541,0.400105,1.0,6.0,5.0


In [32]:
x_test

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.189184,2.201171,-1.547905,-0.093365,-0.184586,0.113333,4.0,6.0,4.0
1,0.209257,-0.526262,0.690452,0.359420,0.392779,0.314073,2.0,1.0,6.0
2,-0.482772,0.243014,-0.652562,-0.386344,-0.362236,-0.345503,0.0,3.0,3.0
3,-0.608595,0.522750,1.138123,-0.581663,-0.566534,-0.517566,1.0,1.0,4.0
4,0.817403,0.382882,-1.100234,0.945378,0.961261,1.002327,0.0,5.0,5.0
...,...,...,...,...,...,...,...,...,...
16134,0.880315,0.173079,0.242780,1.025281,0.996791,1.031004,1.0,2.0,4.0
16135,-0.210155,-0.316460,1.585794,-0.066730,-0.033583,-0.087408,2.0,1.0,5.0
16136,0.062463,-0.036723,-0.204891,0.252883,0.295071,0.271058,0.0,4.0,1.0
16137,-0.189184,0.662619,1.138123,-0.004583,-0.104643,0.027301,1.0,1.0,1.0


<H2>Model Training

In [33]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [34]:
models = {
     "RandomForest": RandomForestRegressor(),
     "LinearReg": LinearRegression(),
     "Lasso": Lasso(),
     "Elastic": ElasticNet(),
     "Ridge": Ridge(),
     "xgb": XGBRegressor()
}

In [35]:
models

{'RandomForest': RandomForestRegressor(),
 'LinearReg': LinearRegression(),
 'Lasso': Lasso(),
 'Elastic': ElasticNet(),
 'Ridge': Ridge(),
 'xgb': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)}

In [36]:
r2_list = []
mae_list = []
mse_list = []
used_models = []

In [37]:
def evaluate_data(true, pred):
     r2 = r2_score(true, pred)
     mae = mean_absolute_error(true, pred)
     mse = mean_squared_error(true, pred)

     return r2, mae, mse

In [38]:
max_matrics = {
     "r2":{"model":"", "value":float('-inf')},

     "mae":{"model":"", "value":float('inf')},

     "mse":{"model":"", "value":float('inf')}
    
}


for i in range(len(list(models))):
     model_name = list(models.keys())[i]
     model = (list(models.values()))[i]
     
     model.fit(x_train, y_train)

     y_pred = model.predict(x_test)

     r2, mae, mse = evaluate_data(y_test, y_pred)

     r2_list.append(r2)
     mae_list.append(mae)
     mse_list.append(mse)


     print("="*40)
     print(f"Model in use --> {(list(models.keys()))[i]}")
     print("="*40)
     print(f"R2_SCORE: {r2}")
     print(f"MAE: {mae}")
     print(f"MSE: {mse}")
     print("="*40)

     if r2 > max_matrics['r2']['value']:
          max_matrics['r2']['model'] = model_name          
          max_matrics['r2']['value'] = r2

     if mae < max_matrics['mae']['value']:
          max_matrics['mae']['model'] = model_name
          max_matrics['mae']['value'] = mae

     if mse < max_matrics['mse']['value']:
          max_matrics['mse']['model'] = model_name
          max_matrics['mse']['value'] = mse

print(f"Max R2_Score: {max_matrics['r2']['value']} --> {max_matrics['r2']['model']}")
print(f"Min MAE: {max_matrics['mae']['value']} --> {max_matrics['mae']['model']}")
print(f"Min MSE: {max_matrics['mse']['value']} --> {max_matrics['mse']['model']}")

Model in use --> RandomForest
R2_SCORE: 0.9820960716595838
MAE: 263.0390399830048
MSE: 274956.8069951073
Model in use --> LinearReg
R2_SCORE: 0.9060114236219523
MAE: 798.4747772281572
MSE: 1443415.0072298078
Model in use --> Lasso
R2_SCORE: 0.9062103701691381
MAE: 799.4574443676755
MSE: 1440359.7164390483
Model in use --> Elastic
R2_SCORE: 0.8346093117325453
MAE: 1059.4112584512643
MSE: 2539961.8836770575
Model in use --> Ridge
R2_SCORE: 0.9060162648764681
MAE: 798.5760480538071
MSE: 1443340.6584132675
Model in use --> xgb
R2_SCORE: 0.981963038444519
MAE: 269.6444007972336
MSE: 276999.8285109144
Max R2_Score: 0.9820960716595838 --> RandomForest
Min MAE: 263.0390399830048 --> RandomForest
Min MSE: 274956.8069951073 --> RandomForest
