## Model Training

In [1]:
import pandas as pd

In [2]:
# Load the gemstone dataset from the working directory and preview it.
df = pd.read_csv('gemstone.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [6]:
# Drop the row-identifier column. errors='ignore' keeps this cell
# idempotent: re-running it after 'id' is already gone is a no-op instead
# of a KeyError.
df = df.drop(columns='id', errors='ignore')

In [7]:
# Preview the frame again to confirm the 'id' column was removed.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [9]:
# Split into independent features (X) and the dependent target (y = price).
X = df.drop(columns='price')
y = df['price']

In [12]:
# Inspect the feature frame (every column of df except 'price').
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81


In [17]:
# Partition the feature columns by dtype: object-typed columns are treated
# as categorical, every other column as numerical.
categorical_column = X.select_dtypes(include=['object']).columns
numerical_column = X.select_dtypes(exclude=['object']).columns

In [18]:
# Columns that are routed through the numerical pipeline.
numerical_column

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [19]:
# Columns that are routed through the categorical pipeline.
categorical_column

Index(['cut', 'color', 'clarity'], dtype='object')

In [54]:
# Ordered category levels handed to OrdinalEncoder: a level's position in
# its list is the integer code it is encoded to.
cut_map = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
clarity_map = [
    'I1',
    'SI2',
    'SI1',
    'VS2',
    'VS1',
    'VVS2',
    'VVS1',
    'IF',
]
color_map = [
    'D',
    'E',
    'F',
    'G',
    'H',
    'I',
    'J',
]

In [44]:
# List the distinct colour grades that occur in the data.
df.color.unique()

array(['F', 'J', 'G', 'E', 'D', 'H', 'I'], dtype=object)

In [55]:
##preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

In [56]:
##  pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [58]:
# Numerical pipeline: fill missing values with the column median, then
# standardise.
numpipeline = Pipeline([
    ('SimpleImputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline: fill missing values with the most frequent level,
# encode the levels as ordered integers, then standardise the codes.
# The `categories` lists are positional, so they must stay in the same
# order as the columns this pipeline receives (cut, color, clarity).
catepipeline = Pipeline([
    ('SimpleImputer', SimpleImputer(strategy='most_frequent')),
    ('OrdinalEncoder', OrdinalEncoder(categories=[cut_map, color_map, clarity_map])),
    ('scaler', StandardScaler()),
])

In [59]:
# Column transformer: send each group of columns through its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('numpipeline', numpipeline, numerical_column),
        ('catepipeline', catepipeline, categorical_column),
    ]
)

In [36]:
# Re-display the categorical column names; their order must line up with
# the category lists given to the OrdinalEncoder.
categorical_column

Index(['cut', 'color', 'clarity'], dtype='object')

In [30]:
#train test split
from sklearn.model_selection import train_test_split

In [111]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Labelled DataFrame view of the preprocessed splits.
# - The preprocessor is fitted on the training split only; the test split
#   is transformed with those same fitted statistics (re-fitting on the
#   test split would leak test information and scale it inconsistently).
# - Both frames use the transformer's own output feature names as columns.
# - Results are stored under separate names so the raw X_train / X_test
#   frames stay intact for the array-based preprocessing step.
# - The original row index is kept so rows stay aligned with y_train / y_test.
X_train_df = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index,
)
X_test_df = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
    index=X_test.index,
)

In [112]:
# Fit the preprocessor on the training split, then apply that same fitted
# preprocessor to the test split (no re-fitting on test data).
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [113]:
# Preprocessed training features (numerical columns first, then the
# encoded categorical columns).
X_train

array([[-0.82314374, -1.12998781, -0.64189666, ...,  0.87410007,
        -0.93674681,  1.35074594],
       [ 0.94502267, -1.77782269,  0.92190185, ..., -1.13764403,
         0.91085333,  0.68445511],
       [ 1.9584839 ,  0.16568195,  0.40063568, ..., -0.13177198,
         0.91085333,  0.01816428],
       ...,
       [ 0.92345966,  0.90606467,  0.40063568, ..., -0.13177198,
         0.29498662,  0.01816428],
       [-1.03877378, -0.66724861, -0.64189666, ..., -1.13764403,
         0.29498662,  2.01703677],
       [-1.03877378, -0.01941373,  0.92190185, ..., -1.13764403,
         0.29498662, -1.31441737]])

In [114]:
# Preprocessed test features, produced by the preprocessor fitted on the
# training split.
X_test

array([[-0.62907669,  0.25822979, -0.12063049, ...,  0.87410007,
        -1.55261352, -0.64812655],
       [ 2.60537405, -2.14801405, -0.12063049, ..., -1.13764403,
         0.29498662, -1.31441737],
       [-1.1250258 , -1.22253565,  0.92190185, ..., -0.13177198,
        -0.93674681,  2.01703677],
       ...,
       [-0.82314374, -0.01941373, -0.64189666, ...,  0.87410007,
         0.29498662,  2.01703677],
       [ 0.90189666, -0.66724861,  1.44316802, ..., -0.13177198,
         1.52672004, -0.64812655],
       [ 0.47063656,  0.90606467, -0.64189666, ..., -1.13764403,
        -0.93674681,  0.01816428]])

In [115]:
## model training
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [116]:
# Baseline: ordinary least-squares regression with default settings.
model = LinearRegression()

In [117]:
# Fit the baseline on the preprocessed training split.
model.fit(X_train, y_train)

LinearRegression()

In [118]:
# Fitted coefficients, one per preprocessed feature column.
model.coef_

array([ 6432.97591819,  -132.34206204,   -70.48787525, -1701.38593925,
        -494.17005097,   -76.32351645,    68.80035873,  -464.67990411,
         652.10059539])

In [127]:
# Candidate regressors to compare, keyed by display name, all with default
# hyper-parameters.
# NOTE: this rebinds `model`, which previously held the fitted baseline
# LinearRegression.
model = dict(
    Linear=LinearRegression(),
    Ridge=Ridge(),
    Lasso=Lasso(),
    ElasticNet=ElasticNet(),
)

In [128]:
def evaulatemetrics(X_train, X_test, y_train, y_test, models):
    """Fit each model on the training split and score it on the test split.

    Parameters
    ----------
    X_train, X_test
        Feature matrices passed to ``fit`` and ``predict`` respectively.
    y_train, y_test
        Targets used for fitting and for scoring the predictions.
    models : dict
        Mapping of display name -> estimator exposing ``fit`` and
        ``predict``. Each estimator is fitted in place.

    Returns
    -------
    dict
        ``{name: {'Mae': ..., 'mse': ..., 'r2score': ...}}`` with one entry
        per model, in the iteration order of ``models``.
    """
    result = {}
    # Iterate name/estimator pairs directly instead of rebuilding
    # list(models.keys()) / list(models.values()) on every pass.
    for name, estimator in models.items():
        estimator.fit(X_train, y_train)
        y_predict = estimator.predict(X_test)

        result[name] = {
            'Mae': mean_absolute_error(y_test, y_predict),
            'mse': mean_squared_error(y_test, y_predict),
            'r2score': r2_score(y_test, y_predict),
        }
    return result

In [129]:
# Fit and score every candidate regressor held in `model`.
evaulatemetrics(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    models=model,
)

{'Linear': {'Mae': 675.0758270067445,
  'mse': 1029473.3531156889,
  'r2score': 0.9362906819996045},
 'Ridge': {'Mae': 675.1077629781488,
  'mse': 1029482.8101268989,
  'r2score': 0.936290096749163},
 'Lasso': {'Mae': 676.2421173665508,
  'mse': 1029533.1506505491,
  'r2score': 0.9362869814082755},
 'ElasticNet': {'Mae': 1060.9432977143008,
  'mse': 2351174.871397875,
  'r2score': 0.8544967219374031}}