In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Data/gemstone.csv")
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
df=df.drop(labels=['id'],axis=1) # First we should remove this 
                                # i take 3 days to solve this issue
# because i Assing X & Y and than i remove id from df

In [4]:
## Independent(Features) and dependent features(Labals)
X = df.drop(labels=['price'],axis=1)
Y = df[['price']]

In [5]:
Y # Dependent Feature

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [6]:
# Define which columns should be ordinal-encoded and which should be Directly scaled
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [7]:
categorical_cols # OrdinalS

Index(['cut', 'color', 'clarity'], dtype='object')

In [8]:
# Define the custom ranking for each ordinal variable
cut_categories = ['Ideal','Very Good','Good','Premium','Fair']
color_categories = ['G','D','H','I','E','J','F']
clarity_categories =['SI1','VS2','VS1','SI2','VVS2','VVS1','IF','I1']

In [10]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values Replace missing values using a descriptive statistic (Stretgy = mean, median, or most frequent) 
from sklearn.preprocessing import StandardScaler # Handling Feature scaling 
# Standardized data is essential for accurate data analysis; it’s easier to draw clear conclusions about your current data when you have other data to measure it against
# Scaling the features in a machine learning model can improve the optimization process by making the flow of gradient descent smoother and helping algorithms reach the minimum of the cost function more quickly.
from sklearn.preprocessing import OrdinalEncoder # ordinal encoder
## pipelines Basicly means it will going to connect my previous things that we did

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer # for Grouping this together


In [11]:
# Numerical Pipeline
numerical_pipeline = Pipeline(
    steps=[("Imputing" ,SimpleImputer(strategy="median")),
           ("Standard",StandardScaler())
           ])

# Categorical Pipeline
categorcal_pipeline = Pipeline(
    steps=[
        ("imputer" , SimpleImputer(strategy="most_frequent")),
        ("Encoding" , OrdinalEncoder(categories=[cut_categories , color_categories,clarity_categories])),
        ("scaler" , StandardScaler())]
        )
# Now i need to combine these
preprocessor=ColumnTransformer(
[("numerical_pipeline",numerical_pipeline,numerical_cols),
("categorcal_pipeline",categorcal_pipeline,categorical_cols)
])

In [12]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.30,random_state=16)

In [13]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test= pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())


In [14]:
X_train.head()

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,numerical_pipeline__y,numerical_pipeline__z,categorcal_pipeline__cut,categorcal_pipeline__color,categorcal_pipeline__clarity
0,-0.177492,0.166759,-0.117064,-0.026229,0.032747,0.005099,-0.880066,-1.260472,-1.088795
1,-0.97538,1.275635,-0.117064,-1.151629,-1.19948,-1.080388,-0.10063,-0.790384,-1.088795
2,0.965428,0.166759,-0.117064,1.072162,1.029402,1.076112,-0.880066,-1.260472,-0.480136
3,-0.199057,0.351572,-1.16051,-0.080248,-0.057858,-0.038321,-0.880066,-0.320296,0.128523
4,-0.824428,0.074353,-0.117064,-0.881533,-0.918604,-0.877763,-0.880066,-1.260472,-0.480136


## Modal Training

In [15]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error 

In [16]:
Regression = LinearRegression()

In [17]:
Regression.fit(X_train , y_train)

In [18]:
Regression.coef_

array([[ 6459.68895369,  -220.24389884,  -121.63377037, -2014.30120433,
         -579.71258734,   -44.70236087,   -79.95046784,   -83.05928092,
           68.62926094]])

In [19]:
Regression.intercept_

array([3980.89119637])

In [20]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

## Training Multiple Modals

In [21]:
modals = {
    "LinearRegression":LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "ElasticNet" : ElasticNet()
}
modal_list = []
r2_list = []

for i in range(len(list(modals))):
    modal = list(modals.values())[i]
    modal.fit(X_train,y_train)

    # Make PRediction
    y_pred = modal.predict(X_test)

    mae,rmse,r2_square = evaluate_model(y_test,y_pred)
    print(list(modals.keys())[i])
    modal_list.append(list(modals.keys())[i])

    print("Modal Training performance")
    print("Mae",mae)
    print("RMSE",rmse)
    print("r2_score" ,(r2_square)*100)

    r2_list.append(r2_square)
    
    print("="*35)
    print("\n")

LinearRegression
Modal Training performance
Mae 718.2743187057695
RMSE 1213.8574919820646
r2_score 90.86423993216837


Lasso
Modal Training performance
Mae 720.1123938077642
RMSE 1214.1181917329957
r2_score 90.86031534275423


Ridge
Modal Training performance
Mae 718.3037354464498
RMSE 1213.8547546348223
r2_score 90.86428113588396


ElasticNet
Modal Training performance
Mae 1123.625571802003
RMSE 1627.5361611017897
r2_score 83.57631631131083




In [22]:
list(modals)

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']

In [21]:
print(X_train["Color"].unique())
print(X_train["cut"].unique())
print(X_train["clarity"].unique())

KeyError: 'color'

In [31]:
print(list(modals.keys())[1])

Lasso
