In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("https://raw.githubusercontent.com/krishnaik06/FSDSRegression/main/notebooks/data/gemstone.csv")

In [3]:
df.drop(labels=['id'],axis=1,inplace=True)

In [4]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [5]:
### seggregating into dependent and independent features

X=df.drop(labels=['price'],axis=1)
y=df[['price']]

In [6]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77


In [7]:
### dividing into categorical and numerical features

categorical_cols=df.columns[df.dtypes=='object']
numerical_cols=df.select_dtypes(exclude='object').columns
numerical_cols=numerical_cols.drop(labels='price')
type(numerical_cols)

pandas.core.indexes.base.Index

In [8]:
### defining the ranking for each categorical variable

cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [9]:
### importing liabraries

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

### importing pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [10]:
### Numerical pipeline

num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaling',StandardScaler())
    ]
)


### Categorical pipeline

cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoding',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
    ('scaling',StandardScaler())
    ]
)


### combining both numerical and categorical pipeline

preprocessor=ColumnTransformer(
    [
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)
        ]
)

In [11]:
### spliting data into train and test

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=32)

In [12]:
X_train_trans=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())

In [13]:
X_test_trans=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [14]:
X_test_trans

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-1.083417,-0.389289,0.920535,-1.313892,-1.308482,-1.326941,-1.133654,-1.551285,0.685799
1,-0.196857,-1.867398,1.441566,0.030206,0.107685,-0.079861,-1.133654,1.529933,-1.313909
2,-1.040171,-0.851198,-0.121527,-1.196622,-1.190468,-1.239935,0.874927,-1.551285,0.685799
3,-0.153610,-0.574052,-0.121527,0.057268,0.080451,0.007145,0.874927,0.913689,1.352368
4,0.495093,0.719293,0.399504,0.643620,0.597897,0.688689,-0.129364,-1.551285,-1.313909
...,...,...,...,...,...,...,...,...,...
58067,-0.996924,0.165002,1.441566,-1.169560,-1.145078,-1.123928,-1.133654,0.297446,0.019230
58068,0.451847,1.181202,-1.684620,0.580474,0.616053,0.717691,-2.137944,-0.935041,-0.647340
58069,1.554642,-0.574052,3.004659,1.455491,1.496618,1.399234,-1.133654,1.529933,0.019230
58070,0.603211,-1.497871,0.399504,0.842077,0.761301,0.659687,-0.129364,-0.935041,-0.647340


In [15]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [16]:
regression=LinearRegression()
regression.fit(X_train_trans,y_train)

In [17]:
y_pred=regression.predict(X_test_trans)

In [18]:
r2_score(y_test,y_pred)

0.9371701066078352

In [19]:
import numpy as np

In [20]:
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mse)
    r_square=r2_score(true,predicted)
    return mae,rmse,r_square

In [23]:
## Training multiple models

models={
    'linearregression':LinearRegression(),
    'ridge':Ridge(),
    'lasso':Lasso(),
    'elasticnet':ElasticNet()
}

In [22]:
trained_model=list()

for i in range(len(models)):
    model=list(models.values())[i]
    model.fit(X_train_trans,y_train)

    trained_model.append(list(models.keys())[i])

    y_pred=model.predict(X_test_trans)

    mae,rmse,r_square=evaluate_model(y_test,y_pred)

    print('Model performence')
    print(f'Model:{model}')
    print(f'MAE:{mae}')
    print(f'RMSE:{rmse}')
    print(f'R SQUARE:{r_square * 100}')

    print('==='*20)
    

Model performence
Model:LinearRegression()
MAE:677.202109489184
RMSE:1010.6786151615756
R SQUARE:93.71701066078352
Model performence
Model:Ridge()
MAE:677.2308920416344
RMSE:1010.6730171043645
R SQUARE:93.71708026240731
Model performence
Model:Lasso()
MAE:678.4035104917499
RMSE:1010.9051605753813
R SQUARE:93.71419365857274
Model performence
Model:ElasticNet()
MAE:1058.8702566849374
RMSE:1526.0636865619363
R SQUARE:85.67530162691462


In [23]:
trained_model

['linearregression', 'ridge', 'lasso', 'elasticnet']

In [20]:
def evaluate_model(X_train,y_train,X_test,y_test,models:dict):
    
    report={}

    for i in range(len(models)):
        # creating model object
        model=list(models.values())[i]
        # training the model
        model.fit(X_train,y_train)
        # predicting 
        y_pred=model.predict(X_test)

        # evaluating the model
        R_square=r2_score(y_test,y_pred)

        report[list(models.keys())[i]]=R_square

        return report

In [24]:
evaluate_model(X_train_trans,y_train,X_test_trans,y_test,models)

{'linearregression': 0.9371701066078352}

In [3]:
dic={'a':[10],'b':[20],'c':[30]}

In [1]:
import pandas as pd
# pd.DataFrame(dic)

In [1]:
pwd()

'e:\\i neuron\\gemstonepriceprediction\\Notebooks'

In [4]:
pd.read_csv('e:\\i neuron\\gemstonepriceprediction\\Artifact\\DataIngestion\\test.csv')

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,14868,0.50,Ideal,D,SI1,62.1,57.0,5.05,5.08,3.14,1355
1,165613,2.00,Very Good,G,SI2,59.5,57.0,8.08,8.15,4.89,14691
2,96727,0.27,Premium,E,VVS1,60.5,59.0,4.19,4.16,2.52,844
3,145593,0.32,Premium,I,VVS1,61.2,59.0,4.43,4.44,2.71,707
4,118689,1.19,Ideal,H,SI1,62.5,56.0,6.77,6.81,4.23,5797
...,...,...,...,...,...,...,...,...,...,...,...
58067,39151,0.91,Very Good,I,SI2,62.4,59.0,6.18,6.13,3.83,2974
58068,32423,0.51,Ideal,D,VS2,62.4,56.0,5.13,5.11,3.19,1875
58069,17876,0.41,Ideal,G,VVS1,61.8,56.0,4.79,4.77,2.95,967
58070,72938,1.21,Premium,I,SI1,61.1,60.0,6.88,6.79,4.18,5656


In [2]:
import os
os.chdir('e:\\i neuron\\gemstonepriceprediction')
os.getcwd()

'e:\\i neuron\\gemstonepriceprediction'

In [3]:
from src.components.data_transformation import DataTransformation
from src.components.data_ingestion import DataIngestionConfig

In [4]:
ingestionconfig=DataIngestionConfig()
train_path=ingestionconfig.train_data_path
test_path=ingestionconfig.test_data_path

In [6]:
obj=DataTransformation()
X_train_arr,y_train,X_test_arr,y_test,_=obj.Initiate_data_transformation(train_path,test_path)

In [8]:
X_train_arr.shape

(135501, 9)

In [13]:
y_test

0         1355
1        14691
2          844
3          707
4         5797
         ...  
58067     2974
58068     1875
58069      967
58070     5656
58071     6759
Name: price, Length: 58072, dtype: int64

In [15]:
X_test_arr.shape

(58072, 9)