# Model Training 

In [2]:
import pandas as pd 

In [None]:
# Load the gemstone dataset from the project-relative data directory.
df = pd.read_csv(filepath_or_buffer='data/gemstone.csv')

In [5]:
# Drop the 'id' column before splitting features and target.
# errors='ignore' keeps this cell re-runnable: once 'id' has been removed, running
# the cell again is a no-op instead of raising KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [10]:
## Features (X) are every column except 'price'; the target (Y) is 'price'.

X = df.drop(columns=['price'])
# The list selector keeps Y as a one-column DataFrame rather than a Series.
Y = df.loc[:, ['price']]

In [11]:
# Display the target frame; for a long frame pandas renders only the first and last rows.
Y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [12]:
# Partition the feature columns by dtype: object-typed columns go to the ordinal
# encoder, everything else is treated as numeric and only imputed/scaled.
# NOTE(review): the OrdinalEncoder matches its category lists to these columns by
# position, so this relies on the object columns appearing in the same order as
# those lists -- confirm against the CSV header.
object_dtype = 'object'
categorical_cols = X.select_dtypes(include=object_dtype).columns
numerical_cols = X.select_dtypes(exclude=object_dtype).columns

In [13]:
# Explicit rank order for each ordinal feature. The position of a label in its list
# is the integer the OrdinalEncoder assigns to it (first entry -> 0).
# NOTE(review): cut and clarity appear to run lowest -> highest grade, while color
# D -> J may run in the opposite direction -- confirm the intended ordering.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = [
    'D', 'E', 'F', 'G', 'H', 'I', 'J',
]
clarity_categories = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]

In [17]:
from sklearn.impute import SimpleImputer  # to handle the missing values 
from sklearn.preprocessing import StandardScaler # Handle feature scaling 
from sklearn.preprocessing import OrdinalEncoder # Ordinal encoding 


#pipeline 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer 

In [19]:
# Numerical pipeline: fill missing values with the column median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])


# Categorical pipeline: fill missing values with the most frequent label, replace
# each label by its rank in the custom category lists, then standardise the ranks.
cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    (
        'ordinalencoder',
        OrdinalEncoder(
            # One category list per categorical column, matched by position.
            categories=[cut_categories, color_categories, clarity_categories]
        ),
    ),
    ('scaler', StandardScaler()),
])



# Route each column group through its own pipeline; the transformed numerical
# columns come first in the output, followed by the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [20]:
## Train/test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=30,
)


In [None]:
# Fit the preprocessor on the training features only and keep the result as a
# labelled DataFrame. (This cell previously replaced X_train with an empty
# DataFrame, which discarded the training split.)
# NOTE(review): this overwrites the raw X_train, so the cell is meant to run once
# per split; re-run the train/test split cell before running it again.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index,  # keep rows aligned with y_train
)