## Model Training 

In [None]:
import pandas as pd

In [None]:
# Load the raw dataset. A forward-slash relative path resolves on Windows,
# macOS and Linux alike (a backslash separator is Windows-only).
df = pd.read_csv("data/data.csv")
df.head()

### Drop unnecessary Columns

In [None]:
# Remove the "id" column. Rebinding (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to re-run after the column is gone.
df = df.drop(columns="id", errors="ignore")

### Separate Dependent and Independent Features

In [None]:
# Target: the "price" column. Features: every remaining column.
Y = df["price"]
X = df.drop(columns="price")

In [None]:
# Preview the first rows of the feature frame (everything except "price").
X.head()

In [None]:
# Preview the first values of the target series ("price").
Y.head()

### List the Numeric and Object (Categorical) Columns Among the Independent Features

In [None]:
# Names of the numeric (int/float) feature columns; these feed the numeric
# pipeline. Selecting by "number" rather than by "not object" keeps text
# columns out even when pandas stores them with a dedicated string dtype.
# NOTE(review): bool/datetime columns are not treated as numeric here -
# confirm the dataset has none.
X_numeric = X.select_dtypes(include="number").columns
X_numeric

In [None]:
# Names of the non-numeric feature columns; these feed the categorical
# pipeline. Excluding "number" picks up text columns whether pandas stores
# them as object or as a dedicated string dtype.
# NOTE(review): any bool/datetime column would also land here - confirm the
# dataset has none.
X_categorical = X.select_dtypes(exclude="number").columns
X_categorical

In [None]:
# Explicit level order for each categorical feature. These lists are handed
# to the OrdinalEncoder, so a level's position in its list is the integer
# code it is encoded as.
cut_categories = [
    "Fair",
    "Good",
    "Very Good",
    "Premium",
    "Ideal",
]
color_categories = list("DEFGHIJ")
clarity_categories = "I1 SI2 SI1 VS2 VS1 VVS2 VVS1 IF".split()
cut_categories, clarity_categories, color_categories

In [None]:
from sklearn import  impute, preprocessing, pipeline, compose

#### Creating the Pipeline for Numerical Data:
1. To handle missing values we use `SimpleImputer`, which replaces nulls with the mean, median, or most frequent value; for the numerical columns we use the median.
2. For feature scaling we use `StandardScaler`, which centers each numerical column on its mean.

In [None]:
# Numeric preprocessing: fill missing values with the column median, then
# standardize with StandardScaler.
num_steps = [
    ("imputer", impute.SimpleImputer(strategy="median")),
    ("scaler", preprocessing.StandardScaler()),
]
num_pipline = pipeline.Pipeline(steps=num_steps)

#### Creating the Pipeline for Categorical Data:
1. To handle missing values we use `SimpleImputer`, which replaces nulls with the mean, median, or most frequent value; for the categorical columns we use the most frequent value.
2. Then the categories are encoded with `OrdinalEncoder`.
3. For feature scaling we use `StandardScaler` on the encoded values.

In [None]:
# Categorical preprocessing: fill missing values with the most frequent level,
# map each level to its integer position, then standardize the codes.
# One level list per categorical column.
# NOTE(review): assumes X_categorical is ordered cut, color, clarity - confirm.
category_orders = [cut_categories, color_categories, clarity_categories]
cat_steps = [
    ("imputer", impute.SimpleImputer(strategy="most_frequent")),
    ("ordinal_encoder", preprocessing.OrdinalEncoder(categories=category_orders)),
    ("scaler", preprocessing.StandardScaler()),
]
cat_pipline = pipeline.Pipeline(steps=cat_steps)

In [None]:
# Route each column group through its own pipeline: numeric columns through
# num_pipline, categorical columns through cat_pipline.
preprocessor = compose.ColumnTransformer(
    transformers=[
        ("num_pipline", num_pipline, X_numeric),
        ("cat_pipline", cat_pipline, X_categorical),
    ]
)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state= 30)

In [None]:
# Fit the preprocessor on the training split only, then apply the already
# fitted transform to the test split.
X_train_values = preprocessor.fit_transform(X_train)
X_test_values = preprocessor.transform(X_test)

# Output column names are available once the preprocessor is fitted; fetch
# them once and reuse for both frames.
feature_names = preprocessor.get_feature_names_out()

# Keep each split's original row index so the transformed frames stay aligned
# with y_train / y_test (a bare DataFrame(...) would reset it to 0..n-1).
X_train = pd.DataFrame(X_train_values, columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(X_test_values, columns=feature_names, index=X_test.index)

In [None]:
# Preview the preprocessed training features.
X_train.head()

## Model Training

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
# Fit a plain linear regression on the training features; fit() returns the
# estimator itself, so the trailing expression displays the fitted model.
regression = LinearRegression().fit(X_train, y_train)
regression

In [None]:
# Fitted coefficients, one per column of X_train.
regression.coef_

In [None]:
# Fitted intercept (bias term) of the linear model.
regression.intercept_

In [None]:
import numpy as np

In [None]:
def evaluate_module(true, predicted):
    """Score predictions against the ground-truth targets.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Predicted target values, passed to the metric functions alongside
        ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error
        (square root of the mean squared error), and the R^2 score.
    """
    squared_error = mean_squared_error(true, predicted)
    return (
        mean_absolute_error(true, predicted),
        np.sqrt(squared_error),
        r2_score(true, predicted),
    )

In [None]:
## Train multiple models
# Candidate regressors, keyed by the display name used in the report below.
models = {
    "LinearRegression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "ElasticNet" : ElasticNet()
}
trained_model_list = []  # fitted estimators, in training order
model_list = []          # model names, parallel to r2_list
r2_list = []             # R^2 score of each model, parallel to model_list

# Iterate over name/estimator pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    trained_model_list.append(model)

    # Make predictions on X_test and score them against y_test.
    y_pred = model.predict(X_test)
    mae, rmse, r2_squared = evaluate_module(y_test, y_pred)
    print(model_name)
    model_list.append(model_name)
    r2_list.append(r2_squared)

    # The metrics printed here come from the X_test / y_test predictions above.
    print("Model Training Performance : ")
    print("rmse : ", rmse)
    print("MAE : ", mae)
    print("R2 squared : ", r2_squared*100)  # R^2 reported as a percentage

    print("="*30, "\n")