## 2. Dataset Schema

In [2]:
# Schema: maps each dataset column to its feature kind
# ("categorical" or "numerical"), listed as ordered (column, kind) pairs.
schema = dict(
    [
        ("Brand", "categorical"),
        ("Price", "numerical"),
        ("Body", "categorical"),
        ("Mileage", "numerical"),
        ("EngineV", "numerical"),
        ("Engine Type", "categorical"),
        ("Registration", "categorical"),
        ("Year", "numerical"),
        ("Model", "categorical"),
    ]
)

### Data Cleaning

In [None]:
import pandas as pd

# Read Car_data.csv (path relative to the working directory) into the
# `car_data` DataFrame that the cleaning cells below operate on.
car_data = pd.read_csv("Car_data.csv")

In [4]:
# Report per-column null counts before any cleaning.
print("Missing Values Before Cleaning:")
print(car_data.isnull().sum())

# Impute missing numeric values with the column median.
# The result is assigned back rather than calling fillna(..., inplace=True) on
# a column selection: that form is chained assignment, which pandas deprecates
# and which leaves `car_data` unchanged when Copy-on-Write is enabled.
car_data["Price"] = car_data["Price"].fillna(car_data["Price"].median())
car_data["EngineV"] = car_data["EngineV"].fillna(car_data["EngineV"].median())

# Drop exact duplicate rows (evaluated after imputation).
car_data = car_data.drop_duplicates()

# Report the null counts again to confirm the imputation took effect.
print("Missing Values After Cleaning:")
print(car_data.isnull().sum())


Missing Values Before Cleaning:
Brand             0
Price           172
Body              0
Mileage           0
EngineV         150
Engine Type       0
Registration      0
Year              0
Model             0
dtype: int64
Missing Values After Cleaning:
Brand           0
Price           0
Body            0
Mileage         0
EngineV         0
Engine Type     0
Registration    0
Year            0
Model           0
dtype: int64


In [5]:
# Outlier removal function
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of
    ``df[column]`` and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to filter; it is not modified.
        column: Name of a numeric column in ``df``.
        factor: Multiplier applied to the IQR when building the fences. The
            default of 1.5 reproduces the previously hard-coded behaviour.

    Returns:
        The filtered DataFrame, keeping the original index labels. Rows whose
        ``column`` value is NaN are dropped because they fail the range check.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    spread = factor * (q3 - q1)
    # Series.between is inclusive on both ends by default, matching >= / <=.
    in_range = df[column].between(q1 - spread, q3 + spread)
    return df[in_range]



Shape after outlier removal: (3556, 9)


In [None]:
# Apply outlier removal for numerical columns.
# The filter runs one column at a time on the already-filtered frame, so each
# column's IQR bounds are computed from the rows that survived the columns
# processed before it.
# NOTE: `car_data` is reassigned here, so re-running this cell filters the
# already-filtered data again and may remove further rows.
numerical_columns = ["Price", "Mileage", "EngineV", "Year"]
for col in numerical_columns:
    car_data = remove_outliers(car_data, col)

print("Shape after outlier removal:", car_data.shape)


In [6]:
# Quick look at the cleaned frame: first rows, overall shape, column dtypes.
print(
    car_data.head(),
    f"Dataset shape: {car_data.shape}",
    car_data.dtypes,
    sep="\n",
)

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
3           Audi  23000.0  crossover      240      4.2      Petrol   
4         Toyota  18300.0  crossover      120      2.0      Petrol   
6            BMW   6100.0      sedan      438      2.0         Gas   

  Registration  Year         Model  
0          yes  1991           320  
1          yes  1999  Sprinter 212  
3          yes  2007            Q7  
4          yes  2011         Rav 4  
6          yes  1997           320  
Dataset shape: (3556, 9)
Brand            object
Price           float64
Body             object
Mileage           int64
EngineV         float64
Engine Type      object
Registration     object
Year              int64
Model            object
dtype: object


In [7]:
!pip install fastparquet



In [8]:
# Persist the cleaned frame as Parquet using the fastparquet engine;
# index=False keeps the DataFrame index out of the written file.
car_data.to_parquet("Car_data_cleaned.parquet", engine="fastparquet", index=False)
print("Cleaned dataset saved as 'Car_data_cleaned.parquet'")

Cleaned dataset saved as 'Car_data_cleaned.parquet'


## 3. Dataset Profiling Report

In [10]:
!pip install ydata-profiling



In [None]:
from ydata_profiling import ProfileReport

# Load the cleaned dataset back from the Parquet file saved in the cleaning
# section, using the same fastparquet engine it was written with.
import pandas as pd
car_data_cleaned = pd.read_parquet("Car_data_cleaned.parquet", engine="fastparquet")


In [11]:
# Build the ydata-profiling report object for the cleaned dataset.
profile = ProfileReport(car_data_cleaned, title="Car Data Profiling Report", explorative=True)

# Write the report to an HTML file in the working directory.
profile.to_file("Car_data_profiling_report.html")

print("Dataset profiling report saved as 'Car_data_profiling_report.html'")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset profiling report saved as 'Car_data_profiling_report.html'


## 4. Train-Test-Production Split

In [81]:
from sklearn.model_selection import train_test_split

# Re-read the cleaned dataset from Parquet so the split section starts from
# the saved file rather than from in-memory state.
car_data_cleaned = pd.read_parquet("Car_data_cleaned.parquet", engine="fastparquet")


In [83]:
# 60/20/20 partition: hold out 40% of the rows first, then cut that hold-out
# in half to obtain the test and production sets. Both splits are seeded.
train_data, temp_data = train_test_split(
    car_data_cleaned, random_state=42, test_size=0.4
)
test_data, prod_data = train_test_split(
    temp_data, random_state=42, test_size=0.5
)

# Report the size of each partition.
print(f"Training Data Shape: {train_data.shape}")
print(f"Testing Data Shape: {test_data.shape}")
print(f"Production Data Shape: {prod_data.shape}")

Training Data Shape: (2133, 9)
Testing Data Shape: (711, 9)
Production Data Shape: (712, 9)


In [85]:
# Export the three partitions to CSV files in the working directory
# (index=False: the row index is not written).
# NOTE(review): the modelling section reads train_data.xls / test_data.xls
# from GitHub with pd.read_csv, presumably these same exports under a
# different extension. Confirm.
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)
prod_data.to_csv('prod_data.csv', index=False)


## 6. ML Pipeline with Scikit-learn

In [128]:
!pip install xlrd



In [92]:
#Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [94]:
# Load the cleaned dataset from the Parquet file hosted on GitHub.
# NOTE(review): `df` is not referenced again in the visible part of this
# notebook. Confirm that this download is still needed.
url = 'https://github.com/KeerthiNarumanchi/car_data_mlops/raw/master/Car_data_cleaned.parquet'
df = pd.read_parquet(url)

In [116]:
# URLs to the raw train and test data files on GitHub.
# NOTE(review): the paths end in .xls but the files are loaded with
# pd.read_csv further down, so the content is presumably CSV. Confirm.
train_url = "https://github.com/KeerthiNarumanchi/car_data_mlops/raw/master/train_data.xls"
test_url = "https://github.com/KeerthiNarumanchi/car_data_mlops/raw/master/test_data.xls"


In [124]:
# Echo both data URLs, one per line.
print(train_url, test_url, sep="\n")


https://github.com/KeerthiNarumanchi/car_data_mlops/raw/master/train_data.xls
https://github.com/KeerthiNarumanchi/car_data_mlops/raw/master/test_data.xls


In [132]:
# Download the train and test partitions from the GitHub URLs defined above.
# The files are parsed as CSV even though the URLs carry an .xls suffix.
df_train = pd.read_csv(train_url)
df_test = pd.read_csv(test_url)

In [148]:
# Separate the target column ('Price') from the predictors in the training frame.
y = df_train['Price']  # Target
X = df_train.drop(columns='Price')  # Features

In [150]:
# Splitting the training frame into train and validation parts (80/20, seeded).
# NOTE(review): `df_test` loaded earlier is not used here; the hold-out set is
# carved out of `df_train` instead. Confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [152]:
# Preprocessing inputs: column names grouped by how they will be transformed.
# 'Price' is absent from both lists because it is the target.
num_features = ['Mileage', 'EngineV', 'Year']  # Numerical columns
cat_features = ['Brand', 'Body', 'Engine Type', 'Registration', 'Model']  # Categorical columns

In [154]:
# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown='ignore' makes categories not seen during fit encode as all
# zeros at transform time instead of raising an error.
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer; columns in neither list are
# dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ])

In [156]:
# Define the model: a seeded random forest with 100 trees.
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Chain preprocessing and the estimator so that both are fitted together and
# raw feature frames can be passed straight to fit/predict.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Fit on the training part; as the cell's last expression, the fitted
# pipeline is also displayed.
model_pipeline.fit(X_train, y_train)

In [157]:
# Predict the target for the held-out part and score the predictions with
# MAE, MSE and R^2.
y_pred = model_pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}, MSE: {mse}, R^2: {r2}")


MAE: 2169.915613889818, MSE: 12763400.3098423, R^2: 0.8250184781500589


In [160]:
import joblib

# Save the whole fitted pipeline (preprocessing + random forest) to a .pkl file.
# Dumping only `model` would store a forest that expects already scaled and
# one-hot-encoded input, with no saved preprocessor to produce it.
joblib.dump(model_pipeline, 'random_forest_model.pkl')


['random_forest_model.pkl']

## ML Experimentation and Tracking with MLflow/Weights and Biases

###  Linear Regression with Feature Scaling

In [36]:
!pip install mlflow




In [177]:
from sklearn.linear_model import LinearRegression
import mlflow


In [179]:
# One-hot encode the categorical features of X (drop_first=True drops the
# first level of each category). The encoding is applied to the whole frame
# before splitting, and `X` is overwritten with the encoded version.
X = pd.get_dummies(X, drop_first=True)

# Train-test split (80/20, seeded); this rebinds X_train/X_test/y_train/y_test
# that the random-forest section defined earlier.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling (StandardScaler): statistics are fitted on the training part
# only and then applied to the test part. All columns are scaled, including
# the one-hot indicator columns.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [181]:
# Linear Regression Model (ordinary least squares, default settings)
lr = LinearRegression()

# Train the model on the scaled training features
lr.fit(X_train_scaled, y_train)


In [183]:
# Predict on the test set
y_pred = lr.predict(X_test_scaled)

# Calculate metrics (these rebind the mae/mse/r2 names used by the logging cell)
# NOTE(review): the recorded output of this cell shows an MAE around 5e15 and a
# hugely negative R^2, i.e. the fit is numerically unstable. Possibly caused by
# the scaled one-hot columns; investigate before relying on this model.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}, MSE: {mse}, R^2: {r2}")


MAE: 5386203155487163.0, MSE: 2.3232669288726282e+33, R^2: -3.1851134729691338e+25


In [40]:
import mlflow
import mlflow.sklearn

# End any run left active by an earlier (possibly interrupted) cell.
mlflow.end_run()

# Use the run as a context manager so it is closed when the block exits, even
# if logging raises. Previously the run was started but never ended in this cell.
with mlflow.start_run():
    # Log the model label and the current mae / mse / r2 values
    mlflow.log_param("Model", "Linear Regression")
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("R2", r2)

    # Log the fitted model as a run artifact
    mlflow.sklearn.log_model(lr, "model")


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



<mlflow.models.model.ModelInfo at 0x1e76f49f7d0>

### Decision Tree Regressor

In [185]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

In [212]:
# Decision Tree Regressor Model: depth capped at 5, and a node needs at least
# 10 samples to be split. No random_state is set.
dt = DecisionTreeRegressor(max_depth=5, min_samples_split=10)

# Train the model on the scaled training features
dt.fit(X_train_scaled, y_train)

In [189]:
# End any run left active by an earlier (possibly interrupted) cell.
mlflow.end_run()

# `cv_scores` was referenced here without ever being defined in the notebook,
# which raises NameError on a fresh kernel. Compute it explicitly: 5-fold
# cross-validated negated MAE of the tree on the training split.
# cross_val_score fits clones, so the already fitted `dt` is left untouched.
cv_scores = cross_val_score(
    dt, X_train_scaled, y_train, cv=5, scoring='neg_mean_absolute_error'
)

# The context manager closes the run on exit, even if logging raises.
with mlflow.start_run():
    # Log parameters, metrics
    mlflow.log_param("Model", "Decision Tree")
    mlflow.log_param("max_depth", 5)
    mlflow.log_param("min_samples_split", 10)
    # Scores are negated MAE, so flip the sign to log a positive error.
    mlflow.log_metric("MAE", -cv_scores.mean())

    # Log the fitted model as a run artifact
    mlflow.sklearn.log_model(dt, "model")




### Gradient Boosting Regressor

In [191]:
from sklearn.ensemble import GradientBoostingRegressor

In [193]:
# Gradient Boosting Regressor Model: 100 boosting stages, learning rate 0.1.
# No random_state is set.
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)

# Train the model on the scaled training features
gb.fit(X_train_scaled, y_train)

# Evaluate on the test set; `mae` is rebound and picked up by the logging cell
y_pred = gb.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")


MAE: 2588.4841835403263


In [195]:
# End any run left active by an earlier (possibly interrupted) cell.
mlflow.end_run()

# The context manager closes the run on exit, even if a logging call raises,
# so a failure here cannot leave a run active for the following cells.
with mlflow.start_run():
    # Log parameters, metrics
    mlflow.log_param("Model", "Gradient Boosting")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("learning_rate", 0.1)
    mlflow.log_metric("MAE", mae)

    # Log the fitted model as a run artifact
    mlflow.sklearn.log_model(gb, "model")




### XGBoost Regressor

In [48]:
!pip install xgboost



In [49]:
import xgboost as xgb

# XGBoost Regressor Model: trees up to depth 5, subsample ratio 0.8.
# No random_state is set.
xg = xgb.XGBRegressor(max_depth=5, subsample=0.8)

# Train the model on the scaled training features
xg.fit(X_train_scaled, y_train)

# Evaluate on the test set; `mae` is rebound and picked up by the logging cell
y_pred = xg.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")


MAE: 1962.8727023315432


In [50]:
# End any run left active by an earlier (possibly interrupted) cell.
mlflow.end_run()

# The context manager closes the run on exit, even if a logging call raises,
# so a failure here cannot leave a run active for the following cells.
with mlflow.start_run():
    # Log parameters, metrics
    mlflow.log_param("Model", "XGBoost")
    mlflow.log_param("max_depth", 5)
    mlflow.log_param("subsample", 0.8)
    mlflow.log_metric("MAE", mae)

    # Log the fitted model as a run artifact
    mlflow.sklearn.log_model(xg, "model")




In [197]:
import joblib

# Save the trained XGBoost regressor to a .pkl file.
# `xg` is the XGBoost estimator; `model` is the random forest defined in the
# pipeline section, so dumping it here wrote the wrong model under this name.
joblib.dump(xg, 'xgboost_model.pkl')


['xgboost_model.pkl']

### Support Vector Regressor

In [52]:
from sklearn.svm import SVR

# Support Vector Regressor Model: RBF kernel with C=1.0 and epsilon=0.1
svr = SVR(C=1.0, kernel='rbf', epsilon=0.1)

# Train the model on the scaled training features
svr.fit(X_train_scaled, y_train)

# Evaluate on the test set; `mae` is rebound and picked up by the logging cell
y_pred = svr.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")


MAE: 6247.092667223262


In [53]:
# End any run left active by an earlier (possibly interrupted) cell.
mlflow.end_run()

# The context manager closes the run on exit, even if a logging call raises,
# so a failure here cannot leave a run active for the following cells.
with mlflow.start_run():
    # Log parameters, metrics
    mlflow.log_param("Model", "SVR")
    mlflow.log_param("C", 1.0)
    mlflow.log_param("kernel", 'rbf')
    mlflow.log_param("epsilon", 0.1)
    mlflow.log_metric("MAE", mae)

    # Log the fitted model as a run artifact
    mlflow.sklearn.log_model(svr, "model")




## Evaluation

In [204]:
from sklearn.model_selection import cross_val_score, KFold

In [206]:
# Re-fit a StandardScaler on the current X_train and re-derive the scaled
# matrices. This repeats the scaling step of the experimentation section and
# rebinds scaler / X_train_scaled / X_test_scaled.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [208]:
# Models to evaluate, keyed by display name. Each value is a new, unfitted
# estimator created with the same hyperparameters as its counterpart earlier
# in the notebook; only the random forest sets a random_state.
models = {
    "Decision Tree": DecisionTreeRegressor(max_depth=5, min_samples_split=10),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1),
    "XGBoost": xgb.XGBRegressor(max_depth=5, subsample=0.8),
    "SVR": SVR(C=1.0, kernel='rbf', epsilon=0.1),
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42)
}

In [210]:
# K-fold cross-validation setup: 5 shuffled folds with a fixed seed, so every
# model is scored on the same partitions.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate models using k-fold cross-validation and test set.
# NOTE: the loop variable `model` rebinds the `model` name defined in the
# random-forest pipeline section.
for model_name, model in models.items():
    print(f"Evaluating model: {model_name}")

    # cross_val_score returns negated MAE per fold; flip the sign to get
    # positive errors.
    cv_mae = -cross_val_score(
        model, X_train_scaled, y_train, cv=kf, scoring='neg_mean_absolute_error'
    )

    # Mean and (population) standard deviation across folds. The ndarray
    # methods are used because numpy is never imported in this notebook, so
    # np.mean / np.std would raise NameError on a fresh kernel.
    mean_cv_mae = cv_mae.mean()
    std_cv_mae = cv_mae.std()

    # Train the model on the full training set
    model.fit(X_train_scaled, y_train)

    # Evaluate on the test set
    y_pred = model.predict(X_test_scaled)
    test_mae = mean_absolute_error(y_test, y_pred)

    # Print the results
    print(f"Mean MAE from k-fold cross-validation: {mean_cv_mae:.4f}")
    print(f"Standard Deviation of MAE from k-fold cross-validation: {std_cv_mae:.4f}")
    print(f"Test set MAE: {test_mae:.4f}")
    print("-" * 50)

Evaluating model: Decision Tree
Mean MAE from k-fold cross-validation: 3504.7695
Standard Deviation of MAE from k-fold cross-validation: 154.9124
Test set MAE: 3247.3451
--------------------------------------------------
Evaluating model: Gradient Boosting
Mean MAE from k-fold cross-validation: 2735.7939
Standard Deviation of MAE from k-fold cross-validation: 177.5108
Test set MAE: 2589.0154
--------------------------------------------------
Evaluating model: XGBoost
Mean MAE from k-fold cross-validation: 2336.0742
Standard Deviation of MAE from k-fold cross-validation: 107.3447
Test set MAE: 2236.3319
--------------------------------------------------
Evaluating model: SVR
Mean MAE from k-fold cross-validation: 6204.5661
Standard Deviation of MAE from k-fold cross-validation: 421.3560
Test set MAE: 6115.8250
--------------------------------------------------
Evaluating model: Linear Regression
Mean MAE from k-fold cross-validation: 225609219510010816.0000
Standard Deviation of MAE fro