## Dataset Schema

In [39]:
# Expected columns of the car dataset, each tagged as "categorical" or "numerical".
# Built from ordered (column, kind) pairs so the column order stays explicit.
schema = dict(
    [
        ("Brand", "categorical"),
        ("Price", "numerical"),
        ("Body", "categorical"),
        ("Mileage", "numerical"),
        ("EngineV", "numerical"),
        ("Engine Type", "categorical"),
        ("Registration", "categorical"),
        ("Year", "numerical"),
        ("Model", "categorical"),
    ]
)

### Data Cleaning

In [41]:
import pandas as pd

# Load the raw dataset from the working directory.
car_data = pd.read_csv("Car_data.csv")

# Report the per-column null counts before any imputation.
print("Missing Values Before Cleaning:")
print(car_data.isnull().sum())

# Fill the missing numerical values with the column median.
# The result is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on a selected column: that form is chained
# assignment, which pandas 2.x warns about and which leaves the parent frame
# unchanged once Copy-on-Write is enabled.
# NOTE(review): "Price" is used as the prediction target further down; rows with
# a missing target are usually dropped rather than imputed - confirm intent.
car_data["Price"] = car_data["Price"].fillna(car_data["Price"].median())
car_data["EngineV"] = car_data["EngineV"].fillna(car_data["EngineV"].median())

# Drop exact duplicate rows (done after imputation, as before).
car_data = car_data.drop_duplicates()

# Confirm that no nulls remain.
print("Missing Values After Cleaning:")
print(car_data.isnull().sum())


Missing Values Before Cleaning:
Brand             0
Price           172
Body              0
Mileage           0
EngineV         150
Engine Type       0
Registration      0
Year              0
Model             0
dtype: int64
Missing Values After Cleaning:
Brand           0
Price           0
Body            0
Mileage         0
EngineV         0
Engine Type     0
Registration    0
Year            0
Model           0
dtype: int64


In [42]:
# Outlier removal function
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]``. Both fences are
    inclusive. Rows whose value is NaN are dropped, because comparisons with
    NaN evaluate to False.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numerical column the fences are computed on.
    factor : float, default 1.5
        Multiplier applied to the IQR. The default reproduces the previous
        hard-coded behaviour; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` within the fences, with the original index labels.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    spread = factor * (q3 - q1)
    within_fences = (df[column] >= q1 - spread) & (df[column] <= q3 + spread)
    return df[within_fences]

# Apply outlier removal for numerical columns.
# The filters run one after another on the same frame, so each column's
# quartiles are computed on the rows that survived the previous columns.
# `car_data` is reassigned in place: re-running this cell filters the already
# filtered frame again.
numerical_columns = ["Price", "Mileage", "EngineV", "Year"]
for col in numerical_columns:
    car_data = remove_outliers(car_data, col)

# Report how many rows and columns remain.
print("Shape after outlier removal:", car_data.shape)


Shape after outlier removal: (3556, 9)


In [43]:
# Sanity check after cleaning: first rows, overall shape, and column dtypes.
print(car_data.head())
print("Dataset shape:", car_data.shape)
print(car_data.dtypes)

           Brand    Price       Body  Mileage  EngineV Engine Type  \
0            BMW   4200.0      sedan      277      2.0      Petrol   
1  Mercedes-Benz   7900.0        van      427      2.9      Diesel   
3           Audi  23000.0  crossover      240      4.2      Petrol   
4         Toyota  18300.0  crossover      120      2.0      Petrol   
6            BMW   6100.0      sedan      438      2.0         Gas   

  Registration  Year         Model  
0          yes  1991           320  
1          yes  1999  Sprinter 212  
3          yes  2007            Q7  
4          yes  2011         Rav 4  
6          yes  1997           320  
Dataset shape: (3556, 9)
Brand            object
Price           float64
Body             object
Mileage           int64
EngineV         float64
Engine Type      object
Registration     object
Year              int64
Model            object
dtype: object


In [44]:
!pip install fastparquet



In [45]:
# Persist the cleaned frame as Parquet using the fastparquet engine.
# index=False is passed so the row index (which has gaps after filtering) is
# not written to the file.
car_data.to_parquet("Car_data_cleaned.parquet", engine="fastparquet", index=False)
print("Cleaned dataset saved as 'Car_data_cleaned.parquet'")

Cleaned dataset saved as 'Car_data_cleaned.parquet'


## Dataset Profiling Report

In [47]:
!pip install ydata-profiling



In [48]:
from ydata_profiling import ProfileReport

# Load the cleaned dataset back from the Parquet file written earlier.
import pandas as pd
car_data_cleaned = pd.read_parquet("Car_data_cleaned.parquet", engine="fastparquet")

# Generate the profiling report (explorative mode enabled).
profile = ProfileReport(car_data_cleaned, title="Car Data Profiling Report", explorative=True)

# Save the profiling report to an HTML file in the working directory.
profile.to_file("Car_data_profiling_report.html")

print("Dataset profiling report saved as 'Car_data_profiling_report.html'")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset profiling report saved as 'Car_data_profiling_report.html'


## Train-Test-Production Split

In [50]:
from sklearn.model_selection import train_test_split

# Load the cleaned dataset
car_data_cleaned = pd.read_parquet("Car_data_cleaned.parquet", engine="fastparquet")

# Split the data into Train (60%), Test (20%), and Production (20%):
# first hold out 40% of the rows, then cut that remainder in half.
# random_state is fixed on both calls so the partition is repeatable.
# NOTE(review): the later cells visible in this notebook reload the Parquet
# file from a URL and re-split it 80/20; train_data / test_data / prod_data are
# not referenced there - confirm the production hold-out is really kept unseen.
train_data, temp_data = train_test_split(car_data_cleaned, test_size=0.4, random_state=42)
test_data, prod_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Print the shapes
print("Training Data Shape:", train_data.shape)
print("Testing Data Shape:", test_data.shape)
print("Production Data Shape:", prod_data.shape)

Training Data Shape: (2133, 9)
Testing Data Shape: (711, 9)
Production Data Shape: (712, 9)


## ML Pipeline with Scikit-learn

In [87]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [91]:
# Read the cleaned dataset from the project's GitHub repository (needs network access).
# NOTE(review): presumably the same content as the local
# "Car_data_cleaned.parquet" written above - confirm the two are in sync.
url = 'https://github.com/KeerthiNarumanchi/car_data_mlops/raw/master/Car_data_cleaned.parquet'
df = pd.read_parquet(url)


In [93]:
# Separate the predictors from the regression target.
X = df.drop('Price', axis=1)  # Features
y = df['Price']  # Target

# Hold out 20% of the rows for evaluation. The pipeline fit/predict cells below
# read X_train, y_train, X_test and y_test, which no earlier cell defines, so
# without this split they raise NameError on a fresh kernel.
# NOTE(review): the parameters mirror the split used in the experimentation
# section (test_size=0.2, random_state=42); the metrics recorded below may have
# come from a different split - confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [95]:
# Column groups routed to the two preprocessing branches of the ColumnTransformer.
# Together they cover every column of the dataset except the target "Price".
num_features = ['Mileage', 'EngineV', 'Year']  # Numerical columns
cat_features = ['Brand', 'Body', 'Engine Type', 'Registration', 'Model']  # Categorical columns

In [97]:
# Numerical columns are standardized; categorical columns are one-hot encoded.
# handle_unknown='ignore' makes the encoder emit an all-zero block for a
# category that was not seen during fit instead of raising at transform time.
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Apply each transformer to its own column group.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ])

In [99]:
# Random forest with 100 trees; random_state is fixed for repeatable fits.
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Chain preprocessing and the regressor so both are fitted and applied together.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Fit the preprocessing steps and the forest on the training split.
model_pipeline.fit(X_train, y_train)

In [101]:
# Score the fitted pipeline on the held-out rows with three regression metrics.
y_pred = model_pipeline.predict(X_test)
mae, mse, r2 = (
    score(y_test, y_pred)
    for score in (mean_absolute_error, mean_squared_error, r2_score)
)

print(f"MAE: {mae}, MSE: {mse}, R^2: {r2}")


MAE: 2022.573993044409, MSE: 10893299.779845186, R^2: 0.8525491107043393


In [162]:
import joblib

# Persist the complete fitted pipeline (preprocessing + forest). The forest was
# trained on the preprocessor's output, so on its own it cannot score raw rows;
# the pipeline artifact can.
joblib.dump(model_pipeline, 'car_price_pipeline.pkl')

# Keep the original artifact (the bare fitted forest) under its existing name
# so anything that already loads this file keeps working.
joblib.dump(model, 'random_forest_model.pkl')


['random_forest_model.pkl']

## ML Experimentation and Tracking with MLflow/Weights and Biases

### Linear Regression with Feature Scaling

In [108]:
!pip install mlflow


Collecting mlflow
  Obtaining dependency information for mlflow from https://files.pythonhosted.org/packages/c2/39/e051e58f35077500fea62adb67c0ff32cab768a5bbc1e0d8c682e30d56ee/mlflow-2.19.0-py3-none-any.whl.metadata
  Downloading mlflow-2.19.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.19.0 (from mlflow)
  Obtaining dependency information for mlflow-skinny==2.19.0 from https://files.pythonhosted.org/packages/05/95/75f59715e39aa2224e5ecd8c52d5a305467e16a843ade2235a215599a1fa/mlflow_skinny-2.19.0-py3-none-any.whl.metadata
  Downloading mlflow_skinny-2.19.0-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Obtaining dependency information for alembic!=1.10.0,<2 from https://files.pythonhosted.org/packages/cb/06/8b505aea3d77021b18dcbd8133aa1418f1a1e37e432a465b14c46b2c0eaa/alembic-1.14.0-py3-none-any.whl.metadata
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Obtaining dependency in

In [110]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow

# Load the cleaned dataset from the project's GitHub repository (needs network access).
# This cell rebinds df, X, y and the train/test variables used by every model below.
url = 'https://github.com/KeerthiNarumanchi/car_data_mlops/raw/master/Car_data_cleaned.parquet'
df = pd.read_parquet(url)

# Separate the predictors from the target.
X = df.drop(columns=['Price'])
y = df['Price']

# One-hot encode the categorical columns; drop_first=True drops the first
# level of each encoded column. The encoding is done on the full frame, before
# the train/test split.
X = pd.get_dummies(X, drop_first=True)

# Train-test split (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling (StandardScaler): fitted on the training rows only, then
# applied to both splits. Every column is scaled, including the dummy columns.
# NOTE(review): the linear regression trained on these features reports
# R^2 of about -1.5e24 in the recorded output below, which suggests an
# ill-conditioned design matrix (possibly collinear or very rare dummy
# columns) - confirm and consider a regularized model or scaling numeric
# columns only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [112]:
# Ordinary least-squares linear regression with default settings.
lr = LinearRegression()

# Train the model on the scaled training features.
lr.fit(X_train_scaled, y_train)


In [114]:
# Predict on the scaled test features.
y_pred = lr.predict(X_test_scaled)

# Compute MAE, MSE and R^2 against the held-out targets in one pass.
mae, mse, r2 = (
    score(y_test, y_pred)
    for score in (mean_absolute_error, mean_squared_error, r2_score)
)

print(f"MAE: {mae}, MSE: {mse}, R^2: {r2}")


MAE: 1149017354397330.8, MSE: 1.0978248888506387e+32, R^2: -1.4860075406300546e+24


In [125]:
import mlflow
import mlflow.sklearn

# Close any run left open by an earlier (possibly failed) cell.
mlflow.end_run()

# Log inside a context manager so the run is always closed when the block
# exits. Previously this cell started a run and never ended it, leaving it
# active until a later cell called end_run().
with mlflow.start_run():
    # Parameters and test-set metrics computed by the evaluation cell above.
    mlflow.log_param("Model", "Linear Regression")
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("R2", r2)

    # Store the fitted estimator as a run artifact.
    lr_model_info = mlflow.sklearn.log_model(lr, "model")

# Display the logged-model info as the cell output, as before.
lr_model_info




<mlflow.models.model.ModelInfo at 0x21296df0110>

### Decision Tree Regressor

In [129]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

# Decision Tree Regressor Model.
# random_state is fixed (as for the random forest above) so that ties between
# equally good splits are broken the same way on every run.
dt = DecisionTreeRegressor(max_depth=5, min_samples_split=10, random_state=42)

# Train the model on the scaled training features.
dt.fit(X_train_scaled, y_train)

# Evaluate using 5-fold cross-validation on the training split only.
# The scorer returns negated MAE, so the sign is flipped for reporting.
cv_scores = cross_val_score(dt, X_train_scaled, y_train, cv=5, scoring='neg_mean_absolute_error')
print("Cross-validation MAE:", -cv_scores.mean())


Cross-validation MAE: 3310.6004867478478


In [131]:
# Close any run left open by an earlier (possibly failed) cell.
mlflow.end_run()

# Every other run in this notebook logs the test-set MAE under "MAE". Compute
# the same quantity here so the runs are comparable, and keep the
# cross-validation score under its own metric name.
dt_test_mae = mean_absolute_error(y_test, dt.predict(X_test_scaled))

# The context manager ends the run even if a logging call raises.
with mlflow.start_run():
    # Hyperparameters are read from the estimator so they cannot drift from
    # the values it was actually built with.
    mlflow.log_param("Model", "Decision Tree")
    mlflow.log_param("max_depth", dt.max_depth)
    mlflow.log_param("min_samples_split", dt.min_samples_split)
    mlflow.log_metric("MAE", dt_test_mae)
    mlflow.log_metric("CV_MAE", -cv_scores.mean())

    # Store the fitted estimator as a run artifact.
    mlflow.sklearn.log_model(dt, "model")




### Gradient Boosting Regressor

In [134]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boosting Regressor Model.
# random_state is fixed (as for the random forest above) so repeated runs of
# the notebook build the same ensemble.
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the model on the scaled training features.
gb.fit(X_train_scaled, y_train)

# Evaluate on the test set.
y_pred = gb.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")


MAE: 2569.306794050584


In [136]:
# Close any run left open by an earlier (possibly failed) cell.
mlflow.end_run()

# The context manager ends the run even if a logging call raises, so a failure
# here no longer leaves an active run behind.
with mlflow.start_run():
    # Hyperparameters are read from the estimator so they cannot drift from
    # the values it was actually built with.
    mlflow.log_param("Model", "Gradient Boosting")
    mlflow.log_param("n_estimators", gb.n_estimators)
    mlflow.log_param("learning_rate", gb.learning_rate)

    # `mae` is the test-set MAE computed by the cell above.
    mlflow.log_metric("MAE", mae)

    # Store the fitted estimator as a run artifact.
    mlflow.sklearn.log_model(gb, "model")




### XGBoost Regressor

In [141]:
!pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/70/58/2f94976df39470fb00eec2cb4f914dde44cd0df8d96483208bf7db4bc97e/xgboost-2.1.3-py3-none-win_amd64.whl.metadata
  Downloading xgboost-2.1.3-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.3-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 325.1 kB/s eta 0:06:25
   ---------------------------------------- 0.1/124.9 MB 465.5 kB/s eta 0:04:29
   ---------------------------------------- 0.3/124.9 MB 1.4 MB/s eta 0:01:28
   ---------------------------------------- 0.7/124.9 MB 3.4 MB/s eta 0:00:37
   ---------------------------------------- 1.3

In [144]:
import xgboost as xgb

# XGBoost Regressor Model with max_depth=5 and subsample=0.8; every other
# setting is left at the library default.
xg = xgb.XGBRegressor(max_depth=5, subsample=0.8)

# Train the model on the scaled training features.
xg.fit(X_train_scaled, y_train)

# Evaluate on the test set; `y_pred` and `mae` overwrite the values left by
# the previous model's cell.
y_pred = xg.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")


MAE: 1962.8727023315432


In [146]:
# Close any run left open by an earlier (possibly failed) cell.
mlflow.end_run()

# Read the hyperparameters from the estimator so the logged values cannot
# drift from the ones it was actually built with.
xg_params = xg.get_params()

# The context manager ends the run even if a logging call raises, so a failure
# here no longer leaves an active run behind.
with mlflow.start_run():
    mlflow.log_param("Model", "XGBoost")
    mlflow.log_param("max_depth", xg_params["max_depth"])
    mlflow.log_param("subsample", xg_params["subsample"])

    # `mae` is the test-set MAE computed by the cell above.
    mlflow.log_metric("MAE", mae)

    # Store the fitted estimator as a run artifact.
    mlflow.sklearn.log_model(xg, "model")




### Support Vector Regressor

In [148]:
from sklearn.svm import SVR

# Support Vector Regressor Model with an RBF kernel.
# NOTE(review): the recorded test MAE below (about 6247) is far worse than the
# tree-based models; the target is left unscaled here while C=1.0 and
# epsilon=0.1 are small relative to car prices - possibly the cause, confirm.
svr = SVR(C=1.0, kernel='rbf', epsilon=0.1)

# Train the model on the scaled training features.
svr.fit(X_train_scaled, y_train)

# Evaluate on the test set; `y_pred` and `mae` overwrite the values left by
# the previous model's cell.
y_pred = svr.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")


MAE: 6247.092667223262


In [150]:
# Close any run left open by an earlier (possibly failed) cell.
mlflow.end_run()

# The context manager ends the run even if a logging call raises, so a failure
# here no longer leaves an active run behind.
with mlflow.start_run():
    # Hyperparameters are read from the estimator so they cannot drift from
    # the values it was actually built with.
    mlflow.log_param("Model", "SVR")
    mlflow.log_param("C", svr.C)
    mlflow.log_param("kernel", svr.kernel)
    mlflow.log_param("epsilon", svr.epsilon)

    # `mae` is the test-set MAE computed by the cell above.
    mlflow.log_metric("MAE", mae)

    # Store the fitted estimator as a run artifact.
    mlflow.sklearn.log_model(svr, "model")


