In [2]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder

# Load the dataset
overall_details = pd.read_csv('C:/Users/HP/Desktop/MY PROJECTS/CAR DEKHO/car_dekho_files/new files/overall_details.csv')

# Step 1: Drop rows where 'filtered_price' is NaN (the target must be present).
# Note that this leaves gaps in the row index wherever a row was removed.
overall_details.dropna(subset=['filtered_price'], inplace=True)

# Step 2: Preprocessing (One-Hot Encoding for categorical variables)
categorical_cols = ['ft', 'bt', 'transmission', 'ownerNo', 'oem', 'model', 'modelYear', 'centralVariantId', 'variantName']

# One-Hot Encoding: dense output, with the first category of every feature
# dropped so each feature contributes (n_categories - 1) indicator columns.
# NOTE(review): handle_unknown is left at its default, so the saved encoder
# raises on categories it did not see here -- confirm that is what the
# consumer of encoder.pkl expects.
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_cols = one_hot_encoder.fit_transform(overall_details[categorical_cols])

# Persist the fitted encoder (presumably so the same encoding can be re-applied
# when the model is used for prediction).
with open('encoder.pkl', 'wb') as encoder_file:
    pickle.dump(one_hot_encoder, encoder_file)

print("OneHotEncoder saved to encoder.pkl")

# Combine with the rest of the data.
# The encoder returns a plain positional array, while overall_details still
# carries the gapped index left by dropna. concat(axis=1) aligns on index
# labels, so the encoded frame must reuse overall_details' index; otherwise
# encoded rows are attached to the wrong records and all-NaN rows appear.
encoded_df = pd.DataFrame(
    encoded_cols,
    columns=one_hot_encoder.get_feature_names_out(categorical_cols),
    index=overall_details.index,
)
overall_details = pd.concat([overall_details.drop(columns=categorical_cols), encoded_df], axis=1)

# Separate the regression target from the predictor columns.
y = overall_details['filtered_price']
X = overall_details.drop('filtered_price', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the linear regression on the training portion
# (fit() returns the estimator itself, so construction and fitting chain).
lr_model = LinearRegression().fit(X_train, y_train)

# Serialize the fitted model to disk with pickle.
with open('comparison_df_lr.pkl', 'wb') as file:
    pickle.dump(lr_model, file)

print("Model saved to comparison_df_lr.pkl")

# Predict prices for the held-out rows.
y_pred_lr = lr_model.predict(X_test)

# Score the predictions: R² plus RMSE (square root of the mean squared error).
r2_lr = r2_score(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)

print(f"Linear Regression RMSE: {rmse_lr}")
print(f"Linear Regression R² Score: {r2_lr}")

# Put actual and predicted prices side by side; the raw target values are used
# so both columns share a fresh positional index.
comparison_df_lr = pd.DataFrame(
    {
        'Actual Prices': y_test.values,
        'Predicted Prices (Linear Regression)': y_pred_lr,
    }
)
print(comparison_df_lr.head())


OneHotEncoder saved to encoder.pkl
Model saved to comparison_df_lr.pkl
Linear Regression RMSE: 1.5521416791111737
Linear Regression R² Score: 0.8174820546014454
   Actual Prices  Predicted Prices (Linear Regression)
0          13.00                             13.636429
1           5.00                              9.151467
2           4.45                              4.859503
3           8.45                              7.986187
4           1.60                              3.089029
