In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset
df = pd.read_csv("car_price_prediction_.csv") 

# Display dataset info
print(df.info())

# Select features and target variable 
selected_features = ["Year", "Mileage", "Engine Size", "Brand"]  
target_col = "Price"  # Target variable

# Handle categorical variables (One-Hot Encoding)
df = pd.get_dummies(df[selected_features + [target_col]], drop_first=True)

# Handle missing values
df = df.dropna()

# Split dataset
X = df.drop(columns=[target_col])
y = df[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Extract coefficients
coefficients = pd.DataFrame({"Feature": X.columns, "Coefficient": model.coef_})
print("\n🔹 Feature Coefficients:\n", coefficients.sort_values(by="Coefficient", ascending=False))

# Predict on test set
y_pred = model.predict(X_test)

# Calculate metrics
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Display results
print(f"\n🔹 R² Score: {r2:.4f}")
print(f"🔹 Mean Squared Error: {mse:.2f}")

# Identify most impactful feature
top_feature = coefficients.iloc[coefficients["Coefficient"].abs().idxmax()]
print(f"\n🔹 The most impactful feature is '{top_feature['Feature']}' with a coefficient of {top_feature['Coefficient']:.2f}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Car ID        2500 non-null   int64  
 1   Brand         2500 non-null   object 
 2   Year          2500 non-null   int64  
 3   Engine Size   2500 non-null   float64
 4   Fuel Type     2500 non-null   object 
 5   Transmission  2500 non-null   object 
 6   Mileage       2500 non-null   int64  
 7   Condition     2500 non-null   object 
 8   Price         2500 non-null   float64
 9   Model         2500 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 195.4+ KB
None

🔹 Feature Coefficients:
           Feature  Coefficient
3       Brand_BMW  1946.400727
7     Brand_Tesla  1103.326109
1         Mileage    -0.001669
2     Engine Size  -108.449349
0            Year  -147.628017
6  Brand_Mercedes  -184.366613
8    Brand_Toyota  -714.609654
5     Brand_Honda -1394.308751
4  