<a href="https://colab.research.google.com/github/Mythresh09/ML-Projects-/blob/main/certification_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load dataset
# NOTE: the path is relative, so "cars.csv" must sit in the working directory.
df = pd.read_csv("cars.csv")

# Display basic dataset info
print("Dataset Overview:")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Drop ID column (not needed for prediction)
df.drop(columns=['Id'], inplace=True)

# Exploratory Data Analysis (EDA)
print("Checking for Missing Values:")
print(df.isnull().sum())

# Univariate Analysis - Price Distribution
plt.figure(figsize=(8, 5))
sns.histplot(df['price'], bins=30, kde=True)
plt.title("Price Distribution of Used Cars")
plt.show()

# Multivariate Analysis - Correlation Matrix
# The frame still contains the columns that are one-hot encoded further down
# (brand, model_name, fuel_type, city), which are expected to be non-numeric.
# pandas >= 2.0 no longer drops such columns silently in DataFrame.corr() and
# raises instead, so the correlation is restricted to numeric columns
# explicitly; select_dtypes works on older pandas releases as well.
numeric_df = df.select_dtypes(include="number")
plt.figure(figsize=(10, 6))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Feature Correlation Matrix")
plt.show()

# Columns routed to each preprocessing branch
num_features = ['year', 'distance_travelled(km)', 'car_age']
cat_features = ['brand', 'model_name', 'fuel_type', 'city']

# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown='ignore' keeps transform() from failing on categories that
# were not present when the encoder was fitted.
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Apply each transformer to its own column subset and concatenate the results
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ]
)

# Separate the target column from the predictors
y = df['price']
X = df.drop(columns=['price'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate regressors, keyed by the label used when reporting results
models = {}
models["Linear Regression"] = LinearRegression()
models["Random Forest"] = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit every candidate behind the shared preprocessing step and report its
# error metrics on the held-out test split
for name, model in models.items():
    pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])

    # fit() returns the fitted pipeline, so the prediction is chained onto it
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    # Same three metrics, evaluated in the order MAE, MSE, R2
    mae, mse, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )

    print(f"\n{name} Performance:")
    print(f"MAE: {mae:.2f}, MSE: {mse:.2f}, R2 Score: {r2:.2f}")

# Feature Importance Analysis using Random Forest
# A dedicated forest is fitted here so its importances can be inspected.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_pipeline = Pipeline([('preprocessor', preprocessor), ('model', rf_model)])
rf_pipeline.fit(X_train, y_train)

# Column names produced by the fitted one-hot encoder
fitted_steps = rf_pipeline.named_steps
cat_encoder = fitted_steps['preprocessor'].named_transformers_['cat']
encoded_cat_names = cat_encoder.get_feature_names_out(cat_features)

# The 'num' branch is listed before 'cat' in the ColumnTransformer, so the
# numeric names come first, followed by the encoded categorical names
feature_names = [*num_features, *encoded_cat_names]

# One importance score per transformed column
importance = fitted_steps['model'].feature_importances_

# Tabulate the scores, most important first
feature_importance_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": importance}
).sort_values(by="Importance", ascending=False)

# Plot the ten highest-ranked features
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance_df.head(10), x="Importance", y="Feature")
plt.title("Top 10 Important Features for Used Car Price Prediction")
plt.show()
