In [None]:
## Import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
import statsmodels.api as sm

In [None]:
## Load the car-price dataset
# NOTE(review): this is an absolute, machine-specific location; the notebook
# only runs where this exact file exists.
csv_path = "C:/Users/Administrator/Desktop/ML_Model/CarPrice_Assignment.csv"
df = pd.read_csv(csv_path)
# head() shows the first five rows by default.
df.head()

In [None]:
## Inspect dataset
# Work on a copy so the frame loaded from disk stays untouched.
df_copy = df.copy()
print(df_copy.shape)
print(df_copy.columns)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df_copy.info()

In [None]:
############################# Prepare the dataset for Model Fit ##################################

In [None]:
### Data-quality checks
# A notebook cell only renders its final bare expression, so every result is
# printed explicitly; otherwise the null and duplicate counts are computed and
# then silently thrown away.

# Missing values per column
print("Null values per column:")
print(df_copy.isnull().sum())

# Number of fully duplicated rows
print("\nDuplicated rows:", df_copy.duplicated().sum())

# Number of distinct values in each column
print("\nUnique values per column:")
print(df_copy.nunique())

In [None]:
### Separate the categorical and numerical variables
# Both subsets are taken from df_copy (the working copy created during
# inspection) rather than from the raw df, and are explicit copies because
# later cells add and drop columns on them.

# Numerical columns (64-bit integer and float dtypes)
numerical_df = df_copy.select_dtypes(include=['int64', 'float64']).copy()

# Categorical columns (object dtype)
categorical_df = df_copy.select_dtypes(include=['object']).copy()

# Print or inspect the separated DataFrames
print("Numerical Variables:")
print(numerical_df.head())

print("\nCategorical Variables:")
print(categorical_df.head())

In [None]:
### Inspect categorical variables
# Walk over (column name, column values) pairs and list the distinct values
# found in each categorical column.
for col_name, col_values in categorical_df.items():
    distinct = col_values.unique()
    print(f"Unique values in '{col_name}': {distinct}")

In [None]:
# Derive brand and model from CarName.
# Each name is split on single spaces once; the first token becomes the brand
# and the remaining tokens, re-joined with spaces, become the model.
name_tokens = [car_name.split(' ') for car_name in categorical_df['CarName']]
categorical_df['brand'] = [tokens[0] for tokens in name_tokens]
categorical_df['model'] = [' '.join(tokens[1:]) for tokens in name_tokens]

In [None]:
# Preview the first ten rows of the categorical frame, which at this point
# also carries the derived 'brand' and 'model' columns.
categorical_df.head(10)

In [None]:
## Drop CarName and model
# 'brand' is the only part of CarName kept for modelling. The result is
# rebound instead of dropped in place, and errors='ignore' makes re-running
# this cell a no-op rather than a KeyError once the columns are gone.
categorical_df = categorical_df.drop(columns=['CarName', 'model'], errors='ignore')
categorical_df.head(10)

In [None]:
### Presenting categorical variables in graphs

# Lay the panels out three per row and derive the row count from the number of
# columns, so the grid always has room for every categorical column.
plot_columns = list(categorical_df.columns)
n_grid_cols = 3
n_grid_rows = max(1, -(-len(plot_columns) // n_grid_cols))  # ceiling division

# 2.25 inches of height per row keeps the 12x9 figure size for a four-row grid.
fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols,
                         figsize=(12, 2.25 * n_grid_rows), squeeze=False)
axes = axes.ravel()  # Flatten the 2D array of axes

# One count plot per categorical column, with the count written on each bar
for i, column in enumerate(plot_columns):
    sns.countplot(x=column, data=categorical_df, palette='bright', ax=axes[i], saturation=0.95)
    for container in axes[i].containers:
        axes[i].bar_label(container, color='black', size=10)
    axes[i].set_title(f'Count Plot of {column.capitalize()}')
    axes[i].set_xlabel(column.capitalize())
    axes[i].set_ylabel('Count')

# Hide the leftover panels so no empty axes are drawn
for unused_ax in axes[len(plot_columns):]:
    unused_ax.set_visible(False)

# Adjust layout and show plots
plt.tight_layout()
plt.show()


In [None]:
# Encoding categorical variables
# A separate encoder is fitted for every column and kept in label_encoders
# (keyed by column name), so each integer code can be mapped back to its
# original label with label_encoders[col].inverse_transform(...). Re-fitting a
# single shared encoder would retain only the last column's mapping.
# NOTE(review): integer codes impose an arbitrary order on nominal categories,
# which a linear model treats as numeric magnitude — confirm this is intended.
label_encoders = {}
for column in categorical_df.columns:
    label_encoder = LabelEncoder()
    categorical_df[column] = label_encoder.fit_transform(categorical_df[column])
    label_encoders[column] = label_encoder

In [None]:
##### Inspect numerical variables
# Summary statistics for every numeric column.
numerical_df.describe()

In [None]:
## Drop the car_ID and symboling columns
# Both columns are removed from the numeric feature set. The result is rebound
# instead of dropped in place, and errors='ignore' makes re-running this cell
# a no-op rather than a KeyError once the columns are gone.
numerical_df = numerical_df.drop(columns=['car_ID', 'symboling'], errors='ignore')
numerical_df.head(10)

In [None]:
# Distribution of numerical features
numerical_features = [
    'wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight',
    'enginesize', 'boreratio', 'stroke', 'compressionratio', 'horsepower',
    'peakrpm', 'citympg', 'highwaympg', 'price',
]

# One histogram (20 bins, with a KDE curve) per feature on a 3x5 grid;
# panels are numbered from 1 in list order.
plt.figure(figsize=(12, 8))
for panel_no, feature in enumerate(numerical_features, start=1):
    panel_ax = plt.subplot(3, 5, panel_no)
    sns.histplot(data=numerical_df[feature], bins=20, kde=True, ax=panel_ax)
    panel_ax.set_title(feature)
plt.tight_layout()
plt.show()

In [None]:
### Bivariate analysis
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
correlation_matrix = numerical_df.corr()

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    ax=corr_ax,
)
corr_ax.set_title('Correlation Heatmap of Numerical Variables')
plt.show()

In [None]:
##### Combine the categorical and numerical frames
# Column-wise concatenation: categorical columns first, then the numeric ones.
frames_to_join = [categorical_df, numerical_df]
combined_df = pd.concat(frames_to_join, axis='columns')

In [None]:
# Separate the predictors from the target
# y is the 'price' column; X is every other column of combined_df.
target_column = 'price'
y = combined_df[target_column]
X = combined_df.drop(columns=[target_column])

In [None]:
# Report the columns, dtypes and non-null counts of the combined frame.
combined_df.info()

In [None]:
########################### Model fit #####################################

In [None]:
### Train/test split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit an ordinary linear regression on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
model_1 = LinearRegression().fit(X_train, y_train)

# Predicted prices for the held-out rows
y_pred = model_1.predict(X_test)
y_pred

In [None]:
# Evaluate model_1 on both splits to compare in-sample and out-of-sample fit
y_train_pred = model_1.predict(X_train)  # predictions on the training set
y_test_pred = model_1.predict(X_test)    # predictions on the test set

# RMSE is the square root of the mean squared error
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)

# R-squared for each split
train_r2, test_r2 = (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_test_pred),
)

print(f"Train RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")
print(f"Train R-squared: {train_r2}")
print(f"Test R-squared: {test_r2}")

In [None]:
#####################     Model Diagnostics and Assumption Tests      ##############

In [None]:
#### a. Check Linearity Assumption
# Plot Residuals vs Fitted Values to check for linearity. The residuals should scatter randomly around zero.

# Fit the statsmodels OLS model whose residuals the diagnostics below rely on.
# sm.add_constant adds the intercept column, which sm.OLS does not add itself.
# NOTE(review): model_2 was referenced here without ever being defined in the
# visible notebook; fitting it on the training split (the same data model_1
# was trained on) is an assumption — confirm.
X_intercept = sm.add_constant(X_train)
model_2 = sm.OLS(y_train, X_intercept).fit()

# Residuals vs Fitted Values Plot
residuals = model_2.resid
fitted_values = model_2.fittedvalues

plt.scatter(fitted_values, residuals)
plt.axhline(y=0, color='red', linestyle='--')
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs Fitted Values')
plt.show()

## Interpretation: If the residuals show no clear pattern and are evenly distributed around zero, the linearity assumption holds

In [None]:
## b. Check Multicollinearity (Variance Inflation Factor - VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Design matrix with an explicit intercept column, built here so this cell
# does not depend on a variable left over from another cell.
# NOTE(review): the training predictors are assumed to be the intended design
# matrix — confirm.
X_intercept = sm.add_constant(X_train)
design_values = X_intercept.to_numpy(dtype=float)

# Calculate VIF for each column. The index range comes from X_intercept itself:
# it has one more column than X (the constant), so iterating over X.shape[1]
# would yield a VIF list one entry short of the "Feature" column.
vif_data = pd.DataFrame({
    "Feature": X_intercept.columns,
    "VIF": [
        variance_inflation_factor(design_values, col_idx)
        for col_idx in range(design_values.shape[1])
    ],
})

# Display VIF values (the row for the added constant is not a predictor)
print(vif_data)

# Interpretation: VIF > 10 indicates high multicollinearity, which can lead to unreliable coefficient estimates.

In [None]:
## c. Check for Normality of Residuals
from scipy.stats import shapiro

# Q-Q plot of the residuals against a standardized reference line
sm.qqplot(residuals, line='s')
plt.show()

# Shapiro-Wilk test as a numerical complement to the plot;
# the result unpacks into (test statistic, p-value).
shapiro_test = shapiro(residuals)
_, shapiro_p_value = shapiro_test

# Print p-value
print('Shapiro-Wilk test p-value:', shapiro_p_value)

### Interpretation: a p-value > 0.05 means the test does not reject normality of the residuals

In [None]:
###### d. Check for Autocorrelation of Residuals

# Durbin-Watson test
# Computes the statistic from the residuals taken from model_2 in the
# linearity cell and prints it.
durbin_watson_stat = sm.stats.stattools.durbin_watson(residuals)
print('Durbin-Watson Statistic:', durbin_watson_stat)
## Interpretation: A Durbin-Watson statistic close to 2 indicates no autocorrelation in residuals.