# In [None]:  (Jupyter notebook cell marker left over from export)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Step 1: Load and Preprocess Data
df = pd.read_csv(r'C:\Users\HP\Downloads\CarPrices\CarPrice_Assignment.csv')

# Drop rows with missing values, then the row-identifier column
df = df.dropna()
df = df.drop(columns=['car_ID'])

# Convert categorical variables to numeric (one-hot encoding);
# drop_first=True removes one level per categorical column.
df = pd.get_dummies(df, drop_first=True)

# Correlation of every column with the target
correlation_matrix = df.corr()
price_corr = correlation_matrix['price']

# Drop features whose absolute correlation with price is below 0.01.
# A constant (zero-variance) column has an undefined (NaN) correlation, and
# `NaN < 0.01` is False, so such a column would survive the threshold test,
# then be divided by a zero standard deviation below and fill the feature
# matrix with NaN. Treat NaN correlation as "uninformative" as well.
uninformative = price_corr.isna() | (price_corr.abs() < 0.01)
uninformative['price'] = False  # never drop the target itself
low_corr_columns = price_corr[uninformative].index.tolist()
df = df.drop(columns=low_corr_columns)

# Step 2: Standardization (Z-score normalization)
# NOTE(review): the mean/std are computed on the full data set, i.e. before
# the train/test split, so test-set statistics leak into the scaling.
X = df.drop('price', axis=1)
X = (X - X.mean()) / X.std()

# Normalize the target variable 'price' with the same z-score transform
# (pandas' default sample standard deviation).
y = (df['price'] - df['price'].mean()) / df['price'].std()

# Step 3: Train-test split (80-20)
# The split is positional: the first 80% of rows train, the remainder tests.
# NOTE(review): rows are not shuffled; if the CSV is ordered (e.g. by make),
# the test set will not be representative -- confirm against the data.
train_size = int(0.8 * len(df))
X_train, X_test = X.iloc[:train_size].values, X.iloc[train_size:].values
y_train, y_test = y.iloc[:train_size].values, y.iloc[train_size:].values
# Step 4: Linear Regression from Scratch
class LinearRegressionScratch:
    """Linear regression fitted with batch gradient descent.

    The parameter update follows the gradient of the (half) mean squared
    error, while ``cost_history`` records the mean absolute error of each
    iteration as a monitoring metric.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, learning_rate=0.05, iterations=2000):
        """Store the hyper-parameters; no training happens here.

        Args:
            learning_rate: Step size applied to the gradient on each update.
            iterations: Number of gradient-descent steps run by ``fit``.
        """
        self.learning_rate = learning_rate
        self.iterations = iterations
        # Parameter vector [bias, w_1, ..., w_n]; None until fit() is called.
        self.theta = None
        # MAE recorded at each iteration of the most recent fit() call.
        self.cost_history = []

    @staticmethod
    def _with_bias(X):
        """Return X as a 2-D float array with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (samples, features); got {X.ndim} dimension(s)"
            )
        return np.insert(X, 0, 1.0, axis=1)

    def fit(self, X, y):
        """Fit the model parameters to the training data.

        Args:
            X: 2-D array-like of shape (samples, features).
            y: Array-like of targets with one value per sample. A column
                vector of shape (samples, 1) is accepted and flattened.

        Raises:
            ValueError: If X is not 2-D, if there are no samples, or if X
                and y disagree on the number of samples.
        """
        X = self._with_bias(X)
        # Flatten so an (m, 1) target cannot broadcast `X @ theta - y`
        # into an (m, m) matrix.
        y = np.asarray(y, dtype=float).ravel()
        m = len(y)
        if m == 0:
            raise ValueError("Cannot fit on an empty data set")
        if X.shape[0] != m:
            raise ValueError(
                f"X has {X.shape[0]} samples but y has {m} values"
            )

        self.theta = np.zeros(X.shape[1])
        # Start a fresh curve so repeated fit() calls do not concatenate.
        self.cost_history = []

        for _ in range(self.iterations):
            y_pred = X.dot(self.theta)
            error = y_pred - y
            gradient = (1 / m) * X.T.dot(error)
            self.theta -= self.learning_rate * gradient

            # MAE of the predictions made *before* this iteration's update.
            cost = (1 / m) * np.sum(np.abs(error))
            self.cost_history.append(cost)

    def predict(self, X):
        """Return predictions for X using the fitted parameters.

        Args:
            X: 2-D array-like of shape (samples, features), with the same
                number of features that was passed to ``fit``.

        Returns:
            1-D numpy array with one prediction per row of X.

        Raises:
            RuntimeError: If called before ``fit``.
            ValueError: If X is not 2-D.
        """
        if self.theta is None:
            raise RuntimeError(
                "Model is not fitted yet; call fit() before predict()"
            )
        return self._with_bias(X).dot(self.theta)

# Step 5: Train the Model
model = LinearRegressionScratch(learning_rate=0.05, iterations=2000)
model.fit(X_train, y_train)

# Step 6: Predict on Test Data
y_pred = model.predict(X_test)

# Step 7: De-normalize Predictions and Actual Prices
# The target was z-score normalised as (price - mean) / std, so the inverse
# is value * std + mean. (A min-max inverse would put the plotted prices on
# the wrong scale.)
price_mean = df['price'].mean()
price_std = df['price'].std()

y_pred_denorm = y_pred * price_std + price_mean
y_test_denorm = y_test * price_std + price_mean

# Step 8: Model Evaluation (Mean Squared Error and MAE)
# Both metrics are computed on the normalized target scale.
mse = np.mean((y_pred - y_test) ** 2)
mae = np.mean(np.abs(y_test - y_pred))
print(f'Mean Squared Error on test set (normalized): {mse}')
print(f'Mean Absolute Error: {mae}')

# Step 9: Calculate R-squared and Adjusted R-squared
ss_total = np.sum((y_test_denorm - np.mean(y_test_denorm)) ** 2)
ss_res = np.sum((y_test_denorm - y_pred_denorm) ** 2)
r_squared = 1 - (ss_res / ss_total)

# Adjusted R-squared divides by the residual degrees of freedom n - p - 1.
# With no more test rows than features + 1 this is zero or negative and the
# statistic is undefined, so report NaN instead of a meaningless number.
n_samples = len(y_test_denorm)
n_features = X_test.shape[1]
residual_dof = n_samples - n_features - 1
if residual_dof > 0:
    adjusted_r_squared = 1 - (1 - r_squared) * (n_samples - 1) / residual_dof
else:
    adjusted_r_squared = np.nan

print(f'R-squared: {r_squared}')
print(f'Adjusted R-squared: {adjusted_r_squared}')

# Step 10: Visualize Predictions vs Actual Values
plt.scatter(y_test_denorm, y_pred_denorm)
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Actual vs Predicted Car Prices')
plt.show()

# Step 11: Plot the Cost Function vs Iterations
plt.plot(range(1, len(model.cost_history) + 1), model.cost_history)
plt.xlabel('Iteration')
plt.ylabel('Cost (MAE)')
plt.title('Cost Function vs Iterations')
plt.show()
