In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold,train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [5]:
# Read the housing CSV into a DataFrame and preview its first rows.
df = pd.read_csv("USA_Housing.csv")
df.head()


Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5


In [6]:
# The `Price` column is the regression target; every other column is a predictor.
y = df["Price"]
X = df.drop("Price", axis=1)

In [7]:
# Accumulators for per-fold results.
beta_list = []
r2_scores = []
mse_scores = []

# Standardize every predictor column.
# NOTE(review): the scaler is fitted on all rows before any split, so rows that
# are later held out still influence the scaling statistics — consider fitting
# on training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

model = LinearRegression()

In [8]:
# Five shuffled folds; the fixed seed makes the fold assignment reproducible.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [9]:
# Start from empty accumulators so re-running this cell does not stack a second
# set of fold results on top of an earlier run.
mse_scores.clear()
r2_scores.clear()
beta_list.clear()

# KFold yields *positional* row indices. Work on a flat ndarray of the target so
# rows are always selected by position, never by index label.
y_values = np.asarray(y).ravel()

for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y_values[train_idx], y_values[test_idx]

    # Add intercept column (bias term)
    X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

    # Step d: normal equation (X'X) beta = X'y, solved as a linear system
    # rather than by forming the explicit inverse (same solution, better
    # numerical behaviour).
    beta = np.linalg.solve(X_train_bias.T @ X_train_bias, X_train_bias.T @ y_train)

    # Predictions on the held-out fold
    y_pred = X_test_bias @ beta

    # Per-fold metrics and coefficients
    mse_scores.append(mean_squared_error(y_test, y_pred))
    r2_scores.append(r2_score(y_test, y_pred))
    beta_list.append(beta)

In [10]:
# Keep the coefficient vector from the fold with the highest R2.
best_idx = np.asarray(r2_scores).argmax()
best_beta = beta_list[best_idx]

print("R2 Scores for 5 folds:", r2_scores)
print("Best Beta Matrix (with max R2):\n", best_beta)

R2 Scores for 5 folds: [0.9179971706985147, 0.9145677884802819, 0.9116116385364478, 0.9193091764960816, 0.9243869413350316]
Best Beta Matrix (with max R2):
 [1.23161736e+06 2.30225051e+05 1.63956839e+05 1.21115120e+05
 7.83467170e+02 1.50662447e+05]


In [11]:

# Carve a 30% hold-out from the standardized rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

# Prepend the intercept column of ones to both partitions.
X_train_bias = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_test_bias = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

# Score the previously selected coefficients on the hold-out rows.
# NOTE(review): best_beta was fitted on folds of this same X_scaled, so the
# hold-out presumably overlaps rows it was trained on — the score is likely
# optimistic.
y_pred_final = X_test_bias @ best_beta
final_r2 = r2_score(y_test, y_pred_final)

print("Final R2 Score on 30% Test Data:", final_r2)

Final R2 Score on 30% Test Data: 0.9147458156636433


In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Pull predictors and target out of the DataFrame as NumPy arrays
# (the target is shaped as a column vector).
y = df["Price"].to_numpy().reshape(-1, 1)
X = df.drop(columns=["Price"]).to_numpy()

# First cut: 56% train, 44% remainder.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.44, random_state=42
)

# Second cut: 30/44 of the remainder becomes the test set, the rest validation.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=30/44, random_state=42
)

# Scaling statistics come from the training rows only and are then applied
# unchanged to the validation and test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

# Prepend the intercept column of ones to each partition.
X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_val = np.hstack([np.ones((X_val.shape[0], 1)), X_val])
X_test = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

# Gradient Descent function
def gradient_descent(X, y, alpha, iterations=1000):
    """Fit linear-regression coefficients with batch gradient descent.

    Starting from an all-zero coefficient vector, repeatedly steps against the
    mean-squared-error gradient ``(1/m) * X.T @ (X @ beta - y)``.

    Parameters
    ----------
    X : array of shape (m, n)
        Design matrix. Include a column of ones if an intercept is wanted.
    y : array-like of shape (m,) or (m, 1)
        Target values. A 1-D input is reshaped to a column; otherwise the
        residual ``X @ beta - y`` would broadcast to an (m, m) matrix and the
        result would be meaningless.
    alpha : float
        Learning rate (step size).
    iterations : int, default 1000
        Number of gradient steps to take.

    Returns
    -------
    numpy.ndarray
        Coefficients of shape (n, 1) for a single target column.

    Raises
    ------
    ValueError
        If ``y`` does not provide one row per row of ``X``.
    """
    m, n = X.shape

    y = np.asarray(y)
    if y.ndim == 1:
        # Force a column vector so the residual keeps shape (m, 1).
        y = y.reshape(-1, 1)
    if y.shape[0] != m:
        raise ValueError(
            f"y has {y.shape[0]} rows but X has {m}; they must match"
        )

    beta = np.zeros((n, 1))
    for _ in range(iterations):
        gradients = (1/m) * (X.T @ (X @ beta - y))
        beta = beta - alpha * gradients
    return beta

# Candidate step sizes; one model is fitted per value.
learning_rates = [0.001, 0.01, 0.1, 1]
results = []

for lr in learning_rates:
    beta = gradient_descent(X_train, y_train, alpha=lr, iterations=1000)

    # Predictions for the validation and test partitions
    y_val_pred, y_test_pred = X_val @ beta, X_test @ beta

    # R2 on each partition
    r2_val = r2_score(y_val, y_val_pred)
    r2_test = r2_score(y_test, y_test_pred)

    # Each record: (learning rate, flat coefficient vector, val R2, test R2)
    results.append((lr, beta.flatten(), r2_val, r2_test))

# Per-learning-rate report
for lr, beta, r2_val, r2_test in results:
    print(f"\nLearning Rate: {lr}")
    print("Coefficients (Beta):", beta[:5], "...")  # first five entries only
    print("R2 Score (Validation):", r2_val)
    print("R2 Score (Test):", r2_test)

# The winner is the record with the highest validation R2 (position 2).
best_model = max(results, key=lambda record: record[2])

print("\nBest Model:")
print("Learning Rate:", best_model[0])
print("Best Beta:", best_model[1])
print("Best Validation R2:", best_model[2])
print("Test R2 with Best Beta:", best_model[3])


Learning Rate: 0.001
Coefficients (Beta): [774640.34886253 145451.2595162  104480.77963542  71382.2744303
  25993.88625322] ...
R2 Score (Validation): -1.0004518442472916
R2 Score (Test): -0.9212201229291082

Learning Rate: 0.01
Coefficients (Beta): [1225053.45844097  231802.79904472  165980.38171552  120446.24749517
    3240.16976372] ...
R2 Score (Validation): 0.9199464700170698
R2 Score (Test): 0.9134381229635951

Learning Rate: 0.1
Coefficients (Beta): [1225106.34781021  231827.54854547  166006.22902472  120763.07797071
    2922.26769971] ...
R2 Score (Validation): 0.9199649194854793
R2 Score (Test): 0.9134494051887397

Learning Rate: 1
Coefficients (Beta): [1225106.34781021  231827.54854547  166006.22902472  120763.07797071
    2922.26769971] ...
R2 Score (Validation): 0.9199649194854793
R2 Score (Test): 0.9134494051887397

Best Model:
Learning Rate: 0.1
Best Beta: [1225106.34781021  231827.54854547  166006.22902472  120763.07797071
    2922.26769971  152609.02782229]
Best Valida

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

In [14]:
# Column names supplied by hand and passed to read_csv via `names=`.
columns = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_doors", "body_style", "drive_wheels", "engine_location",
    "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
    "city_mpg", "highway_mpg", "price",
]

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

# "?" entries are read as missing values (NaN).
data = pd.read_csv(url, header=None, names=columns, na_values=["?"])

In [15]:
# Drop rows with no target value. The explicit copy makes the column
# assignments below operate on an independent frame.
data = data.dropna(subset=["price"]).copy()

# Columns that are categorical by design and must stay as text.
categorical_cols = [
    "make", "fuel_type", "aspiration", "num_doors", "body_style",
    "drive_wheels", "engine_location", "engine_type", "num_cylinders",
    "fuel_system",
]

# Any other column that was not parsed as numeric is coerced to numbers
# (unparseable entries become NaN). Selecting by "not numeric" does not rely
# on text columns having dtype `object`.
numeric_cols = data.select_dtypes(exclude="number").columns.difference(categorical_cols)

for col in numeric_cols:
    data[col] = pd.to_numeric(data[col], errors="coerce")

# Impute missing values: mean for numeric columns, most frequent value otherwise.
for col in data.columns:
    if pd.api.types.is_numeric_dtype(data[col]):
        data[col] = data[col].fillna(data[col].mean())
    else:
        data[col] = data[col].fillna(data[col].mode()[0])

In [None]:
# (i) num_doors & num_cylinders: spelled-out counts -> integers.
# `map` does a plain dictionary lookup; a word missing from the dictionary
# becomes NaN and makes `astype(int)` fail loudly instead of slipping through.
word_to_num = {"two": 2, "three": 3, "four": 4, "five": 5, "six": 6,
               "eight": 8, "twelve": 12}
for col in ("num_doors", "num_cylinders"):
    data[col] = data[col].map(word_to_num).astype(int)

# (ii) Dummy encoding
data = pd.get_dummies(data, columns=["body_style", "drive_wheels"], drop_first=True)

# (iii) Label encoding — one encoder per column, kept in `label_encoders`
# so each column's code -> label mapping stays available afterwards.
label_cols = ["make", "aspiration", "engine_location", "fuel_type"]
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# (iv) fuel_system (contains "pfi" -> 1 else 0), literal substring match
data["fuel_system"] = data["fuel_system"].str.contains("pfi", regex=False).astype(int)

# (v) engine_type (contains "ohc" -> 1 else 0), literal substring match
data["engine_type"] = data["engine_type"].str.contains("ohc", regex=False).astype(int)

# Step 4: Features & Target
X = data.drop("price", axis=1)
y = data["price"].astype(float)

# Step 5: Linear Regression (70-30 split)
# Split BEFORE fitting any preprocessing so the test rows never influence the
# scaling statistics or the principal components.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Scale features: statistics come from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_original = r2_score(y_test, y_pred)

print("R² score without PCA:", r2_original)

# Step 6: PCA + Linear Regression
# The components are learned from the training rows and then applied to the
# test rows; the same split is reused so both scores are comparable.
pca = PCA(n_components=0.95)  # keep 95% variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

model_pca = LinearRegression()
model_pca.fit(X_train_pca, y_train)
y_pred_pca = model_pca.predict(X_test_pca)
r2_pca = r2_score(y_test, y_pred_pca)

print("R² score with PCA:", r2_pca)

# Full-dataset views, produced with the train-fitted transformers, kept under
# their previous names for any code that still refers to them.
X_scaled = scaler.transform(X)
X_reduced = pca.transform(X_scaled)