In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [3]:
# Read the raw CSV and discard its 'No' column in a single expression
df = pd.read_csv('Real estate.csv').drop(columns=['No'])

In [4]:
# Bare last expression: renders the loaded frame as this cell's output
df

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2012.917,19.5,306.59470,9,24.98034,121.53951,42.2
2,2013.583,13.3,561.98450,5,24.98746,121.54391,47.3
3,2013.500,13.3,561.98450,5,24.98746,121.54391,54.8
4,2012.833,5.0,390.56840,5,24.97937,121.54245,43.1
...,...,...,...,...,...,...,...
409,2013.000,13.7,4082.01500,0,24.94155,121.50381,15.4
410,2012.667,5.6,90.45606,9,24.97433,121.54310,50.0
411,2013.250,18.8,390.96960,7,24.97923,121.53986,40.6
412,2013.000,8.1,104.81010,5,24.96674,121.54067,52.5


In [7]:
# Derived features: three extra columns written onto df in place.
# NOTE(review): once PolynomialFeatures runs with degree=2 it also emits a
# house-age-squared term and a latitude*longitude term, so 'age_squared' and
# 'lat_long_interaction' look redundant with those -- confirm this is intended.

# log(1 + x) of the MRT-distance column
df['distance_to_mrt_log'] = df['X3 distance to the nearest MRT station'].pipe(np.log1p)
# house age raised to the second power
df['age_squared'] = df['X2 house age'].pow(2)
# element-wise product of the latitude and longitude columns
df['lat_long_interaction'] = df['X5 latitude'].mul(df['X6 longitude'])

In [8]:
# Target is the unit-area price column; the feature matrix is every other column of df
y = df.loc[:, 'Y house price of unit area']
X = df.drop('Y house price of unit area', axis='columns')

In [9]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [7]:
# Three-stage estimator: polynomial expansion -> standardisation -> ridge regression.
# The step names ('poly', 'scaler', 'model') are the prefixes used when
# addressing step parameters as '<step>__<parameter>'.
pipeline = Pipeline(
    steps=[
        ('poly', PolynomialFeatures(include_bias=False)),  # no constant column added
        ('scaler', StandardScaler()),                      # scales the expanded features
        ('model', Ridge()),                                # L2-regularised linear model
    ]
)

In [8]:
# Search space for hyperparameter tuning; each key is '<pipeline step>__<parameter>'
param_grid = dict(
    # polynomial expansion: degree 1 (linear terms only) or degree 2 (quadratic)
    poly__degree=[1, 2],
    # ridge regularisation strengths, 0.001 up to 1000 in factor-of-ten steps
    model__alpha=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    # single candidate, so it is held fixed during the search.
    # NOTE(review): presumably only matters when Ridge uses an iterative
    # solver -- confirm against the scikit-learn Ridge docs
    model__max_iter=[10000],
)


In [9]:
# Exhaustive search over param_grid: every candidate is scored by R² under
# 5-fold cross-validation on the training split
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,  # run candidate fits in parallel
)
# Kept as the final expression so the fitted search object is the cell output
grid_search.fit(X_train, y_train)

In [10]:
# Take the winning pipeline from the search and score it on the held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report the selected hyperparameters followed by the test-set R² (4 decimals)
print(
    f"Best Parameters: {grid_search.best_params_}",
    f"Test R² Score: {r2:.4f}",
    sep="\n",
)


Best Parameters: {'model__alpha': 10, 'model__max_iter': 10000, 'poly__degree': 2}
Test R² Score: 0.7036


In [11]:
# Pair each fitted Ridge coefficient with the name of the expanded feature it
# belongs to, then list the ten largest by magnitude. The 'scaler' step sits
# between 'poly' and 'model', so the coefficients are on the standardised scale.
coefficients = best_model.named_steps['model'].coef_
# Let the fitted 'poly' step supply the input names it recorded during fit
# instead of reading them from the global X: X is reassigned elsewhere in this
# notebook with a different column set, and a stale X would break this lookup.
feature_names = best_model.named_steps['poly'].get_feature_names_out()
feature_importance = pd.Series(coefficients, index=feature_names)
print("\nTop 10 Features:")
# Ranked by absolute value; the printed numbers are magnitudes, so the sign
# (direction of the effect) is not shown here.
print(feature_importance.abs().sort_values(ascending=False).head(10))



Top 10 Features:
age_squared^2                                                             2.514134
X3 distance to the nearest MRT station X4 number of convenience stores    1.906411
X2 house age distance_to_mrt_log                                          1.667175
distance_to_mrt_log^2                                                     1.442774
X4 number of convenience stores^2                                         1.343564
X2 house age age_squared                                                  1.267184
X4 number of convenience stores distance_to_mrt_log                       1.224913
X4 number of convenience stores age_squared                               1.200445
X2 house age X3 distance to the nearest MRT station                       1.126494
X2 house age lat_long_interaction                                         1.038079
dtype: float64


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Baseline: ordinary linear regression on the columns as they come from the CSV.
# NOTE(review): this cell rebinds df, X, y and the train/test variables that the
# Ridge grid-search cells also use, and splits with random_state=42 where those
# cells use 10 -- the two R² scores are therefore measured on different test
# rows. Confirm whether that is intended before comparing them.

# Read the CSV and discard the 'No' ID column in one step
df = pd.read_csv('Real estate.csv').drop(columns=['No'])

# Target is the unit-area price; features are all remaining columns
y = df.loc[:, 'Y house price of unit area']
X = df.drop('Y house price of unit area', axis='columns')

# Hold out 20% of the rows for evaluation (seeded, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training rows only and
# then applied unchanged to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the linear model on the scaled training data
model = LinearRegression().fit(X_train_scaled, y_train)

# Predict the held-out rows and report R² to four decimals
y_pred = model.predict(X_test_scaled)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R² Score: {r2:.4f}")

# Fitted parameters, with coefficients in the same order as X's columns
intercept = model.intercept_
coefficients = model.coef_
feature_names = X.columns

# Print the intercept, then one "<feature>: <coefficient>" line per column
print("\nIntercept:", intercept)
print("Coefficients:")
for name, coef in zip(feature_names, coefficients):
    print(f"{name}: {coef:.4f}")


R² Score: 0.6811

Intercept: 38.39154078549889
Coefficients:
X1 transaction date: 1.5296
X2 house age: -3.0627
X3 distance to the nearest MRT station: -5.7869
X4 number of convenience stores: 3.2189
X5 latitude: 2.8551
X6 longitude: -0.4410
