# House Price Prediction

## Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from transformations import Transformations

## Load Datasets

In [2]:
# Read key/value pairs from a .env file into the process environment; the
# dataset paths used by the loading cell are looked up from the environment.
load_dotenv()


True

In [3]:
# Dataset locations come from the TEST_DATA / TRAIN_DATA environment variables
# (typically supplied through the .env file read by load_dotenv()).
# Index os.environ directly: a missing variable then fails with a KeyError that
# names it, instead of os.getenv returning None and pd.read_csv failing on a
# None path.
test = pd.read_csv(os.environ['TEST_DATA'])
train = pd.read_csv(os.environ['TRAIN_DATA'])

In [4]:
# Target vector: the SalePrice column of the training frame.
y_train = train['SalePrice']
# Predictor matrix: every remaining column of the training frame.
X_train = train.drop('SalePrice', axis=1)
# The test frame is used as-is (same object, not a copy).
X_test = test

## EDA

In [5]:
# Pair each predictor column with its dtype once, then split the column names
# into float64/int64 columns and object-typed columns. A column with any other
# dtype ends up in neither list.
column_dtypes = list(zip(X_train.columns, X_train.dtypes))
numerical_features = [column for column, dtype in column_dtypes if dtype in ('float64', 'int64')]
categorical_features = [column for column, dtype in column_dtypes if dtype == 'object']

In [6]:
# Build the project's Transformations helper, passing the categorical column
# names first and the numerical column names second.
# NOTE(review): what the helper does with each group is defined in the local
# `transformations` module, not in this notebook.
transformer = Transformations(categorical_features, numerical_features)

### Feature Preparation

In [7]:
# Fit the transformer on the training predictors and transform them in one
# call, then apply the already-fitted transformer to the test predictors.
# NOTE(review): assumes Transformations follows the scikit-learn
# fit_transform/transform contract — confirm in the transformations module.
X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)

## Model

In [8]:
# Linear regression trained on the transformed training matrix. fit() returns
# the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train_transformed, y_train)

# Predictions for the transformed test matrix.
y_pred = model.predict(X_test_transformed)

### Model Evaluation

In [None]:
# Hold-out evaluation: fit preprocessing and a regressor on 80% of the
# training data, then score on the remaining 20%.

# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' encodes categories that were not
# seen during fit as all zeros instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ]
)

# Fixed random_state so the split, and therefore the scores, are reproducible.
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Fit the preprocessing on the training portion only, then apply the same
# fitted transformations to the validation portion.
X_train_sub_transformed = preprocessor.fit_transform(X_train_sub)
X_val_transformed = preprocessor.transform(X_val)

# Use a dedicated estimator for validation. Refitting the shared `model` here
# would overwrite the regressor trained on X_train_transformed, leaving it
# fitted on a different feature matrix than the one used for the test-set
# predictions.
val_model = LinearRegression()
val_model.fit(X_train_sub_transformed, y_train_sub)

# Predict and evaluate on the held-out rows.
# NOTE(review): the output stored with this cell shows an enormous MSE and a
# hugely negative R²; that would be consistent with an ill-conditioned,
# unregularised fit on the one-hot columns — confirm, and consider a
# regularised model if it reproduces.
y_val_pred = val_model.predict(X_val_transformed)
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)

print(f"Validation MSE: {mse:.4f}")
print(f"Validation R²: {r2:.4f}")

0
Series([], dtype: int64)
0
Validation MSE: 202749773836038354852644913152.0000
Validation R²: -26433005371539615744.0000
