In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import Lasso, Ridge
import xgboost as xgb

import warnings
warnings.simplefilter(action='ignore')


In [None]:
#  Dataset Files
def list_files(directory):
    """Print the path of every file found under ``directory``.

    The tree is traversed with ``os.walk`` and each file is printed as the
    directory path joined with the file name, one path per line. Nothing is
    returned.
    """
    found = (
        os.path.join(root, name)
        for root, _subdirs, names in os.walk(directory)
        for name in names
    )
    for path in found:
        print(path)

# Directory that holds the competition CSV files
data_dir = "data"
list_files(data_dir)

# Load Data
# Build the paths with os.path.join: plain string concatenation of "data" and
# 'train.csv' produces "datatrain.csv" (no separator) and the read fails.
train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
test = pd.read_csv(os.path.join(data_dir, 'test.csv'))


In [None]:
# data Overview
def data_summary(df, name):
    """Print a short overview of ``df``: a label, its shape and its first rows.

    Parameters
    ----------
    df : DataFrame to describe.
    name : Label printed on the first line.
    """
    header = f"Dataset: {name}"
    dimensions = f"Shape: {df.shape}"
    preview = df.head()
    print(header)
    print(dimensions)
    # The trailing "\n" argument leaves a blank line after the preview.
    print(preview, "\n")

# Print an overview of both datasets
for frame, label in ((train, "Train"), (test, "Test")):
    data_summary(frame, label)

# Dropping unnecessary columns: 'Id' is removed in place from both frames
for frame in (train, test):
    frame.drop(columns=['Id'], inplace=True)

In [None]:
# data Visualization
def three_chart_plot(df, feature):
    """Show three side-by-side views of one column's distribution.

    Draws, on a single 1x3 figure, a histogram with a KDE overlay, a
    probability plot (``scipy.stats.probplot``) and a box plot of
    ``df[feature]``, then displays the figure.

    Parameters
    ----------
    df : DataFrame containing the column.
    feature : Name of the column to plot.
    """
    values = df[feature]
    _, (hist_ax, prob_ax, box_ax) = plt.subplots(1, 3, figsize=(15, 5))

    hist_plot = sns.histplot(values, kde=True, ax=hist_ax)
    hist_plot.set_title('Histogram')

    stats.probplot(values, plot=prob_ax)

    box_plot = sns.boxplot(y=values, ax=box_ax)
    box_plot.set_title('Box Plot')

    plt.show()

# Inspect the distribution of the target column
three_chart_plot(df=train, feature='SalePrice')


In [None]:
# missing Values
def missing_values_plot(data, threshold=20):
    """Bar-plot the percentage of missing values in each column of ``data``.

    Columns are ordered from most to least missing, and a dashed horizontal
    line is drawn at ``threshold`` (in percent) as a visual reference.

    Parameters
    ----------
    data : DataFrame to inspect.
    threshold : Height (percent) of the reference line; defaults to 20.
    """
    pct_missing = data.isna().mean().mul(100)
    ranked = pct_missing.sort_values(ascending=False)
    ranked.plot(kind='bar', color='red')
    plt.axhline(y=threshold, color='blue', linestyle='--')
    plt.title("Missing Values Percentage per Column")
    plt.show()

# Visualise missingness in the training data (default reference line)
missing_values_plot(data=train)

# data preprocessing
def preprocess_data(df):
    """Fill missing values in numeric columns with that column's median.

    The frame is modified in place and the same object is returned, so
    existing ``df = preprocess_data(df)`` call sites keep working.
    Non-numeric columns are left untouched; their missing values remain.

    Parameters
    ----------
    df : DataFrame to impute.

    Returns
    -------
    The same DataFrame object, with numeric gaps filled.
    """
    # numeric_only=True is required: since pandas 2.0, median() no longer
    # skips text/object columns silently and raises TypeError instead.
    medians = df.median(numeric_only=True)
    df.fillna(medians, inplace=True)
    return df

# Impute both frames (train first, then test)
train, test = preprocess_data(train), preprocess_data(test)

# feature engineering
def add_features(df):
    """Add the derived columns ``TotalSF`` and ``TotalBathrooms`` to ``df``.

    ``TotalSF`` is the sum of the basement, first-floor and second-floor
    area columns. ``TotalBathrooms`` counts full baths as 1 and half baths
    as 0.5, for both the above-ground and basement columns. The columns are
    written onto ``df`` itself, which is also returned.
    """
    basement_sf = df['TotalBsmtSF']
    floors_sf = basement_sf + df['1stFlrSF']
    df['TotalSF'] = floors_sf + df['2ndFlrSF']

    half_weight = 0.5
    above_ground = df['FullBath'] + half_weight * df['HalfBath']
    with_basement_full = above_ground + df['BsmtFullBath']
    df['TotalBathrooms'] = with_basement_full + half_weight * df['BsmtHalfBath']
    return df
# Add the engineered columns to both frames (train first, then test)
train, test = map(add_features, (train, test))


In [None]:
# Train-Test Split
y = train['SalePrice']

# Keep only numeric predictors: the estimators fitted below cannot be trained
# on text columns and raise as soon as one is present.
# NOTE(review): assumes the CSVs may contain non-numeric columns (not visible
# here); this is a no-op when every column is already numeric.
# TODO: encode the dropped categorical columns instead of discarding them.
X = train.drop(columns=['SalePrice']).select_dtypes(include='number')

# Give the competition test frame exactly the same columns, in the same order,
# so that it can be passed to the fitted models later.
test = test[X.columns]

# Hold out 25% of the labelled rows for validation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Model Evaluation Function
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Print the RMSE of a fitted ``model`` on the train and test splits.

    Parameters
    ----------
    model : Fitted estimator exposing ``predict``.
    X_train, y_train : Features and targets the model was trained on.
    X_test, y_test : Held-out features and targets.

    Nothing is returned; both scores are written to stdout.
    """
    # Run both predictions first, then score and report them.
    fitted_values = model.predict(X_train)
    holdout_values = model.predict(X_test)

    train_rmse = np.sqrt(mean_squared_error(y_train, fitted_values))
    print(f"Train RMSE: {train_rmse}")

    test_rmse = np.sqrt(mean_squared_error(y_test, holdout_values))
    print(f"Test RMSE: {test_rmse}\n")


In [None]:
# Train Models
# Candidate regressors, keyed by the name used in the progress output
lasso_model = Lasso(alpha=0.1)
ridge_model = Ridge(alpha=0.1)
xgb_model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.1)
models = {"Lasso": lasso_model, "Ridge": ridge_model, "XGBoost": xgb_model}

# Fit each candidate on the training split and report its train/test RMSE
for name in models:
    model = models[name]
    print(f"Training {name} Model...")
    model.fit(X_train, y_train)
    evaluate_model(model, X_train, y_train, X_test, y_test)

# Make Predictions
final_model = models["Ridge"]
test_pred = final_model.predict(test)

# The 'Id' column was dropped from `test` earlier, and `test.index` is only the
# 0-based row position assigned when the CSV was read, so it must not be used
# as the identifier. Re-read the real Ids from the source file; no visible step
# reorders or filters the rows of `test`, so they line up with the predictions
# (the DataFrame constructor raises if the lengths ever differ).
test_ids = pd.read_csv(os.path.join(data_dir, 'test.csv'), usecols=['Id'])['Id']

submission = pd.DataFrame({'Id': test_ids.to_numpy(), 'SalePrice': test_pred})
submission.to_csv('submission.csv', index=False)
print("Submission file created!")