In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor



In [None]:
# Read the training and test CSVs; both paths are relative to the
# notebook's working directory.
train_path = '../data/train.csv'
test_path = '../data/test.csv'
train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

## EDA

In [None]:
# Show the (rows, columns) of the train and test frames, one per line.
print(train_data.shape, test_data.shape, sep='\n')

In [None]:
# Stack the training features (target column 'SalePrice' removed) on top of
# the test rows into a single frame. ignore_index=True renumbers the rows
# 0..N-1, so the first train_data.shape[0] rows come from train_data and the
# remaining rows come from test_data.
# (This statement previously appeared in two consecutive cells; one is enough.)
X = pd.concat([train_data.drop(columns=['SalePrice']), test_data], ignore_index=True)

In [None]:
# Count the missing cells in X and draw the null mask as a heatmap
null_mask = X.isnull()
print(f"missing values: {null_mask.values.sum()}")
fig, ax = plt.subplots(figsize=(16, 9))
sns.heatmap(null_mask, ax=ax)
plt.show()

In [None]:
# Percentage of missing entries per column (missing count / row count * 100)
n_rows = len(X)
null_percent = X.isnull().sum() / n_rows * 100
null_percent

In [None]:
# Names of the columns in which more than 50% of the values are missing
mostly_missing = null_percent > 50
col_to_drop = null_percent.loc[mostly_missing].index
col_to_drop

In [None]:
# Remove the columns listed in col_to_drop from X
X = X.drop(columns=col_to_drop.tolist())

In [None]:
# Display the (rows, columns) of X after the column drop
X.shape

In [None]:
# Restrict X to the columns that still contain at least one null, then split
# those column names into numeric and non-numeric groups.
has_null = X.isnull().any()
incomplete = X.loc[:, has_null]
numerical_cols = incomplete.select_dtypes(include='number').columns
categorical_cols = incomplete.select_dtypes(exclude='number').columns

print("# Numerical columns with null values:", len(numerical_cols))
print("# Categorical columns with null values:", len(categorical_cols))

In [None]:
# Fill the nulls of each numeric column with that column's mean.
# NOTE(review): X still holds the training and test rows stacked together
# here, so each mean is computed over both sets.
for num_col in numerical_cols:
    col_mean = X[num_col].mean()
    X[num_col] = X[num_col].fillna(col_mean)

In [None]:
# Fill the nulls of each non-numeric column with its most frequent value.
# mode() can return several values on ties; the first one is used.
for cat_col in categorical_cols:
    most_frequent = X[cat_col].mode()[0]
    X[cat_col] = X[cat_col].fillna(most_frequent)

In [None]:
# Confirm that imputation left no nulls behind (prints nothing otherwise)
any_missing = X.isnull().values.any()
if not any_missing:
    print("\nThere are no missing values.")

In [None]:
# One-hot encode X with pandas' default column selection (object, string and
# category columns); the shape is printed before and after to show how many
# indicator columns were added.
print(X.shape)
X = pd.get_dummies(X)
print(X.shape)

In [None]:
# Undo the earlier stacking: the first len(train_data) rows of X are the
# training rows, everything after them is the test set. Both slices are taken
# from the combined frame before X is rebound.
n_train = len(train_data)
X, test_data_transform = X.iloc[:n_train].copy(), X.iloc[n_train:].copy()

In [None]:
# Independent variables and target.
# 'Id' is removed from the feature matrix. errors='ignore' keeps this cell
# re-runnable: on a second execution 'Id' is already gone and a plain drop
# would raise KeyError.
X = X.drop(columns=['Id'], errors='ignore')
y = train_data['SalePrice']

# Features and target must describe the same rows before they are split.
assert len(X) == len(y), "X and y have a different number of rows"

print(X.shape)
print(y.shape)

In [None]:
# Check the distribution of the target value.
# sns.distplot is deprecated; histplot with kde=True and stat='density'
# draws the equivalent density-scaled histogram with a KDE curve on top.
plt.figure(figsize=(16,9))
bar = sns.histplot(train_data['SalePrice'], kde=True, stat='density')
plt.show()

In [None]:
# Natural log of the target.
# NOTE(review): y_log is not passed to the split below -- the model is fitted
# on the untransformed y. Confirm whether training on y_log was intended.
y_log = np.log(y)

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize the features: the scaler's statistics are learned from the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Show the (rows, columns) of the training and hold-out feature matrices
print(X_train.shape, X_test.shape, sep='\n')

## Random Forest

In [None]:
# Random Forest regressor with 100 trees; random_state makes the fit repeatable
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

In [None]:
# Fit the forest on the standardized training features and the raw target
rf_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict on the hold-out split and score it.
# The error is computed between the natural logs of the actual and predicted
# values, so the printed figure is an RMSE in log space.
y_test_pred_rf = rf_model.predict(X_test_scaled)
log_actual, log_predicted = np.log(y_test), np.log(y_test_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(log_actual, log_predicted))
rounded_rmse_rf = round(rmse_rf, 4)
print(f'Root Mean Squared Error on test Set (Random Forest): {rounded_rmse_rf}')


In [8]:
import joblib

In [9]:
# Restore the fitted scaler and model from disk.
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load
# artifacts that come from a trusted source.
scaler = joblib.load('data/scaler.joblib')
rf_model = joblib.load('data/model.joblib')

# Preprocessed test rows; 'Id' is not a model feature, so it is dropped
# before scaling.
test_data_transform = pd.read_parquet('./data/test_prep.parquet')
test_X = test_data_transform.drop(columns=['Id'])
test_X_scaled = scaler.transform(test_X)
test_preds_rf = rf_model.predict(test_X_scaled)

# Pair each prediction with the Id read from the raw test CSV and save.
# NOTE(review): this assumes test.csv and test_prep.parquet list the same
# rows in the same order -- confirm.
test_data = pd.read_csv('./data/test.csv')
result = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_preds_rf})
result.to_csv('data/predictions.csv', index=False)