Predict Housing Price

Preprocess Dataset Cleaning
1. Identify missing rows
2. Drop the missing rows
3. Identify duplate values
4. Drop Duplicate values

In [2]:
import pandas as pd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#identify missing values
df = pd.read_csv('train.csv')

# Load your dataset
df = pd.read_csv('train.csv')
#*********OUTLIERS*********************
# Select only the numerical columns
numerical_df = df.select_dtypes(include=[np.number])

# Calculate the Z-scores for each numerical column
z_scores = np.abs(stats.zscore(numerical_df))

# Set a threshold for identifying outliers
threshold = 3

# Assuming you want to replace outliers based on z-score
# and replace them with median for the column
upper_limit = numerical_df['MSSubClass'].mean() + threshold * numerical_df['MSSubClass'].std()
median = numerical_df['MSSubClass'].median()
df['MSSubClass'] = df['MSSubClass'].apply(lambda x: median if x > upper_limit else x)
#print(df['MSSubClass'])

# Recalculate the Z-scores after imputation
z_scores_post_imputation = np.abs(stats.zscore(df[['MSSubClass']]))

# Identify outliers after imputation
outliers_after = (z_scores_post_imputation > threshold).sum(axis=0)

# Print out the number of outliers before and after imputation
print(f"Outliers before imputation: {df['MSSubClass']}")
print(f"Outliers after imputation: {outliers_after}")

#***********************OUTLIERS***************************************************************

#************************MISSING VALUES******************************************
missing_values = df.isnull().sum()
# Instead of dropping rows with missing values, impute with most frequent value
for column in df.columns:
    if df[column].dtype == 'object':  # Check if column is categorical
        df[column] = df[column].fillna(df[column].mode()[0]) # Fill missing values with mode
    else:
        df[column] = df[column].fillna(df[column].mean()) # Fill missing values with mean
#Save the Cleaned Dataset
df.to_csv("cleaned.csv")
#**************************MISSING VALUES ***********************************************

Outliers before imputation: 0       60.0
1       20.0
2       60.0
3       70.0
4       60.0
        ... 
1455    60.0
1456    20.0
1457    70.0
1458    20.0
1459    20.0
Name: MSSubClass, Length: 1460, dtype: float64
Outliers after imputation: MSSubClass    10
dtype: int64


Outliers

In [None]:
import pandas as pd
from scipy import stats
import numpy as np

# Load your dataset
df = pd.read_csv('train.csv')
#*********OUTLIERS*********************
# Select only the numerical columns
numerical_df = df.select_dtypes(include=[np.number])

# Calculate the Z-scores for each numerical column
z_scores = np.abs(stats.zscore(numerical_df))

# Set a threshold for identifying outliers
threshold = 3

# Assuming you want to replace outliers based on z-score
# and replace them with median for the column
upper_limit = numerical_df['MSSubClass'].mean() + threshold * numerical_df['MSSubClass'].std()
median = numerical_df['MSSubClass'].median()
df['MSSubClass'] = df['MSSubClass'].apply(lambda x: median if x > upper_limit else x)


print(df['MSSubClass'])
#***********************OUTLIERS************************

0       60.0
1       20.0
2       60.0
3       70.0
4       60.0
        ... 
1455    60.0
1456    20.0
1457    70.0
1458    20.0
1459    20.0
Name: MSSubClass, Length: 1460, dtype: float64


Encode Train Data

In [3]:
from re import X
#Import Necessary Libraries

from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.model_selection import train_test_split

#Load cleaned dataset
train_encode = pd.read_csv('cleaned.csv')
train_encode = pd.DataFrame(train_encode)

#Find non-numerical values in the cleaned dataset
def find_non_numerical(train_encode):
    non_numerical = []
    for column in df.columns:
        if df[column].dtype == 'object':
            non_numerical.append(column)
    return non_numerical
#print(find_non_numerical(train_encode))

#Separate features and Target
X = train_encode[find_non_numerical(train_encode)]
y = train_encode['SalePrice']

#Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, drop='first')

#Fit and Transform the Features
X_encoded = encoder.fit_transform(X)

#Convert to dataframes and column names
encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(X.columns))

df_encoded = pd.concat([encoded_df, y], axis=1)
#print(df_encoded)

#Save the Encoded Dataset
df_encoded.to_csv("encoded.csv")

Model Training

In [4]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

#Load and prepare dataset
data = pd.read_csv('encoded.csv')

#Specify the name of the target column
target_column = 'SalePrice'

#Separate features and target
X = data.drop(columns=[target_column])
y = data[target_column]

# Initialize the Random Forest model
#rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

#Initialize the model here
rf_model = GradientBoostingRegressor()

#Fit the model with the entire dataset
rf_model.fit(X, y)

#Make predictions on the same dataset since we are not splitting
y_pred = rf_model.predict(X)

# Evaluate the model using regression metrics
# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y, y_pred)
# Calculate R-squared (R2)
r2 = r2_score(y, y_pred)

# Print the evaluation metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

# Save the trained model
joblib.dump(rf_model, 'model_1')




Mean Squared Error (MSE): 771040092.7065036
R-squared (R2): 0.877744420591863


['model_1']

Clean Test Dataset

In [5]:
import pandas as pd

#identify missing values
df = pd.read_csv('test.csv')
missing_values = df.isnull().sum()
# Instead of dropping rows with missing values, impute with most frequent value
for column in df.columns:
    if df[column].dtype == 'object':  # Check if column is categorical
        df[column] = df[column].fillna(df[column].mode()[0]) # Fill missing values with mode
    else:
        df[column] = df[column].fillna(df[column].mean()) # Fill missing values with mean
#Save the Cleaned Dataset
df.to_csv("cleaned_test.csv")

Encode Test Dataset

In [6]:
from re import X
#Import Necessary Libraries
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.model_selection import train_test_split

#Load cleaned dataset
test_encode = pd.read_csv('cleaned_test.csv')
test_encode = pd.DataFrame(test_encode)

#Find non-numerical values in the cleaned dataset
def find_non_numerical(test_encode):
    non_numerical = []
    for column in df.columns:
        if df[column].dtype == 'object':
            non_numerical.append(column)
    return non_numerical
#print(find_non_numerical(train_encode))

#Separate features and Target
X = test_encode[find_non_numerical(test_encode)]
y = test_encode['Id']

#Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, drop='first')

#Fit and Transform the Features
X_encoded = encoder.fit_transform(X)

#Convert to dataframes and column names
encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(X.columns))

df_encoded = pd.concat([encoded_df, y], axis=1)
#print(df_encoded)

#Save the Encoded Dataset
df_encoded.to_csv("encoded_test.csv")

Predict Housing Price

In [7]:
import pandas as pd
import joblib

# Load the model
model = joblib.load('model_1')

# Load the training data to get the original feature names
predict_data = pd.read_csv('encoded.csv')  # Replace with the path to your encoded training data
predict_features = predict_data.drop(columns=['SalePrice']).columns  # Get the feature names from the training data

# Load the test data
test_data = pd.read_csv('encoded_test.csv')

# Specify the name of the target column (if it exists in test data, remove it)
target_column = 'Id'  # Replace with your actual target column name, if needed

# **Store 'Id' before reindexing**
index_ids = test_data['Id'].tolist()

# Remove the target column if it exists in test data
if target_column in test_data.columns:
    test_data = test_data.drop(columns=[target_column])

# Reindex the test data to match the training data columns
test_data = test_data.reindex(columns=predict_features, fill_value=0) # Fill missing values with 0

# Make predictions for each row in the test data
predictions = model.predict(test_data)

# Create a DataFrame with Item_Identifier and Item_Outlet_Sales columns
results = pd.DataFrame({
    'Id': index_ids,
    'SalePrice': predictions
})

# Display the results
print(results)

# Optionally, save the results to a CSV file
results.to_csv('HousingPricePrediction2.csv', index=False)

        Id      SalePrice
0     1461  164405.011509
1     1462  189471.970586
2     1463  248311.109665
3     1464  237643.124412
4     1465  260052.520477
...    ...            ...
1454  2915  154696.719373
1455  2916  164693.264048
1456  2917  200418.812150
1457  2918  224562.009428
1458  2919  322335.584551

[1459 rows x 2 columns]
