<a href="https://colab.research.google.com/github/Ironsoldier353/ML_project/blob/main/Predicting_sale_price.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing libraries

In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
import os


Loading the data

In [11]:
# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

Preprocessing the data

In [12]:
# Print column names to verify
print("Train Data Columns:", train_df.columns)
print("Test Data Columns:", test_df.columns)

# Define the target column
target_column = 'SalePrice'

# Separate features and target
X_train = train_df.drop(target_column, axis=1)
y_train = train_df[target_column]

# Test data does not have the target column
X_test = test_df

# Identify numeric and categorical columns
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Preprocessing for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Handle missing values
    ('scaler', StandardScaler())])

# Preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing for both numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

Train Data Columns: Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'Gara

Train the model

In [13]:
# Polynomial features
degree = 2
poly = PolynomialFeatures(degree=degree)

# Create the full pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', poly),
    ('regressor', LinearRegression())])

# Train the model
model.fit(X_train, y_train)


Predicitng the saleprice

In [14]:
# Make predictions
y_pred = model.predict(X_test)

# Save predictions to a CSV file
output_file = 'predictions.csv'
predictions_df = pd.DataFrame(y_pred, columns=['Predicted'])
predictions_df.to_csv(output_file, index=False)

# Check if the file is saved
if os.path.exists(output_file):
    print(f"Predictions successfully saved to '{output_file}'")
else:
    print(f"Failed to save predictions to '{output_file}'")

Predictions successfully saved to 'predictions.csv'


Model performance

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Split the data
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the model on the training split
model.fit(X_train_split, y_train_split)

# Predict on the validation split
y_val_pred = model.predict(X_val_split)

# Evaluate performance
mae = mean_absolute_error(y_val_split, y_val_pred)
mse = mean_squared_error(y_val_split, y_val_pred)
rmse = mse ** 0.5
r2 = r2_score(y_val_split, y_val_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")


MAE: 21181.93466103875
MSE: 1030301233.1084145
RMSE: 32098.30576694687
R^2: 0.8656769992203176
