In [1]:

!pip install pandas
!pip install numpy
!pip install scikit-learn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


try:
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
except FileNotFoundError:
    print("Make sure 'train.csv' and 'test.csv' are uploaded to your Colab session.")

    exit()



if 'SalePrice' in train_df.columns:
    y_train = train_df['SalePrice']
    train_df = train_df.drop('SalePrice', axis=1)
else:
    print("Target variable 'SalePrice' not found in train.csv. Please check the dataset.")
    exit()


ntrain = train_df.shape[0]
all_data = pd.concat((train_df, test_df), sort=False).reset_index(drop=True)


all_data = all_data.drop('Id', axis=1)


categorical_features = all_data.select_dtypes(include=['object']).columns
numerical_features = all_data.select_dtypes(exclude=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')) # Handle unseen categories
])

# Create a column transformer to apply different transformations to different column types
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Apply the preprocessing steps
all_data_processed = preprocessor.fit_transform(all_data)

# Split back into train and test sets
X_train_processed = all_data_processed[:ntrain]
X_test_processed = all_data_processed[ntrain:]

print("Preprocessing and Feature Engineering Complete.")
print(f"Shape of processed training data: {X_train_processed.shape}")
print(f"Shape of processed test data: {X_test_processed.shape}")
print(f"Shape of training target variable: {y_train.shape}")



Preprocessing and Feature Engineering Complete.
Shape of processed training data: (1460, 310)
Shape of processed test data: (1459, 310)
Shape of training target variable: (1460,)
