In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# 1. Load Data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 2. Save test IDs and drop Id columns
test_ID = test['Id']
train.drop('Id', axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)

# 3. Separate target and features
y = train['SalePrice']
X = train.drop('SalePrice', axis=1)

# 4. Drop columns with too many missing values
missing_threshold = 0.4
missing = X.isnull().mean()
drop_cols = missing[missing > missing_threshold].index
X.drop(columns=drop_cols, inplace=True)
test.drop(columns=drop_cols, inplace=True)

# 5. Combine X and test for uniform processing
all_data = pd.concat([X, test], axis=0)

# 6. Impute numeric features with median
num_cols = all_data.select_dtypes(include=[np.number]).columns
imputer_num = SimpleImputer(strategy='median')
all_data[num_cols] = imputer_num.fit_transform(all_data[num_cols])

# 7. Impute categorical features with mode
cat_cols = all_data.select_dtypes(include=['object']).columns
imputer_cat = SimpleImputer(strategy='most_frequent')
all_data[cat_cols] = imputer_cat.fit_transform(all_data[cat_cols])

# 8. Encode categorical features
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    all_data[col] = le.fit_transform(all_data[col])
    label_encoders[col] = le

# 9. Feature Scaling
scaler = StandardScaler()
all_data_scaled = pd.DataFrame(scaler.fit_transform(all_data), columns=all_data.columns)

# 10. Split back to train/test
X_scaled = all_data_scaled.iloc[:len(X), :]
X_test_scaled = all_data_scaled.iloc[len(X):, :]

# 11. Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 12. Model Training
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 13. Validation Prediction
y_val_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("Validation RMSE:", rmse)

# 14. Test Set Prediction
y_test_pred = model.predict(X_test_scaled)

# 15. Save Submission File
submission = pd.DataFrame({
    'Id': test_ID,
    'SalePrice': y_test_pred
})
submission.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created.")

Validation RMSE: 27963.900575285
Submission file 'submission.csv' created.
