# House Prices - Advanced Regression Techniques

This notebook demonstrates a basic approach to predicting house prices with a RandomForestRegressor. The steps include data loading, preprocessing, feature engineering, a train/validation split, model training, evaluation, and prediction.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

## Data Loading
Load the train and test datasets.

In [None]:
# Directory holding the competition CSVs; change this one line to relocate the data.
DATA_DIR = '/mnt/data/dataset'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# A bare tuple of DataFrames falls back to a plain-text repr; display() renders
# each preview as its own rich table.
display(train_df.head(), test_df.head())

## Data Preprocessing
Handle missing values and encode categorical variables.

In [None]:
def preprocess_data(df, reference=None):
    """Impute missing values and integer-encode the categorical columns of ``df``.

    Numeric gaps are filled with per-column medians, every remaining gap with
    the string ``"None"``, and each object-dtype column is replaced by integer
    codes assigned in sorted order of its distinct values.

    The medians and the category-to-code mapping are learned from
    ``reference``, so several frames (e.g. train and test) preprocessed against
    the same reference share one consistent encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is left untouched; a new frame is returned.
    reference : pandas.DataFrame, optional
        Frame the medians and category vocabularies are learned from.
        Defaults to ``df`` itself. Columns of ``reference`` that are missing
        from ``df`` are ignored.

    Returns
    -------
    pandas.DataFrame
        Transformed copy of ``df``. Categories that never occur in
        ``reference`` are encoded as ``-1``.
    """
    if reference is None:
        reference = df

    # numeric_only=True is required: pandas >= 2.0 raises TypeError when asked
    # for the median of a frame that still contains string columns.
    numeric_fill = reference.median(numeric_only=True)

    # Impute the reference exactly like the data so that "None" (the stand-in
    # for a missing category) is part of the learned vocabulary where relevant.
    reference_filled = reference.fillna(numeric_fill).fillna("None")
    vocabularies = {
        col: sorted(reference_filled[col].unique())
        for col in reference_filled.select_dtypes(include=['object']).columns
        if col in df.columns
    }

    # fillna returns a new frame, so the caller's DataFrame is not mutated.
    processed = df.fillna(numeric_fill).fillna("None")

    for col, classes in vocabularies.items():
        # Sorted categories reproduce the codes LabelEncoder would assign;
        # values outside the vocabulary get the sentinel code -1.
        codes = pd.Categorical(processed[col], categories=classes).codes
        processed[col] = codes.astype('int64')

    return processed


# Both right-hand sides are evaluated before either name is rebound, so the
# test set is imputed and encoded against the *raw* training frame.
train_df, test_df = (
    preprocess_data(train_df),
    preprocess_data(test_df, reference=train_df),
)

## Feature Engineering
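Prepare the inputs for model training: separate the target (`SalePrice`) from the predictor columns and drop the `Id` column from the predictors.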

In [None]:
# Target: the column the model learns to predict.
y = train_df['SalePrice']

# Predictors: every remaining column except the target and the row identifier.
X = train_df.drop(columns=['SalePrice', 'Id'])

## Train-Test Split
Split the data into training and validation sets.

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Model Training
Train a RandomForestRegressor model on the training data.

In [None]:
# Forest of 100 trees; the seed fixes the bootstrap samples and feature draws.
model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)
model.fit(X=X_train, y=y_train)

## Model Evaluation
Evaluate the model using the validation set.

In [None]:
# Score the held-out rows and report the mean squared error.
y_pred = model.predict(X_valid)
mse = mean_squared_error(y_true=y_valid, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

## Prediction and Submission
Make predictions on the test data and prepare the submission file.

In [None]:
# The identifier is not a feature; it is only carried into the submission file.
X_test = test_df.drop(columns=['Id'])
predictions = model.predict(X_test)

# Two-column frame: the row identifier next to its predicted sale price.
submission = pd.DataFrame({'Id': test_df['Id']})
submission['SalePrice'] = predictions
submission.to_csv('/mnt/data/submission.csv', index=False)