In [97]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [98]:
# Read the cleaned housing dataset from disk into a DataFrame
dataset_path = '../data/moscow_houses.csv'
df = pd.read_csv(dataset_path)

In [99]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,Price,Apartment type,Metro station,Minutes to metro,Region,Number of rooms,Area,Living area,Kitchen area,Floor,Number of floors,Renovation
0,6300000.0,Secondary,Опалиха,6.0,Moscow region,1.0,30.6,11.1,8.5,25.0,25,Cosmetic
1,9000000.0,Secondary,Павшино,2.0,Moscow region,1.0,49.2,20.0,10.0,6.0,15,European-style renovation
2,11090000.0,Secondary,Мякинино,14.0,Moscow region,1.0,44.7,16.2,13.1,10.0,25,Cosmetic
3,8300000.0,Secondary,Строгино,8.0,Moscow region,1.0,35.1,16.0,11.0,12.0,33,European-style renovation
4,6450000.0,Secondary,Опалиха,6.0,Moscow region,1.0,37.7,15.2,4.0,5.0,5,Without renovation


In [100]:
# The 'Price' column is the prediction target; all other columns are features
y = df['Price']
X = df.drop('Price', axis=1)

# One-hot encode the categorical columns
categorical_cols = ['Apartment type', 'Metro station', 'Region', 'Renovation']
X_encoded = pd.get_dummies(data=X, columns=categorical_cols)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the dimensions of each piece of the split
for label, part in (
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(label, part.shape)

Shape of X_train: (16697, 550)
Shape of X_test: (4175, 550)
Shape of y_train: (16697,)
Shape of y_test: (4175,)


In [101]:
# Persist each split to its own CSV file in the working directory,
# leaving the row index out of the written files
for frame, filename in ((X_train, 'X_train.csv'), (X_test, 'X_test.csv')):
    frame.to_csv(filename, index=False)

# The targets are written with an explicit header row
for target, filename in ((y_train, 'y_train.csv'), (y_test, 'y_test.csv')):
    target.to_csv(filename, index=False, header=True)

In [102]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Build a Random Forest regressor with a fixed seed and fit it on the
# training split (fit() returns the fitted estimator itself)
rf_regressor = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict prices for the held-out split
y_pred = rf_regressor.predict(X_test)

# Compare the predictions against the true test targets
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Report both error metrics
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)

Mean Squared Error (MSE): 146853874475275.56
Mean Absolute Error (MAE): 4142213.1730717863


In [103]:
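# (Disabled) Wrap the targets in named pandas Series before saving. This is redundant:
# y_train and y_test are already pandas objects and are written to CSV in an earlier cell.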
# y_train_df = pd.Series(y_train, name='Price')
# y_test_df = pd.Series(y_test, name='Price')

# Save the target variables (y_train, y_test) as CSV files
# y_train_df.to_csv('y_train.csv', index=False)
# y_test_df.to_csv('y_test.csv', index=False)

In [104]:
# X_train.to_csv('X_train.csv', index=False)
# X_test.to_csv('X_test.csv', index=False)