In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
import xgboost as xgb
import csv

In [14]:
# Load the data
df = pd.read_csv('train.csv')
df.drop_duplicates(inplace=True)

# Separate features and target
X = df.drop('price_doc', axis=1)
y = df['price_doc']

# Convert object columns to numeric using a simpler approach
X = pd.get_dummies(X, drop_first=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply robust scaling to the training set
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)

# Best parameters from the grid search
best_params = {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}

# XGBoost model with best parameters
xg_reg = xgb.XGBRegressor(
    colsample_bytree=best_params['colsample_bytree'],
    learning_rate=best_params['learning_rate'],
    max_depth=best_params['max_depth'],
    n_estimators=best_params['n_estimators'],
    objective='reg:squarederror'
)

In [15]:
# Fit the model
xg_reg.fit(X_train, y_train)



In [16]:
# Predictions on the test set
y_pred = xg_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 699267522580393.2


In [17]:
# Load and preprocess the test data
df_test = pd.read_csv('test.csv')
df_test = df_test.drop(['row ID'], axis=1)

# Convert object columns to numeric using the same approach as in the training set
df_test = pd.get_dummies(df_test, drop_first=True)

# Ensure the columns match the training set
df_test = pd.DataFrame(df_test, columns=X.columns, dtype=float)

# Robust Scaling for the test set
df_test = scaler.transform(df_test)


In [None]:

y_pred_test = xg_reg.predict(df_test)


In [None]:

# Write predictions to a CSV file
filepath = 'xgboost_predictions.csv'
with open(filepath, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['row ID', 'price_doc'])
    for c, prediction in enumerate(y_pred_test, start=1):
        writer.writerow([c, prediction])
