In [1]:


import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from xgboost import XGBRegressor


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))



In [2]:
# Mount Google Drive at /content/drive; the CSV files read below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:

# Read the raw train/test CSVs from the mounted Drive folder. The or_* names keep
# the frames exactly as loaded (or_test_df is read again later for its 'Id' column).
or_train_df, or_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/house price-20240402T140044Z-001/house price/train.csv',
        '/content/drive/MyDrive/house price-20240402T140044Z-001/house price/test.csv',
    )
)

# Working names; these start out as aliases of the frames loaded above.
train_df, test_df = or_train_df, or_test_df

In [4]:

# Remove the 'Id' column from both working frames. drop() returns new frames,
# so the or_* frames loaded earlier still carry 'Id'.
train_df = train_df.drop(columns=['Id'])
test_df = test_df.drop(columns=['Id'])

In [5]:
# Report (rows, columns) of both frames, one per line.
print(
    f"Shape of train set: {train_df.shape}",
    f"Shape of test set: {test_df.shape}",
    sep="\n",
)

Shape of train set: (1259, 80)
Shape of test set: (201, 79)


## Replace NaNs
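**Missing values are forward-filled (each NaN takes the value of the previous row) in both numeric and string columns. While doing so, every column is recorded as either numeric or string, so that the two groups can be handled separately later (one-hot encoding for string columns, scaling for numeric columns).**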

In [6]:
# Column registries filled in by convert_to_str(). Each column name ends up in
# exactly one of the two lists.
str_columns = []
num_columns = []


def convert_to_str(df):
    """Forward-fill missing values in every column and register each column's kind.

    Despite its name, this function does not cast anything to ``str``. For each
    column it:

    * replaces every NaN with the closest preceding non-null value of that
      column (forward fill), and
    * records the column name in the module-level ``num_columns`` list when the
      column is numeric (any integer/float width, booleans excluded) or in
      ``str_columns`` otherwise.

    A column keeps the kind it was first registered under. This matters when
    the function is called on several frames that share column names: a text
    column that happens to be entirely NaN in a later frame is loaded as
    float64, and must not be registered a second time as numeric.

    Args:
        df: pandas DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object that was passed in, after filling.

    Note:
        Forward fill has nothing to copy into NaNs that appear before the first
        non-null value of a column, so such leading NaNs are left as they are.
    """
    for col in df.columns:
        # Inspect the dtype before filling so the fill cannot influence the decision.
        dtype = df[col].dtype

        # Series.ffill() replaces the deprecated fillna(method='ffill').
        df[col] = df[col].ffill()

        if col in num_columns or col in str_columns:
            # Already classified by an earlier call; keep the first decision so
            # the two registries never overlap.
            continue

        is_numeric = (
            pd.api.types.is_numeric_dtype(dtype)
            and not pd.api.types.is_bool_dtype(dtype)
        )
        if is_numeric:
            num_columns.append(col)
        else:
            str_columns.append(col)
    return df

# Clean both frames. The tuple is evaluated left to right, so the training frame
# is processed first and its columns are registered before the test frame's.
train_df, test_df = convert_to_str(train_df), convert_to_str(test_df)

In [7]:
# Tag every row with its origin so the combined frame can be split apart again
# after encoding.
train_df['type'] = 'train'
test_df['type'] = 'test'

# The test set has no target. Add a constant placeholder so both frames have the
# same columns; the value is never used for training or prediction because the
# column is dropped from the test frame again before predicting. A constant
# avoids hard-coding the number of test rows and avoids copying unrelated
# training prices into the test frame.
test_df['SalePrice'] = 0

# Stack train on top of test with a fresh 0..n-1 index so categorical columns
# are encoded with one shared set of dummy columns.
df = pd.concat([train_df, test_df], ignore_index=True)

In [8]:
# One-hot encode every registered string column: build indicator columns named
# "<column>_<value>", remove the source column, and append the indicators.
for col in str_columns:
    dummies = pd.get_dummies(df[col], prefix=col, prefix_sep="_")
    df = df.drop(columns=col).join(dummies)

In [9]:
# Separate the combined frame back into its two parts using the 'type' tag,
# then discard the tag.
train_df = df.loc[df['type'] == 'train'].drop(columns=['type'])

# Test rows sit after the train rows in the combined frame, so renumber them from 0.
test_df = (
    df.loc[df['type'] == 'test']
    .drop(columns=['type'])
    .reset_index(drop=True)
)

In [10]:
# Report (rows, columns) of both frames after one-hot encoding, one per line.
print(
    f"Shape of train set: {train_df.shape}",
    f"Shape of test set: {test_df.shape}",
    sep="\n",
)

Shape of train set: (1259, 288)
Shape of test set: (201, 288)


## Standardization of values

In [11]:
# Fit the standardizer on the numeric columns of the training rows only; the
# fitted scaler is reused for the test rows in the next cell.
scaler = StandardScaler()
scaler.fit(train_df[num_columns])

In [12]:
# Replace the numeric columns of both frames with their standardized values,
# using the statistics learned from the training rows.
train_df[num_columns] = scaler.transform(train_df[num_columns])
test_df[num_columns] = scaler.transform(test_df[num_columns])

In [13]:
# Report (rows, columns) of both frames after scaling, one per line.
print(
    f"Shape of train set: {train_df.shape}",
    f"Shape of test set: {test_df.shape}",
    sep="\n",
)

Shape of train set: (1259, 288)
Shape of test set: (201, 288)


In [14]:
# Remove the dummy SalePrice column so the test frame holds features only.
test_df = test_df.drop(columns=['SalePrice'])

In [15]:
# Split the training frame into the feature matrix and the target vector.
# NOTE(review): if 'SalePrice' is in num_columns, the target here has already
# been standardized together with the other numeric columns - confirm.
train_data = train_df.drop(columns=['SalePrice'])
train_labels = train_df['SalePrice']

## XGBoost regressor model

In [16]:
# Gradient-boosted tree regressor. `learning_rate` is the scikit-learn wrapper's
# documented name for XGBoost's `eta`; using it keeps the setting a regular
# estimator parameter instead of an extra keyword argument.
model = XGBRegressor(
    n_estimators=1000,      # number of boosting rounds
    max_depth=7,            # maximum depth of each tree
    learning_rate=0.1,      # shrinkage applied to each boosting step
    subsample=0.7,          # fraction of rows sampled per tree
    colsample_bytree=0.8,   # fraction of columns sampled per tree
)

In [17]:
# Train the regressor on the training features and target.
model.fit(train_data, train_labels)

In [18]:
# Predict the target for the test rows; test_df must have the same feature columns as train_data.
predictions = model.predict(test_df)

In [19]:
# Store the model output as the test frame's SalePrice column.
test_df['SalePrice'] = predictions
# Undo the scaling on all numeric columns at once. This presumably converts the
# predicted SalePrice back to price units, because the scaler was fit on
# num_columns - confirm that 'SalePrice' is in num_columns.
test_df[num_columns] = scaler.inverse_transform(test_df[num_columns])
# Rebuild the frame with the same column order as train_df.
test_df = pd.DataFrame(test_df, columns=train_df.columns)

In [20]:

# Submission table: the original test Ids next to the predicted prices.
# assign() aligns SalePrice on the index of the Id column.
results = or_test_df[['Id']].assign(SalePrice=test_df['SalePrice'])

In [21]:
# Preview the first rows of the submission table.
results.head()

Unnamed: 0,Id,SalePrice
0,1260,151138.384005
1,1261,192509.887145
2,1262,130296.62649
3,1263,133063.498103
4,1264,167205.338574


In [22]:
# Write the submission file to the current working directory, without the index column.
results.to_csv('submissions.csv', index=False)