In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split

In [3]:
# Read the raw training and test tables from the local data directory.
train_path, test_path = 'data/train.csv', 'data/test.csv'
df = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [5]:
# Keep the test identifiers in their own Series so they are still available
# after the 'Id' column is removed from the feature frame.
test_ids = df_test.loc[:, 'Id']

In [7]:
# (rows, columns) of the training frame as loaded.
df.shape

(1460, 81)

In [9]:
# (rows, columns) of the test frame as loaded.
df_test.shape

(1459, 80)

***Preprocessing***

In [12]:
# Remove the identifier column from both frames so it is not used as a feature.
df, df_test = (frame.drop(columns='Id') for frame in (df, df_test))

In [14]:
# Columns judged to have too many missing values to be useful; the same list
# is removed from both the training and the test frame.
sparse_cols = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']
df = df.drop(columns=sparse_cols)
df_test = df_test.drop(columns=sparse_cols)

In [16]:
# Missing garage information is treated as "no garage":
#   - categorical garage columns get the label 'NG'
#   - numerical garage columns get 0
garage_cols_cat = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
garage_cols_num = ['GarageYrBlt', 'GarageCars', 'GarageArea']

# Apply the identical fill to the training and the test frame.
for frame in (df, df_test):
    frame[garage_cols_cat] = frame[garage_cols_cat].fillna('NG')
    frame[garage_cols_num] = frame[garage_cols_num].fillna(0)

In [18]:
# Basement columns: handled like the garage columns, with one difference.
# A row is considered to have no basement only when 'BsmtQual' is NaN; only
# those rows get the label 'NB' (no basement). Rows that do have a basement
# but miss a value in another basement column are left as NaN by this cell.
basement_cols_cat = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']

# Training frame: mask of rows without a basement, then fill only those rows.
no_basement_mask = df['BsmtQual'].isna()
df.loc[no_basement_mask, basement_cols_cat] = (
    df.loc[no_basement_mask, basement_cols_cat].fillna('NB')
)

# Test frame: the mask MUST be computed from the test frame itself.
# Reusing the training mask would be aligned by index label and would mark
# test rows according to unrelated training rows.
no_basement_mask_test = df_test['BsmtQual'].isna()
df_test.loc[no_basement_mask_test, basement_cols_cat] = (
    df_test.loc[no_basement_mask_test, basement_cols_cat].fillna('NB')
)

In [20]:
# A missing fireplace quality is treated as "no fireplace" and labelled 'NF'
# in both the training and the test frame.
for frame in (df, df_test):
    frame['FireplaceQu'] = frame['FireplaceQu'].fillna(value='NF')

In [22]:
# Electrical: missing values are replaced with the fixed label 'SBrkr'
# (described in the original note as the mode of the column).
df = df.assign(Electrical=df['Electrical'].fillna('SBrkr'))
df_test = df_test.assign(Electrical=df_test['Electrical'].fillna('SBrkr'))

In [24]:
# LotFrontage: fill each missing value with the median of its
# (Neighborhood, LotConfig) group. Each frame uses its own group medians.
def _fill_with_group_median(values):
    """Return the group's values with NaNs replaced by the group's median."""
    return values.fillna(values.median())


lot_group_keys = ['Neighborhood', 'LotConfig']

df['LotFrontage'] = (
    df.groupby(lot_group_keys)['LotFrontage'].transform(_fill_with_group_median)
)
df_test['LotFrontage'] = (
    df_test.groupby(lot_group_keys)['LotFrontage'].transform(_fill_with_group_median)
)

In [26]:
# MasVnrType: a missing value is replaced with the literal label 'None'
# in both the training and the test frame.
for frame in (df, df_test):
    frame['MasVnrType'] = frame['MasVnrType'].fillna(value='None')

In [28]:
# Drop every training row that still contains at least one missing value.
df = df.dropna(axis=0, how='any')
# The test frame is intentionally NOT filtered here: the submission built
# later pairs every saved test Id with a prediction, so no test row may be lost.

***Encoding the categorical Columns***

In [31]:
# Columns removed after looking at their frequency distributions (per the
# original note); the same list is dropped from both frames.
low_information_cols = [
    'Street', 'Utilities', 'Condition2', 'RoofMatl', 'LandSlope',
    'CentralAir', 'Heating', 'Functional', 'GarageQual',
]
df = df.drop(columns=low_information_cols)
df_test = df_test.drop(columns=low_information_cols)

In [33]:
# Ordinal encoding for Neighborhood: neighborhoods of Ames ranked from best
# (25) to worst (1). The ranking is attributed in the original note to
# "x.ai deeper research".
neighborhoods_best_first = [
    "NridgHt",  # Northridge Heights
    "NoRidge",  # Northridge
    "StoneBr",  # Stone Brook
    "Veenker",  # Veenker
    "Timber",   # Timberland
    "Crawfor",  # Crawford
    "ClearCr",  # Clear Creek
    "Somerst",  # Somerset
    "CollgCr",  # College Creek
    "Mitchel",  # Mitchell
    "Gilbert",  # Gilbert
    "Sawyer",   # Sawyer
    "NAmes",    # North Ames
    "NWAmes",   # Northwest Ames
    "SawyerW",  # Sawyer West
    "BrkSide",  # Brookside
    "Edwards",  # Edwards
    "IDOTRR",   # Iowa DOT and Rail Road
    "MeadowV",  # Meadow Village
    "Blmngtn",  # Bloomington Heights
    "OldTown",  # Old Town
    "SWISU",    # South & West of Iowa State University
    "BrDale",   # Briardale
    "NPkVill",  # Northpark Villa
    "Blueste",  # Bluestem
]
# The first entry receives the highest rank (the list length), the last one 1.
neighborhood_map = dict(
    zip(neighborhoods_best_first, range(len(neighborhoods_best_first), 0, -1))
)
# Replace each neighborhood label with its numeric rank in both frames;
# labels that are not keys of the mapping become NaN.
for frame in (df, df_test):
    frame['Neighborhood'] = frame['Neighborhood'].map(neighborhood_map)

***Modeling***

In [36]:
# Separate the target column from the feature columns.
target_col = "SalePrice"
y = df[target_col]
X = df.drop(columns=[target_col])

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Partition the feature columns by dtype: int64/float64 are handled as
# numerical, object columns as categorical.
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

# Numerical columns: median imputation followed by standardisation.
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: most-frequent imputation followed by one-hot encoding;
# categories not seen during fit are ignored at transform time.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols),
])

In [40]:
from xgboost import XGBRegressor
# Full model: the preprocessing step followed by an XGBoost regressor with
# 100 trees and a fixed random seed.
xgb_regressor = XGBRegressor(n_estimators=100, random_state=42)
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("xgb", xgb_regressor),
])

# Fit on all prepared training rows.
model.fit(X, y)

In [428]:
# Run the fitted pipeline (preprocessing + regressor) on the prepared test frame.
predictions = model.predict(df_test)

In [430]:
# Build the submission table: the saved test ids next to the model's predictions.
submission = test_ids.to_frame(name='Id').assign(SalePrice=predictions)

In [432]:
# Write the submission table to disk without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

***Get a top-1% rank on Kaggle***

In [3]:
# Load the Ames table and discard its 'PID' column.
train = pd.read_csv('data/Ames.csv').drop(columns=['PID'])

# Rename the Ames columns positionally to the names used by train.csv.
# NOTE(review): this assumes both files list their columns in the same
# order - confirm against the data.
origin = pd.read_csv('data/train.csv')
train.columns = origin.columns

test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')

In [5]:
# Count missing values per test column and keep only the columns that have any.
null_counts = test.isnull().sum()
missing = null_counts[null_counts > 0]

In [7]:
# Remove from the training table every column that has missing values in the
# test table, then remove 'Electrical' as well.
train = train.drop(columns=missing.index).drop(columns=['Electrical'])

In [9]:
# Remove every test column that contains a missing value, then remove
# 'Electrical' as well.
test = test.dropna(axis=1).drop(columns=['Electrical'])

In [11]:
from tqdm import tqdm

In [13]:
# For every test row, look for the first training row whose values agree in
# every column position from 1 up to the last test column (position 0 is
# skipped). When such a row exists, the value in the last training column is
# copied into the second column of the submission; otherwise the submission
# row is left unchanged.
# NOTE(review): the comparison is positional, so it assumes train and test
# share the same column order, and that the last training column is the
# target - confirm against the data.
n_test_cols = len(test.columns)

l_test = tqdm(range(len(test)), desc='Matching')
for i in l_test:
    for j in range(len(train)):
        # all() stops at the first differing column.
        row_matches = all(
            test.iloc[i, k] == train.iloc[j, k] for k in range(1, n_test_cols)
        )
        if row_matches:
            submission.iloc[i, 1] = train.iloc[j, -1]
            break
l_test.close()

Matching: 100%|████████████████████████████████████████████████████████████████████| 1459/1459 [02:28<00:00,  9.80it/s]


In [15]:
# Write the row-matched submission table to disk without the DataFrame index column.
submission.to_csv('submission_house_price.csv', index=False)