In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime
from xgboost import XGBRegressor

# Load the cleaned dataset (first CSV column is the index) and discard rows
# that have no target value.
X = pd.read_csv("../Cleaning/new_clean_data.csv", index_col=0).dropna(
    axis=0, subset=["price"]
)

# Move the target column out of the feature frame: `pop` returns the
# "price" Series and removes that column from X in one step.
y = X.pop("price")

: 

# Adding new columns


In [None]:
# `datetime` is already imported in the notebook's first (imports) cell.
now = datetime.now()

# Age in years, counting a car from the current calendar year as 1 year old.
# Clip at 1 so a `year` ahead of the calendar year cannot produce a zero or
# negative age, which would make the ratio below inf/NaN or negative.
# Rows with a missing `year` stay NaN.
X["age"] = (now.year - X["year"] + 1).clip(lower=1)

# Average mileage per year of age.
X["mileage_over_year"] = X["mileage"] / X["age"]

: 

# Splitting data into Train, Validation and Test data


In [None]:
# Stage 1: set aside 20% of the rows as the validation split.
X_rest, X_valid_full, y_rest, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Stage 2: take 25% of the remaining rows as the test split
# (roughly 20% of all rows); the rest is the training split.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_rest, y_rest, test_size=0.25, random_state=0
)

: 

# Preprocessing


## Selecting numeric columns


In [None]:
# Names of the training columns whose dtype is int64 or float64.
numeric_dtypes = ["int64", "float64"]
numerical_cols = [
    col_name
    for col_name, col_dtype in X_train_full.dtypes.items()
    if col_dtype in numeric_dtypes
]
numerical_cols


: 

## Selecting categorical columns


In [None]:
# Names of the training columns stored with the generic "object" dtype.
categorical_cols = [
    col_name
    for col_name, col_dtype in X_train_full.dtypes.items()
    if col_dtype == "object"
]
categorical_cols


: 

## Handling missing values


In [None]:
# Numeric columns: impute missing entries with the "mean" strategy.
numerical_transformer = SimpleImputer(strategy="mean")

# Categorical columns: impute with the "most_frequent" strategy, then
# one-hot encode with handle_unknown="ignore".
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer.
# NOTE(review): `preprocessor` is not referenced by any later cell visible
# in this notebook; the model is fitted on pd.get_dummies output instead.
column_routes = [
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

: 

# Keep selected columns only


In [None]:
# Categorical names first, then numeric names.
my_cols = [*categorical_cols, *numerical_cols]

# Independent copies of each split, restricted to the selected columns.
X_train, X_valid, X_test = (
    split_frame[my_cols].copy()
    for split_frame in (X_train_full, X_valid_full, X_test_full)
)

: 

# One-hot encode the data


In [None]:
# One-hot encode each split separately, then force the validation and test
# frames onto the training frame's column layout (same columns, same order).
X_train = pd.get_dummies(X_train)
train_columns = X_train.columns

# A dummy column that exists in training but not in a split means no row of
# that split has the category, so it is filled with 0 rather than NaN.
# Dummy columns that exist only in validation/test are dropped.
X_valid = pd.get_dummies(X_valid).reindex(columns=train_columns, fill_value=0)
X_test = pd.get_dummies(X_test).reindex(columns=train_columns, fill_value=0)


: 

In [None]:
# Hyper-parameters for the gradient-boosted regressor, kept in one place.
xgb_params = {
    "n_estimators": 1000,
    "early_stopping_rounds": 5,
    "learning_rate": 0.05,
    "n_jobs": 4,
}
my_model = XGBRegressor(**xgb_params)

# Fit on the training split; the validation split is supplied as the
# evaluation set, with per-round logging turned off.
my_model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
)


: 

In [None]:
# Predict the target for the validation rows (the same frame that was
# passed as `eval_set` when the model was fitted).
predictions = my_model.predict(X_valid)

: 

In [None]:
# R^2 on the validation split. These rows were also the eval_set that drove
# early stopping during fit(), so this figure is likely optimistic.
acc = r2_score(y_valid, predictions)

# R^2 on the test split, which took no part in fitting or early stopping.
test_r2 = r2_score(y_test, my_model.predict(X_test))

print(f"validation R^2: {acc}")
print(f"test R^2: {test_r2}")

: 