In [1]:
!pip install feature_engine


Collecting feature_engine
  Downloading feature_engine-1.8.3-py2.py3-none-any.whl.metadata (9.9 kB)
Downloading feature_engine-1.8.3-py2.py3-none-any.whl (378 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m378.6/378.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: feature_engine
Successfully installed feature_engine-1.8.3


In [2]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import xgboost as xgb

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer
)

from feature_engine.datetime import DatetimeFeatures
from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder
)
from feature_engine.outliers import Winsorizer
from feature_engine.selection import SelectBySingleFeaturePerformance

In [3]:
# Raw training split; the "price" column is used as the regression target further down.
train = pd.read_csv("/content/train.csv")
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-21,Banglore,New Delhi,08:55:00,19:10:00,615,1.0,In-flight meal not included,7832
1,Jet Airways,2019-03-27,Delhi,Cochin,17:30:00,04:25:00,655,1.0,In-flight meal not included,6540
2,Goair,2019-03-09,Banglore,New Delhi,11:40:00,14:35:00,175,0.0,No Info,7305
3,Air India,2019-06-12,Kolkata,Banglore,09:25:00,18:30:00,545,1.0,No Info,8366
4,Jet Airways,2019-03-12,Banglore,New Delhi,22:55:00,07:40:00,525,1.0,In-flight meal not included,11087
...,...,...,...,...,...,...,...,...,...,...
6690,Jet Airways,2019-03-21,Delhi,Cochin,10:45:00,18:50:00,1925,2.0,No Info,11093
6691,Air India,2019-05-01,Kolkata,Banglore,09:25:00,18:30:00,545,1.0,No Info,8891
6692,Jet Airways,2019-06-01,Delhi,Cochin,14:00:00,19:00:00,300,1.0,In-flight meal not included,10262
6693,Air Asia,2019-06-24,Delhi,Cochin,07:55:00,13:25:00,330,1.0,No Info,6152


In [4]:
# Raw validation split (same columns as the training file).
val = pd.read_csv("/content/val.csv")
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-06-24,Delhi,Cochin,20:25:00,01:30:00,305,1.0,No Info,5054
1,Multiple Carriers,2019-06-12,Delhi,Cochin,09:45:00,22:30:00,765,1.0,No Info,9646
2,Jet Airways,2019-03-12,Banglore,New Delhi,22:55:00,15:15:00,980,1.0,In-flight meal not included,11087
3,Multiple Carriers,2019-06-06,Delhi,Cochin,13:00:00,21:00:00,480,1.0,No Info,13587
4,Jet Airways,2019-05-18,Delhi,Cochin,23:05:00,04:25:00,1760,2.0,No Info,16704
...,...,...,...,...,...,...,...,...,...,...
1669,Spicejet,2019-05-01,Chennai,Kolkata,09:45:00,12:00:00,135,0.0,No Info,3597
1670,Indigo,2019-05-01,Kolkata,Banglore,08:10:00,13:00:00,290,1.0,No Info,5069
1671,Jet Airways,2019-05-27,Delhi,Cochin,05:30:00,12:35:00,425,2.0,In-flight meal not included,15544
1672,Jet Airways,2019-06-12,Mumbai,Hyderabad,19:35:00,21:05:00,90,0.0,In-flight meal not included,3210


In [5]:
# Raw test split (same columns as the training file).
test = pd.read_csv("/content/test.csv")
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-06,Banglore,New Delhi,08:00:00,08:15:00,1455,1.0,No Info,17996
1,Spicejet,2019-06-06,Kolkata,Banglore,22:20:00,00:40:00,140,0.0,No Info,3873
2,Indigo,2019-03-18,Kolkata,Banglore,05:30:00,08:20:00,170,0.0,No Info,4462
3,Indigo,2019-06-27,Chennai,Kolkata,19:35:00,21:55:00,140,0.0,No Info,3597
4,Indigo,2019-05-06,Kolkata,Banglore,15:15:00,17:45:00,150,0.0,No Info,4804
...,...,...,...,...,...,...,...,...,...,...
2088,Jet Airways,2019-05-27,Delhi,Cochin,19:15:00,12:35:00,1040,1.0,In-flight meal not included,12898
2089,Multiple Carriers,2019-06-27,Delhi,Cochin,11:25:00,19:15:00,470,1.0,No Info,7155
2090,Jet Airways,2019-06-03,Delhi,Cochin,02:15:00,04:25:00,1570,1.0,In-flight meal not included,11627
2091,Multiple Carriers,2019-06-06,Delhi,Cochin,15:15:00,01:30:00,615,1.0,No Info,6795


In [6]:
# airline: fill gaps with the most frequent carrier, fold carriers below the
# 10% frequency tolerance into "Other", then one-hot encode.
air_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(n_categories=2, tol=0.1, replace_with="Other")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# date_of_journey: extract calendar parts, then rescale each with MinMaxScaler.
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline([
    (
        "dt",
        DatetimeFeatures(
            format="mixed",
            yearfirst=True,
            features_to_extract=feature_to_extract,
        ),
    ),
    ("scaler", MinMaxScaler()),
])

# source & destination (part 1): group rare cities, encode each city with
# MeanEncoder (uses the target passed to fit), then apply PowerTransformer.
location_pipe1 = Pipeline([
    ("grouper", RareLabelEncoder(n_categories=2, tol=0.1, replace_with="Other")),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer()),
])

def is_north(X):
    """Replace every column of ``X`` with a 0/1 ``<col>_is_north`` indicator.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns all hold city names.

    Returns
    -------
    pandas.DataFrame
        One integer column per input column, named ``<col>_is_north``, equal to
        1 where the city is in the hard-coded list below and 0 otherwise. The
        original columns are dropped.
    """
    # NOTE(review): Kolkata and Mumbai are listed as "north" here -- confirm
    # this grouping is intended before relying on the feature's name.
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
    source_columns = X.columns.to_list()

    indicators = {}
    for name in source_columns:
        indicators[f"{name}_is_north"] = X[name].isin(north_cities).astype(int)

    return X.assign(**indicators).drop(columns=source_columns)

# source & destination: mean-encoded features side by side with the is_north flags.
location_transformer = FeatureUnion(transformer_list=[
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north))
])

# dep_time & arrival_time (part 1): hour and minute, rescaled with MinMaxScaler.
# format="mixed" matches the date_of_journey pipeline and stops pandas from
# warning that it cannot infer a single datetime format for time-only strings.
time_pipe1 = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"], format="mixed")),
    ("scaler", MinMaxScaler())
])

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    """Bucket every time column of ``X`` into a part-of-day label.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns all hold values parseable by ``pd.to_datetime``
        (e.g. ``"08:55:00"``).
    morning, noon, eve, night : int
        Hour boundaries. Hours in ``[morning, noon)`` are "morning",
        ``[noon, eve)`` "afternoon", ``[eve, night)`` "evening"; every other
        hour is "night".

    Returns
    -------
    pandas.DataFrame
        One ``<col>_part_of_day`` string column per input column; the original
        columns are dropped.
    """
    columns = X.columns.to_list()

    # format="mixed" parses each element on its own, which is what pandas falls
    # back to anyway for time-only strings, but without emitting a
    # "could not infer format" warning on every call.
    hours = X.assign(**{
        col: pd.to_datetime(X.loc[:, col], format="mixed").dt.hour
        for col in columns
    })

    return (
        hours
        .assign(**{
            f"{col}_part_of_day": np.select(
                [hours.loc[:, col].between(morning, noon, inclusive="left"),
                 hours.loc[:, col].between(noon, eve, inclusive="left"),
                 hours.loc[:, col].between(eve, night, inclusive="left")],
                ["morning", "afternoon", "evening"],
                default="night"
            )
            for col in columns
        })
        .drop(columns=columns)
    )

# dep_time & arrival_time (part 2): part-of-day label, encoded with
# CountFrequencyEncoder (default settings) and rescaled with MinMaxScaler.
time_pipe2 = Pipeline([
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler()),
])

# Both time feature sets, concatenated column-wise.
time_transformer = FeatureUnion([
    ("part1", time_pipe1),
    ("part2", time_pipe2),
])

# duration
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    """RBF-kernel similarity of each selected column to its own fitted percentiles.

    Parameters
    ----------
    variables : list or None, default=None
        Columns to transform. When empty/None, every numeric column seen in
        ``fit`` is used.
    percentiles : sequence of float, default=(0.25, 0.5, 0.75)
        Quantiles (in 0..1) whose fitted values act as RBF reference points.
    gamma : float, default=0.1
        ``gamma`` passed to ``rbf_kernel``.

    Attributes
    ----------
    input_columns_ : list
        Column labels of the data seen in ``fit``.
    variables_ : list
        Columns actually transformed (resolved in ``fit``).
    reference_values_ : dict
        ``{column: array of shape (n_percentiles, 1)}`` of fitted quantile values.
    """

    def __init__(self, variables=None, percentiles=(0.25, 0.5, 0.75), gamma=0.1):
        # A tuple default avoids one mutable list being shared by every instance.
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma

    @staticmethod
    def _to_frame(X, fitted_columns=None):
        """Return ``X`` as a DataFrame; label ndarray columns with ``fitted_columns`` if given."""
        if not isinstance(X, np.ndarray):
            return X

        frame = pd.DataFrame(X)
        if fitted_columns is not None:
            if frame.shape[1] != len(fitted_columns):
                raise ValueError(
                    f"X has {frame.shape[1]} columns but the transformer was "
                    f"fitted on {len(fitted_columns)} columns."
                )
            frame.columns = fitted_columns
        return frame

    def fit(self, X, y=None):
        """Learn the percentile reference values of every selected column.

        ``X`` may be a DataFrame or a NumPy array (which gets integer column
        labels). ``y`` is ignored. Returns ``self``.
        """
        X = self._to_frame(X)
        self.input_columns_ = X.columns.to_list()

        # Learned state lives in trailing-underscore attributes so the
        # constructor parameter ``variables`` is never overwritten by fit.
        if isinstance(self.variables, str):
            self.variables_ = [self.variables]
        elif self.variables:
            self.variables_ = list(self.variables)
        else:
            self.variables_ = X.select_dtypes(include="number").columns.to_list()

        quantiles = list(self.percentiles)
        self.reference_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(quantiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables_
        }

        return self

    def transform(self, X):
        """Return one ``<col>_rbf_<percentile>`` column per variable and percentile.

        A NumPy array is accepted when it has the same number of columns as the
        data seen in ``fit``; its columns are labelled with the fitted names.

        Raises
        ------
        ValueError
            If a NumPy array with a different column count than in ``fit`` is given.
        """
        X = self._to_frame(X, fitted_columns=self.input_columns_)

        # round() rather than int(): 0.29 * 100 is 28.999..., which int() would label 28.
        labels = [int(round(percentile * 100)) for percentile in self.percentiles]

        objects = []
        for col in self.variables_:
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=[f"{col}_rbf_{label}" for label in labels],
                index=X.index,  # keep rows aligned with the input frame
            )
            objects.append(obj)
        return pd.concat(objects, axis=1)


def duration_category(X, short=180, med=400):
    """Replace the ``duration`` column with a ``duration_cat`` label.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Data with a ``duration`` column; a NumPy array is wrapped in a
        DataFrame with the single column name ``duration``.
    short, med : number
        Durations below ``short`` are "short", those in ``[short, med)`` are
        "medium", everything else is "long".

    Returns
    -------
    pandas.DataFrame
        ``X`` without ``duration`` and with the new ``duration_cat`` column.
    """
    frame = pd.DataFrame(X, columns=['duration']) if isinstance(X, np.ndarray) else X

    minutes = frame.duration
    conditions = [
        minutes.lt(short),
        minutes.between(short, med, inclusive="left"),
    ]
    labels = np.select(conditions, ["short", "medium"], default="long")

    return frame.assign(duration_cat=labels).drop(columns="duration")

def is_over(X, value=1000):
    """Replace ``duration`` with a 0/1 flag for ``duration >= value``.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Data with a ``duration`` column; a NumPy array is wrapped in a
        DataFrame with the single column name ``duration``.
    value : number, default=1000
        Threshold; it is also embedded in the output column name.

    Returns
    -------
    pandas.DataFrame
        ``X`` without ``duration`` and with the integer column
        ``duration_over_<value>``.
    """
    frame = X
    if isinstance(frame, np.ndarray):
        frame = pd.DataFrame(frame, columns=['duration'])

    flag_name = f"duration_over_{value}"
    flag = frame.duration.ge(value).astype(int)

    return frame.assign(**{flag_name: flag}).drop(columns="duration")

# duration (part 1): RBF similarity to the fitted percentiles, then PowerTransformer.
duration_pipe1 = Pipeline(steps=[
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

# duration (part 2): short / medium / long label, ordinal-encoded in that order.
duration_pipe2 = Pipeline(steps=[
    ("cat", FunctionTransformer(func=duration_category)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

# All duration-derived features side by side; part4 is the standardized raw value.
duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
    ("part3", FunctionTransformer(func=is_over)),
    ("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
    # Winsorizer raises on NaN by default, which would make the imputer that
    # follows it unreachable for missing durations. "ignore" lets NaN pass
    # through so the median imputer can fill it.
    ("outliers", Winsorizer(capping_method="iqr", fold=1.5, missing_values="ignore")),
    ("imputer", SimpleImputer(strategy="median")),
    ("union", duration_union)
])

# total_stops
def is_direct(X):
    """Append an ``is_direct_flight`` 0/1 column (1 where ``total_stops == 0``).

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Data with a ``total_stops`` column; a NumPy array is wrapped in a
        DataFrame with the single column name ``total_stops``.

    Returns
    -------
    pandas.DataFrame
        ``X`` with ``total_stops`` kept and ``is_direct_flight`` added.
    """
    frame = pd.DataFrame(X, columns=['total_stops']) if isinstance(X, np.ndarray) else X
    direct_flag = (frame.total_stops == 0).astype(int)
    return frame.assign(is_direct_flight=direct_flag)


# total_stops: fill gaps with the most frequent value, then add the direct-flight flag.
total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # The step used to be named "" (empty string), which produces parameter
    # keys such as "__func" and cannot be addressed meaningfully by name.
    ("direct", FunctionTransformer(func=is_direct))
])

# additional_info (part 1): group rare labels into "Other", then one-hot encode.
info_pipe1 = Pipeline(steps=[
    ("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

def have_info(X, no_info_values=("No Info", "unknown")):
    """Turn ``additional_info`` into a 0/1 flag: does the row carry any information?

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Data with an ``additional_info`` column; a NumPy array is wrapped in a
        DataFrame with the single column name ``additional_info``.
    no_info_values : sequence of str, default=("No Info", "unknown")
        Labels that mean "nothing to report". "unknown" is the fill value the
        upstream constant imputer uses for missing entries, so imputed rows are
        no longer flagged as having information.

    Returns
    -------
    pandas.DataFrame
        ``X`` with ``additional_info`` replaced by the integer flag.
    """
    # Ensure X is a DataFrame
    if isinstance(X, np.ndarray):
        # Assuming the input is the 'additional_info' column as a NumPy array
        X = pd.DataFrame(X, columns=['additional_info'])

    has_info = ~X.additional_info.isin(list(no_info_values))
    return X.assign(additional_info=has_info.astype(int))

# additional_info: one-hot features side by side with the has-info flag.
info_union = FeatureUnion([
    ("part1", info_pipe1),
    ("part2", FunctionTransformer(func=have_info)),
])

# Missing additional_info values are filled with the constant "unknown" first.
info_transformer = Pipeline([
    ("imputer", SimpleImputer(fill_value="unknown", strategy="constant")),
    ("union", info_union),
])

# column transformer: routes each raw column (or pair of columns) to its own
# feature pipeline; columns not listed are passed through unchanged.
column_transformer = ColumnTransformer(
    [
        ("air", air_transformer, ["airline"]),
        ("doj", doj_transformer, ["date_of_journey"]),
        ("location", location_transformer, ["source", 'destination']),
        ("time", time_transformer, ["dep_time", "arrival_time"]),
        ("dur", duration_transformer, ["duration"]),
        ("stops", total_stops_transformer, ["total_stops"]),
        ("info", info_transformer, ["additional_info"]),
    ],
    remainder="passthrough",
)



In [7]:
# feature selector: scores each feature on its own with a small random forest
# and applies an r2 threshold of 0.1.
estimator = RandomForestRegressor(
    random_state=42,
    max_depth=3,
    n_estimators=10,
)

selector = SelectBySingleFeaturePerformance(
    threshold=0.1,
    scoring="r2",
    estimator=estimator,
)

In [8]:
# preprocessor: feature engineering followed by single-feature selection
preprocessor = Pipeline([
    ("ct", column_transformer),
    ("selector", selector),
])

In [9]:
# Fit on the training split only; "price" is the target and is kept out of X.
preprocessor.fit(train.drop(columns="price"), train["price"].copy())

  pd.to_datetime(
  pd.to_datetime(
  col: pd.to_datetime(X.loc[:, col]).dt.hour
  col: pd.to_datetime(X.loc[:, col]).dt.hour


In [10]:
# Inspect the selected features for the training split (columns are the generic x<i> names).
preprocessor.transform(train.drop(columns="price"))


  pd.to_datetime(
  pd.to_datetime(
  col: pd.to_datetime(X.loc[:, col]).dt.hour
  col: pd.to_datetime(X.loc[:, col]).dt.hour


Unnamed: 0,x1,x2,x4,x6,x8,x9,x10,x19,x22,x23,x24,x25,x26
0,0.0,1.0,0.0,0.176471,0.169492,-0.857930,-0.736484,-0.364262,2.0,0.0,-0.033916,1.0,0.0
1,0.0,1.0,0.0,0.235294,0.220339,1.065418,1.061694,-0.364262,2.0,0.0,0.046422,1.0,0.0
2,0.0,0.0,1.0,0.058824,0.067797,-0.857930,-0.736484,2.373008,0.0,0.0,-0.917631,0.0,1.0
3,0.0,0.0,0.0,0.882353,0.872881,-0.203928,-0.224351,-0.364262,2.0,0.0,-0.174507,1.0,0.0
4,0.0,1.0,0.0,0.117647,0.093220,-0.857930,-0.736484,-0.364262,2.0,0.0,-0.214676,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,0.0,1.0,0.0,0.176471,0.169492,1.065418,1.061694,-0.364262,2.0,1.0,2.597145,2.0,0.0
6691,0.0,0.0,0.0,0.529412,0.516949,-0.203928,-0.224351,-0.364262,2.0,0.0,-0.174507,1.0,0.0
6692,0.0,1.0,0.0,0.764706,0.779661,1.065418,1.061694,-0.364262,1.0,0.0,-0.666576,1.0,0.0
6693,0.0,0.0,1.0,1.000000,0.974576,1.065418,1.061694,-0.364262,1.0,0.0,-0.606322,1.0,0.0


In [11]:
def get_file_name(name):
    """Return the CSV file name used for the preprocessed version of split ``name``."""
    return "{}-pre.csv".format(name)

In [12]:
def export_data(data, name, pre):
    """Transform ``data`` with the fitted ``pre`` and write target + features to CSV.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw split containing a ``price`` column.
    name : str
        Split name; the output file is ``get_file_name(name)``.
    pre : fitted transformer
        Object exposing ``transform``; called on ``data`` without ``price``.

    Raises
    ------
    ValueError
        If the transformed data does not have one row per row of ``data``.
    """
    # split data into X and y subsets
    X = data.drop(columns="price")
    y = data.price.copy()

    # transformation; rows are matched to the target by position, so a
    # non-default index on `data` (or an ndarray result) cannot misalign
    # the index-based join below
    X_pre = pd.DataFrame(pre.transform(X)).set_axis(y.index, axis=0)

    # exporting
    file_name = get_file_name(name)
    (
        y
        .to_frame()
        .join(X_pre)
        .to_csv(file_name, index=False)
    )

In [13]:
def export_and_upload_bucket(data, name, pre):
    """Export the preprocessed split ``name`` to CSV via ``export_data``.

    NOTE(review): despite the name, nothing is uploaded -- the upload call
    below is commented out.
    """
    export_data(data, name, pre)
    # upload_to_bucket(name)

In [14]:
# Writes train-pre.csv (price + selected features).
export_and_upload_bucket(train, "train", preprocessor)

  pd.to_datetime(
  pd.to_datetime(
  col: pd.to_datetime(X.loc[:, col]).dt.hour
  col: pd.to_datetime(X.loc[:, col]).dt.hour


In [15]:
# Writes val-pre.csv using the preprocessor fitted on the training split.
export_and_upload_bucket(val, "val", preprocessor)

  pd.to_datetime(
  pd.to_datetime(
  col: pd.to_datetime(X.loc[:, col]).dt.hour
  col: pd.to_datetime(X.loc[:, col]).dt.hour


In [16]:
# Writes test-pre.csv using the preprocessor fitted on the training split.
export_and_upload_bucket(test, "test", preprocessor)

  pd.to_datetime(
  pd.to_datetime(
  col: pd.to_datetime(X.loc[:, col]).dt.hour
  col: pd.to_datetime(X.loc[:, col]).dt.hour


In [17]:
# Read back exactly the file export_data wrote: it writes to the relative path
# returned by get_file_name, so a hard-coded absolute path only matched when
# the working directory happened to be /content.
df1 = pd.read_csv(get_file_name("train"))
df1

Unnamed: 0,price,x1,x2,x4,x6,x8,x9,x10,x19,x22,x23,x24,x25,x26
0,7832,0.0,1.0,0.0,0.176471,0.169492,-0.857930,-0.736484,-0.364262,2.0,0.0,-0.033916,1.0,0.0
1,6540,0.0,1.0,0.0,0.235294,0.220339,1.065418,1.061694,-0.364262,2.0,0.0,0.046422,1.0,0.0
2,7305,0.0,0.0,1.0,0.058824,0.067797,-0.857930,-0.736484,2.373008,0.0,0.0,-0.917631,0.0,1.0
3,8366,0.0,0.0,0.0,0.882353,0.872881,-0.203928,-0.224351,-0.364262,2.0,0.0,-0.174507,1.0,0.0
4,11087,0.0,1.0,0.0,0.117647,0.093220,-0.857930,-0.736484,-0.364262,2.0,0.0,-0.214676,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,11093,0.0,1.0,0.0,0.176471,0.169492,1.065418,1.061694,-0.364262,2.0,1.0,2.597145,2.0,0.0
6691,8891,0.0,0.0,0.0,0.529412,0.516949,-0.203928,-0.224351,-0.364262,2.0,0.0,-0.174507,1.0,0.0
6692,10262,0.0,1.0,0.0,0.764706,0.779661,1.065418,1.061694,-0.364262,1.0,0.0,-0.666576,1.0,0.0
6693,6152,0.0,0.0,1.0,1.000000,0.974576,1.065418,1.061694,-0.364262,1.0,0.0,-0.606322,1.0,0.0


In [18]:

# train_test_split and xgboost are imported in the notebook's import cell.

# Define features (X) and target (y)
X = df1.drop('price', axis=1)
y = df1['price']

# Hold out 20% of the preprocessed training file for a first evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost Regressor model
xgbr = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgbr.fit(X_train, y_train)

In [19]:

# This cell used to repeat the previous cell verbatim (same imports, same split
# with the same random_state, same model fit), differing only in selecting the
# features positionally with df1.iloc[:, 1:]. That selection equals the
# name-based one only while "price" is the first column, so guard that
# assumption here instead of splitting and refitting a second time.
assert df1.columns[0] == "price", "expected 'price' to be the first column of the export"
assert X.equals(df1.iloc[:, 1:]), "positional and name-based feature selection disagree"

In [20]:
# Metric functions are imported in the notebook's import cell.

# Make predictions on the held-out 20% of the training file
y_pred = xgbr.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared: {r2:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")


Mean Squared Error: 5441206.00
Root Mean Squared Error: 2332.64
R-squared: 0.72
Mean Absolute Error: 1426.29


In [21]:
# GridSearchCV is imported in the notebook's import cell.

# Define the parameter grid (3 * 3 * 3 * 2 * 2 = 108 candidates, each fitted 5 times)
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xgbr, param_grid=param_grid,
                           scoring='r2', cv=5, n_jobs=-1)

# Fit GridSearchCV on the 80% training portion only
grid_search.fit(X_train, y_train)

# Get the best parameters and best (mean cross-validated) score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best R2 Score: {best_score:.2f}")

# Use the best model
best_model = grid_search.best_estimator_

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.8}
Best R2 Score: 0.78


In [22]:
# Preprocessed validation export. Bound to its own name so this cell does not
# replace the raw `val` frame that the export cell above takes as input, and
# read from the same relative path export_data wrote to.
val_pre = pd.read_csv(get_file_name("val"))
val_pre


Unnamed: 0,price,x1,x2,x4,x6,x8,x9,x10,x19,x22,x23,x24,x25,x26
0,5054,1.0,0.0,0.0,1.000000,0.974576,1.065418,1.061694,-0.364262,1.0,0.0,-0.656533,1.0,0.0
1,9646,0.0,0.0,0.0,0.882353,0.872881,1.065418,1.061694,-0.364262,2.0,0.0,0.267351,1.0,0.0
2,11087,0.0,1.0,0.0,0.117647,0.093220,-0.857930,-0.736484,-0.364262,2.0,0.0,0.699166,1.0,0.0
3,13587,0.0,0.0,0.0,0.823529,0.822034,1.065418,1.061694,-0.364262,2.0,0.0,-0.305056,1.0,0.0
4,16704,0.0,1.0,0.0,0.647059,0.661017,1.065418,1.061694,-0.364262,2.0,1.0,2.265752,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1669,3597,0.0,0.0,1.0,0.529412,0.516949,-1.892191,-0.736484,-0.364262,0.0,0.0,-0.997969,0.0,1.0
1670,5069,1.0,0.0,0.0,0.529412,0.516949,-0.203928,-0.224351,-0.364262,1.0,0.0,-0.686660,1.0,0.0
1671,15544,0.0,1.0,0.0,0.764706,0.737288,1.065418,1.061694,-0.364262,2.0,0.0,-0.415520,2.0,0.0
1672,3210,0.0,1.0,0.0,0.882353,0.872881,-1.892191,-0.736484,-0.364262,0.0,0.0,-1.088349,0.0,1.0


In [23]:
# Preprocessed test export. Bound to its own name so this cell does not replace
# the raw `test` frame that the export cell above takes as input, and read from
# the same relative path export_data wrote to.
test_pre = pd.read_csv(get_file_name("test"))
test_pre

Unnamed: 0,price,x1,x2,x4,x6,x8,x9,x10,x19,x22,x23,x24,x25,x26
0,17996,0.0,1.0,0.0,0.058824,0.042373,-0.857930,-0.736484,-0.364262,2.0,1.0,1.653176,1.0,0.0
1,3873,0.0,0.0,1.0,0.823529,0.822034,-0.203928,-0.224351,-0.364262,0.0,0.0,-0.987926,0.0,1.0
2,4462,1.0,0.0,0.0,0.176471,0.144068,-0.203928,-0.224351,3.112489,0.0,0.0,-0.927673,0.0,1.0
3,3597,1.0,0.0,0.0,1.000000,1.000000,-1.892191,-0.736484,-0.364262,0.0,0.0,-0.987926,0.0,1.0
4,4804,1.0,0.0,0.0,0.588235,0.559322,-0.203928,-0.224351,-0.364262,0.0,0.0,-0.967842,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2088,12898,0.0,1.0,0.0,0.764706,0.737288,1.065418,1.061694,-0.364262,2.0,1.0,0.819672,1.0,0.0
2089,7155,0.0,0.0,0.0,1.000000,1.000000,1.065418,1.061694,-0.364262,2.0,0.0,-0.325140,1.0,0.0
2090,11627,0.0,1.0,0.0,0.823529,0.796610,1.065418,1.061694,-0.364262,2.0,1.0,1.884147,1.0,0.0
2091,6795,0.0,0.0,0.0,0.823529,0.822034,1.065418,1.061694,-0.364262,2.0,0.0,-0.033916,1.0,0.0


In [24]:
# numpy, pandas and the metric functions are imported in the notebook's import cell.

# Load the preprocessed validation and test exports under their own names so
# this cell does not rebind `val` / `test`; paths come from get_file_name, the
# same helper export_data writes with.
val_pre = pd.read_csv(get_file_name("val"))
test_pre = pd.read_csv(get_file_name("test"))

# Split into features and target
X_val = val_pre.drop(columns="price")
y_val = val_pre["price"]

# NOTE: X_test / y_test below replace the 20% hold-out split created earlier
# from the training file; re-run that split cell before re-running the
# hold-out metrics.
X_test = test_pre.drop(columns="price")
y_test = test_pre["price"]

# Predict using the best model
y_val_pred = best_model.predict(X_val)
y_test_pred = best_model.predict(X_test)

# Evaluate on validation set
val_mse = mean_squared_error(y_val, y_val_pred)
val_rmse = np.sqrt(val_mse)
val_r2 = r2_score(y_val, y_val_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)

print("Validation Results:")
print(f"  R-squared: {val_r2:.2f}")
print(f"  RMSE: {val_rmse:.2f}")
print(f"  MAE: {val_mae:.2f}")

# Evaluate on test set
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print("\nTest Results:")
print(f"  R-squared: {test_r2:.2f}")
print(f"  RMSE: {test_rmse:.2f}")
print(f"  MAE: {test_mae:.2f}")


Validation Results:
  R-squared: 0.73
  RMSE: 2472.50
  MAE: 1457.58

Test Results:
  R-squared: 0.73
  RMSE: 2352.29
  MAE: 1437.84


In [None]:
# joblib is imported in the notebook's import cell.

# Save the tuned model selected by the grid search
joblib.dump(best_model, "xgboost_model.pkl")

In [None]:
# Save the fitted preprocessing pipeline.
# NOTE(review): the pipeline wraps functions and a class defined in this notebook
# (is_north, part_of_day, RBFPercentileSimilarity, ...); presumably they must be
# defined/importable wherever this file is loaded -- confirm before deploying.
joblib.dump(preprocessor, "preprocessor.pkl")