In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [None]:
# Display the loaded frame to eyeball its columns and typical values.
df

Unnamed: 0,Type,Months_Old,Price
0,570ES,46,427.78
1,82MS,36,237.80
2,570ES,45,365.95
3,570ES,8,189.89
4,82MS,29,66.77
...,...,...,...
1995,82MS,50,162.78
1996,570ES,5,209.15
1997,991MS,30,141.76
1998,570ES,56,511.92


In [None]:
# Print column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Type        2000 non-null   object 
 1   Months_Old  2000 non-null   int64  
 2   Price       2000 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 47.0+ KB


In [None]:
# Splitting the dataset based on Months_Old into three disjoint age ranges.
# Each split is taken as an explicit `.copy()` so that later column
# assignments operate on an independent frame instead of a slice of `df`
# (which is what triggers pandas' SettingWithCopyWarning).
# NOTE: validation and test rows are strictly younger than every training
# row, so both evaluate the model outside the age range it was trained on.
train_df = df[df["Months_Old"] > 36].copy()  # Train: months_old > 36
val_df = df[(df["Months_Old"] >= 12) & (df["Months_Old"] <= 36)].copy()  # Validation: 12 ≤ months_old ≤ 36
test_df = df[df["Months_Old"] < 12].copy()  # Test: months_old < 12

In [None]:
# Column names of the model inputs and of the (single-column) target.
input_cols = ['Type', 'Months_Old']
output_col = ['Price']

# For every split, keep a feature frame and a one-column target frame.
train_input, train_output = train_df[input_cols], train_df[output_col]
val_input, val_output = val_df[input_cols], val_df[output_col]
test_input, test_output = test_df[input_cols], test_df[output_col]

In [None]:
# Partition the input columns by dtype: numeric columns vs. object (string) columns.
inputs_numerical = list(train_input.select_dtypes(include=np.number).columns)
inputs_categorical = list(train_input.select_dtypes(include='object').columns)
print(inputs_categorical, inputs_numerical, sep='\n')

['Type']
['Months_Old']


In [None]:
# Summary statistics of the numeric input columns in the training split.
train_df.loc[:, inputs_numerical].describe()

Unnamed: 0,Months_Old
count,767.0
mean,47.787484
std,6.551698
min,37.0
25%,42.0
50%,48.0
75%,54.0
max,59.0


In [None]:
# Number of distinct values per categorical input column in the training split.
train_df.loc[:, inputs_categorical].nunique()

Unnamed: 0,0
Type,3


In [None]:
# Missing values: count NaNs in every column the model relies on — the
# categorical and numeric inputs as well as the target — rather than only
# the numeric inputs, so a gap in `Type` or `Price` cannot go unnoticed.
df[input_cols + output_col].isna().sum()

Unnamed: 0,0
Months_Old,0


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rebind each split to an explicit copy before assigning into it. The splits
# were created by boolean-mask slicing of `df`, and writing columns into such
# a slice raises pandas' SettingWithCopyWarning.
train_df = train_df.copy()
val_df = val_df.copy()
test_df = test_df.copy()

scaler = MinMaxScaler()

# Fit only on train data so no statistics leak from validation/test.
scaler.fit(train_df[inputs_numerical])

# Transform train, val, and test with the train-derived min/max.
# NOTE: the splits cover disjoint Months_Old ranges (train > 36, val 12..36,
# test < 12), so scaled validation/test values fall below 0 rather than
# inside [0, 1].
train_df[inputs_numerical] = scaler.transform(train_df[inputs_numerical])
val_df[inputs_numerical] = scaler.transform(val_df[inputs_numerical])
test_df[inputs_numerical] = scaler.transform(test_df[inputs_numerical])

print(scaler.data_min_)
print(scaler.data_max_)

[37.]
[59.]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[inputs_numerical] = scaler.transform(train_df[inputs_numerical])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df[inputs_numerical] = scaler.transform(val_df[inputs_numerical])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[inputs_numerical] = scaler.transform(test_df[inputs_n

In [None]:
# Display the training split to inspect its current column values.
train_df

Unnamed: 0,Type,Months_Old,Price
0,570ES,0.409091,427.78
2,570ES,0.363636,365.95
5,82MS,0.818182,132.22
6,570ES,0.727273,251.61
7,991MS,0.818182,149.81
...,...,...,...
1989,570ES,0.045455,352.65
1993,82MS,0.545455,87.23
1994,82MS,0.363636,131.15
1995,82MS,0.590909,162.78


In [None]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd


def _one_hot_merge(frame, fitted_encoder, categorical_cols):
    """Encode the categorical columns of one split and merge them back.

    Parameters
    ----------
    frame : pandas.DataFrame
        Split to encode; must contain every column in ``categorical_cols``.
    fitted_encoder : OneHotEncoder
        Encoder already fitted on the training split, configured for dense output.
    categorical_cols : list of str
        Names of the columns to replace by their one-hot encoding.

    Returns
    -------
    (encoded, merged) : tuple of pandas.DataFrame
        ``encoded`` holds only the one-hot columns, indexed like ``frame``.
        ``merged`` is ``frame`` without ``categorical_cols`` plus ``encoded``.
    """
    encoded = pd.DataFrame(
        fitted_encoder.transform(frame[categorical_cols]),
        columns=fitted_encoder.get_feature_names_out(categorical_cols),
        index=frame.index,  # keep the original index so rows stay aligned in concat
    )
    merged = pd.concat([frame.drop(columns=categorical_cols), encoded], axis=1)
    return encoded, merged


# Step 1: Initialize OneHotEncoder (dense array output; unknown categories are ignored)
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Step 2: Fit only on train data to prevent data leakage
encoder.fit(train_df[inputs_categorical])

# Steps 3-5: Encode each split, drop its raw categorical columns and append
# the encoded ones, all without resetting the index.
train_encoded, train_df = _one_hot_merge(train_df, encoder, inputs_categorical)
val_encoded, val_df = _one_hot_merge(val_df, encoder, inputs_categorical)
test_encoded, test_df = _one_hot_merge(test_df, encoder, inputs_categorical)

# Step 6: Verify NaN values (a misaligned index in the concat would surface here)
print(train_df.isna().sum().sum(), val_df.isna().sum().sum(), test_df.isna().sum().sum())


0 0 0


Index(['Months_Old', 'Price', 'Type_570ES', 'Type_82MS', 'Type_991MS'], dtype='object')
['Type']


In [None]:
# Display the training split to confirm the one-hot columns replaced `Type`.
train_df

Unnamed: 0,Months_Old,Price,Type_570ES,Type_82MS,Type_991MS
0,0.409091,427.78,1.0,0.0,0.0
2,0.363636,365.95,1.0,0.0,0.0
5,0.818182,132.22,0.0,1.0,0.0
6,0.727273,251.61,1.0,0.0,0.0
7,0.818182,149.81,0.0,0.0,1.0
...,...,...,...,...,...
1989,0.045455,352.65,1.0,0.0,0.0
1993,0.545455,87.23,0.0,1.0,0.0
1994,0.363636,131.15,0.0,1.0,0.0
1995,0.590909,162.78,0.0,1.0,0.0


In [None]:
# Assemble the model matrices: numeric inputs side by side with the one-hot columns.
# NOTE(review): the numeric columns are read from the `*_input` frames, which
# were sliced before MinMax scaling, so `Months_Old` here is in raw months
# rather than scaled — confirm this is the intended feature.
train_final, val_final, test_final = (
    pd.concat([raw_inputs[inputs_numerical], one_hot], axis=1)
    for raw_inputs, one_hot in (
        (train_input, train_encoded),
        (val_input, val_encoded),
        (test_input, test_encoded),
    )
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# The targets are single-column DataFrames; scikit-learn expects a 1-D target
# and otherwise emits a DataConversionWarning on fit, so flatten them once.
y_train = train_output.to_numpy().ravel()
y_val = val_output.to_numpy().ravel()

# Random-forest baseline with a fixed seed for reproducible results.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(train_final, y_train)

train_preds = model.predict(train_final)
val_preds = model.predict(val_final)

# RMSE is the square root of the mean squared error.
train_rmse = mean_squared_error(y_train, train_preds) ** 0.5
val_rmse = mean_squared_error(y_val, val_preds) ** 0.5

print(f"Train RMSE: {train_rmse:.2f}, Validation RMSE: {val_rmse:.2f}")


  return fit_method(estimator, *args, **kwargs)


Train RMSE: 106.08, Validation RMSE: 121.73


In [None]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Gradient-boosted regressor with fixed hyper-parameters and seed.
model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
)

# Fit on the training matrix only.
model.fit(train_final, train_output)

# Predict on all three splits with the fitted model.
train_preds, val_preds, test_preds = (
    model.predict(features) for features in (train_final, val_final, test_final)
)

# RMSE per split: square root of the mean squared error.
train_rmse, val_rmse, test_rmse = (
    mean_squared_error(actual, predicted) ** 0.5
    for actual, predicted in (
        (train_output, train_preds),
        (val_output, val_preds),
        (test_output, test_preds),
    )
)

print(f"Train RMSE: {train_rmse:.2f}, Validation RMSE: {val_rmse:.2f}, Test RMSE: {test_rmse:.2f}")


Train RMSE: 106.01, Validation RMSE: 121.91, Test RMSE: 114.71


In [None]:
# Display the training feature matrix to check its column order and value scale.
train_final

Unnamed: 0,Months_Old,Type_570ES,Type_82MS,Type_991MS
0,46,1.0,0.0,0.0
2,45,1.0,0.0,0.0
5,55,0.0,1.0,0.0
6,53,1.0,0.0,0.0
7,55,0.0,0.0,1.0
...,...,...,...,...
1989,38,1.0,0.0,0.0
1993,49,0.0,1.0,0.0
1994,45,0.0,1.0,0.0
1995,50,0.0,1.0,0.0


In [None]:
# Predict the price of a single 36-month-old item of type 82MS.
# The row is built by column name and then reordered to the training layout,
# so the call no longer depends silently on the positional order of the
# features; a missing or renamed column raises a KeyError instead.
sample = pd.DataFrame(
    [{'Months_Old': 36, 'Type_570ES': 0, 'Type_82MS': 1, 'Type_991MS': 0}]
)[train_final.columns.tolist()]
model.predict(sample)

array([186.25943], dtype=float32)

In [None]:
import joblib

In [None]:
# Bundle the fitted model with the ordered feature names it was trained on,
# then persist the bundle to disk.
calculator = dict(
    model=model,
    final_cols=list(train_final.columns),
)
joblib.dump(calculator, 'calculatorModel.joblib')

NameError: name 'calculatorModel' is not defined