In [1]:
import pandas as pd

# Load the car listings from the CSV file into a DataFrame
cars = pd.read_csv('../../data/cars.csv')

# Display the first five cars to get a feel for the data
cars.head()

Unnamed: 0,Price,Year,Mileage,City,State,Vin,Make,Model
0,8995,2014,35725,El Paso,TX,19VDE2E53EE000083,Acura,ILX6-Speed
1,10888,2013,19606,Long Island City,NY,19VDE1F52DE012636,Acura,ILX5-Speed
2,8995,2013,48851,El Paso,TX,19VDE2E52DE000025,Acura,ILX6-Speed
3,10999,2014,39922,Windsor,CO,19VDE1F71EE003817,Acura,ILX5-Speed
4,14799,2016,22142,Lindon,UT,19UDE2F32GA001284,Acura,ILXAutomatic


### For each fold, a performance score (here, the RMSE) is calculated, which indicates the model's performance on the held-out data of that fold.

In [2]:
import math
import pandas as pd
from sklearn import linear_model
from sklearn import model_selection
from sklearn.metrics import mean_squared_error

# Read data
cars = pd.read_csv('../../data/cars.csv')

# Prepare data matrix and target vector
X = cars[['Year', 'Mileage']].values
y = cars['Price']

# Perform five-fold cross-validation.
# A fixed random_state makes the shuffled folds -- and therefore every
# number printed below -- reproducible across kernel restarts.
car_splits = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
rmses = []

# Iterate over each fold
for train_index, test_index in car_splits.split(X):
    # Split into training and test data.
    # KFold yields *positional* indices, so the Series is indexed with .iloc;
    # plain y[train_index] would look the values up as index labels and only
    # works by coincidence while the index is the default 0..n-1 range.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit linear regression model on the training part of the fold
    lr = linear_model.LinearRegression()
    lr.fit(X_train, y_train)

    # Print coefficients
    print(f'Intercept: {lr.intercept_}')
    print(f'Coefficients: {lr.coef_}')

    # Compute RMSE on the held-out part of the fold
    rmse = math.sqrt(mean_squared_error(y_test, lr.predict(X_test)))
    print(f'RMSE: {rmse}')
    rmses.append(rmse)

    print('-------')

# Calculate and print mean RMSE across all folds
mean_rmse = sum(rmses) / len(rmses)
print(f'Mean RMSE: {mean_rmse}')

Intercept: -1518188.0079261663
Coefficients: [ 7.67053522e+02 -8.84020902e-02]
RMSE: 12123.568406841809
-------
Intercept: -1518729.9010896431
Coefficients: [ 7.67317998e+02 -8.84617298e-02]
RMSE: 12288.7376330792
-------
Intercept: -1450921.0110993078
Coefficients: [ 7.33726888e+02 -9.18804269e-02]
RMSE: 12107.442492538834
-------
Intercept: -1509732.648062994
Coefficients: [ 7.62855627e+02 -8.85323636e-02]
RMSE: 12320.09135847702
-------
Intercept: -1505804.2166468957
Coefficients: [ 7.60911981e+02 -8.88756343e-02]
RMSE: 12249.300791210462
-------
Mean RMSE: 12217.828136429463


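Each RMSE printed above measures the model's prediction error on the held-out data of one cross-validation fold; the mean RMSE across the five folds (about 12,218) summarizes how well the model can be expected to perform on unseen cars.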

## Adding more features to try to improve the model

In [None]:
import math
import pandas as pd
from sklearn import linear_model
from sklearn import model_selection
from sklearn.metrics import mean_squared_error

# Read data
cars = pd.read_csv('../../data/cars.csv')

# Prepare numerical features
numerical_features = ['Year', 'Mileage']
cars_numerical = cars[numerical_features]

# Prepare additional features by one-hot encoding.
# dtype=float keeps the indicator columns numeric regardless of the dummy
# dtype pandas defaults to, so the combined matrix below is a plain float
# array rather than a mixed-type one.
categorical_features = ['State', 'Make']
cars_encoded = pd.get_dummies(
    cars[categorical_features],
    prefix=categorical_features,
    dtype=float,
)

# Combine numerical and encoded features column-wise
cars_combined = pd.concat([cars_numerical, cars_encoded], axis=1)

# Prepare data matrix and target vector
X = cars_combined.values
y = cars['Price']

# Perform five-fold cross-validation.
# A fixed random_state makes the shuffled folds -- and therefore every
# number printed below -- reproducible across kernel restarts.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
rmses = []
for train_index, test_index in kf.split(X):

    # Split into training and test data.
    # KFold yields *positional* indices, so the Series is indexed with .iloc;
    # plain y[train_index] would look the values up as index labels and only
    # works by coincidence while the index is the default 0..n-1 range.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit linear regression model on the training part of the fold
    lr = linear_model.LinearRegression()
    lr.fit(X_train, y_train)

    # Print coefficients
    print('Intercept:', lr.intercept_)
    print('Coefficients:\n', lr.coef_)

    # Compute RMSE on the held-out part of the fold
    rmse = math.sqrt(mean_squared_error(y_test, lr.predict(X_test)))
    print('RMSE:', rmse)
    rmses.append(rmse)

    print('-------')

# Mean RMSE across all folds
print('Mean RMSE:', sum(rmses) / len(rmses))


Intercept: -2301401.70081356
Coefficients:
 [ 1.16601820e+03 -6.10130222e-02  1.32329873e+03  8.30612629e+02
  1.05620167e+03 -7.31473733e+02 -2.19443285e+03 -1.01318508e+03
  1.35426961e+03 -1.48010456e+03 -3.25932171e+03  3.67241550e+02
 -7.26938098e+02 -1.71232936e+03 -7.66488770e+02 -4.99534353e+02
  1.34271537e+03 -1.00063737e+03  8.21546160e+02  1.12192199e+03
 -8.28513360e+02 -9.69680491e+02 -6.91818666e+02 -1.31546378e+03
  6.00058336e+02 -9.01397462e+02 -1.12359056e+03  1.09464032e+02
 -1.80356385e+03 -2.93788761e+01 -5.39136867e+02  1.88415345e+03
  3.73518089e+03 -1.47774212e+03  1.25048903e+02  3.52119025e+02
  2.30337685e+03 -3.84099518e+02 -3.42244259e+02  1.31852982e+03
 -6.83525502e+02 -7.22942360e+02 -1.59848122e+03  4.88897848e+02
  1.19126852e+03 -1.57074974e+03 -3.09784847e+02 -9.55286729e+02
  1.18614968e+02  3.70744878e+03  6.18206038e+02  8.60314915e+02
  4.94096135e+02 -1.02577210e+03 -8.25456804e+02 -1.28237220e+03
  1.50785644e+03 -5.25306696e+02  3.24171931e+

By including the state and make as additional features, we observed a significant improvement in the RMSE. However, we made a conscious decision not to include the city or model as they would have introduced a large number of extra features, potentially leading to overfitting.