In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sb 

In [None]:
# Load the competition training data and the original used-cars data,
# bring the original data into the same format, and stack the two.

# --- new data -------------------------------------------------------------
data_new = pd.read_csv('/kaggle/input/playground-series-s4e9/train.csv')

# --- old data -------------------------------------------------------------
data_old = pd.read_csv('/kaggle/input/old-data/used_cars.csv')

# 'milage' and 'price' are cleaned as text: the unit / currency symbol and the
# thousands separators are removed before the numeric conversion.
# regex=False makes every pattern a literal. Without it the result depends on
# the pandas version, because '.' and '$' are regex metacharacters and the
# default of `regex` changed between pandas 1.x and 2.x.
data_old['milage'] = pd.to_numeric(
    data_old['milage']
    .str.replace('mi.', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
)
data_old['price'] = pd.to_numeric(
    data_old['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
)

# The old data gets ids that continue right after the last id of the new data.
# The range is derived from the frames themselves (the original hard-coded
# 188533..192541), so it stays correct if either file changes size.
first_new_id = int(data_new['id'].max()) + 1
new_id = pd.DataFrame(
    np.arange(first_new_id, first_new_id + len(data_old)),
    columns=['id'],
)
data_old = pd.concat([new_id, data_old], axis=1)

# --- final data -----------------------------------------------------------
data = pd.concat([data_new, data_old], axis=0, ignore_index=True)
data

In [None]:
# Distinct values of the 'brand' column.
data['brand'].unique()


In [None]:
# Distinct values of the 'model' column.
data['model'].unique()


In [None]:
# Number of distinct values in 'model', shown as the shape of the unique array.
model_values = data['model'].unique()
model_values.shape

In [None]:
# Distinct model years in ascending order.
np.sort(data['model_year'].unique())

In [None]:
# Distinct values of the 'fuel_type' column.
data['fuel_type'].unique()

In [None]:
# Distinct values of the 'engine' column.
data['engine'].unique()

In [None]:
# Number of distinct values in 'engine', shown as the shape of the unique array.
engine_values = data['engine'].unique()
engine_values.shape

In [None]:
# Distinct values of the 'transmission' column.
data['transmission'].unique()

In [None]:
# Distinct values of the 'ext_col' column.
data['ext_col'].unique()

In [None]:
# Distinct values of the 'int_col' column.
data['int_col'].unique()

In [None]:
# Distinct values of the 'accident' column.
data['accident'].unique()

In [None]:
# Distinct values of the 'clean_title' column.
data['clean_title'].unique()

In [None]:
# Baseline: ordinary least squares predicting price from milage alone.

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Seed reused by the splits and models in the cells below.
rs = 7

# Single-feature design matrix and target, both as (n, 1) column arrays.
X = data[['milage']].to_numpy()
y = data[['price']].to_numpy()

# 80 % of the rows go to training, the remainder to validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.8, random_state=rs
)

model = LinearRegression()
model.fit(X_train, y_train)



In [None]:
# Root-mean-squared error of the current `model` on the train and validation splits.

from sklearn.metrics import mean_squared_error

pred_train = model.predict(X_train)
pred_val = model.predict(X_val)

mse_train = mean_squared_error(y_train, pred_train)
mse_val = mean_squared_error(y_val, pred_val)

# RMSE is the square root of the mean squared error.
rmse_train, rmse_val = np.sqrt(mse_train), np.sqrt(mse_val)

print('rmse_train is ', rmse_train)
print('rmse_val is', rmse_val)

In [None]:
# Fitted coefficient(s) of the current `model`.
model.coef_

In [None]:
# Fitted intercept of the current `model`.
model.intercept_

In [None]:
# Scatter plot of milage against price with the model's predictions overlaid.

# 1-D views so that x, y and the predictions all have shape (n,).
X = X.flatten()
y = y.flatten()

# Ask the estimator for its predictions instead of evaluating
# `model.coef_ * X + model.intercept_` by hand: with a 2-D target `coef_` is
# 2-D, so that expression broadcasts to a (1, n) array rather than (n,).
y_pred = model.predict(X.reshape(-1, 1)).ravel()

fig, ax = plt.subplots()
ax.scatter(X, y, label='data')
ax.scatter(X, y_pred, c='r', label='prediction')
ax.legend()
ax.set_xlabel('milage')
ax.set_ylabel('price')
ax.set_title('price vs. milage')
plt.show()


In [None]:
# Correlation matrix of model year, milage and price, drawn as an annotated heatmap.

X = data[['model_year', 'milage', 'price']]
corr_matrix = X.corr()
dataplot = sb.heatmap(corr_matrix, cmap="YlGnBu", annot=True)


Price is highly correlated with milage and model year.


In [None]:
# Linear regression on two features: model year and milage.

feature_columns = ['model_year', 'milage']
X = data[feature_columns]
y = data[['price']]

# Same 80/20 split and seed as before.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.8, random_state=rs
)

model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate on both splits; RMSE is the square root of the MSE.
mse_train = mean_squared_error(y_train, model.predict(X_train))
mse_val = mean_squared_error(y_val, model.predict(X_val))
rmse_train, rmse_val = np.sqrt(mse_train), np.sqrt(mse_val)

print('rmse_train is ', rmse_train)
print('rmse_val is', rmse_val)


In [None]:
# Random forest on model year and milage (reuses the current train/validation split).

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, max_depth=6, random_state=rs)

# `y_train` is a one-column DataFrame; the forest expects a 1-D target and
# otherwise emits a DataConversionWarning, so flatten it explicitly.
model.fit(X_train, np.ravel(y_train))

mse_train = mean_squared_error(y_train, model.predict(X_train))
mse_val = mean_squared_error(y_val, model.predict(X_val))

rmse_train = np.sqrt(mse_train)
rmse_val = np.sqrt(mse_val)

print('rmse_train is ', rmse_train)
print('rmse_val is', rmse_val)



In [None]:
# Gradient boosting (XGBoost) on model year and milage.

from xgboost import XGBRegressor

# Early stopping is configured on the estimator: passing `early_stopping_rounds`
# to `fit()` is deprecated since XGBoost 1.6 and rejected by later releases.
# 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective.
model = XGBRegressor(
    n_estimators=5000,
    learning_rate=0.1,
    max_depth=6,
    objective='reg:squarederror',
    early_stopping_rounds=40,
    random_state=rs,
)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

# Validation RMSE recorded after each boosting round.
rmse = model.evals_result()['validation_0']['rmse']
plt.plot(rmse)
plt.xlabel("iteration")
plt.ylabel("loss")

# best iteration
print('best iteration is', model.best_iteration)

# loss corresponding to best iteration
print('loss corresponding to best iteration is', rmse[model.best_iteration])


In [None]:
# Feature set 2: numeric columns plus one-hot encoded fuel type, accident and clean title.

from sklearn.preprocessing import OneHotEncoder

numerical_data = data[['milage', 'model_year']]
categorical_data = data[['fuel_type', 'accident', 'clean_title']]
y = data[['price']]

# Fit the encoder and encode the same frame in one step; the dense array is
# needed to wrap the result in a DataFrame.
ohe = OneHotEncoder()
ohe_data = ohe.fit_transform(categorical_data).toarray()

# Numeric columns first, then the indicator columns (labelled 0..k-1).
encoded_frame = pd.DataFrame(ohe_data)
X = pd.concat([numerical_data, encoded_frame], axis=1)


In [None]:
# Display the assembled feature matrix.
X

In [None]:
# XGBoost on the current feature matrix `X` (max_depth=3).

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=rs)

# Early stopping is configured on the estimator: passing `early_stopping_rounds`
# to `fit()` is deprecated since XGBoost 1.6 and rejected by later releases.
# 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective.
model = XGBRegressor(
    n_estimators=5000,
    learning_rate=0.1,
    max_depth=3,
    objective='reg:squarederror',
    early_stopping_rounds=40,
    random_state=rs,
)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

# Validation RMSE recorded after each boosting round.
rmse = model.evals_result()['validation_0']['rmse']
plt.plot(rmse)
plt.xlabel("iteration")
plt.ylabel("loss")

# best iteration
print('best iteration is', model.best_iteration)

# loss corresponding to best iteration
print('loss corresponding to best iteration is', rmse[model.best_iteration])

In [None]:
# Feature set 3: additionally one-hot encode brand and transmission.

numerical_data = data[['milage', 'model_year']]
categorical_data = data[['fuel_type', 'accident', 'clean_title', 'brand', 'transmission']]
y = data[['price']]

# Fit the encoder and encode the same frame in one step.
ohe = OneHotEncoder()
ohe_data = ohe.fit_transform(categorical_data).toarray()

# Numeric columns first, then the indicator columns (labelled 0..k-1).
encoded_frame = pd.DataFrame(ohe_data)
X = pd.concat([numerical_data, encoded_frame], axis=1)


In [None]:
# Display the assembled feature matrix.
X

In [None]:
# XGBoost on the current feature matrix `X` (max_depth=2).

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=rs)

# Early stopping is configured on the estimator: passing `early_stopping_rounds`
# to `fit()` is deprecated since XGBoost 1.6 and rejected by later releases.
# 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective.
model = XGBRegressor(
    n_estimators=5000,
    learning_rate=0.1,
    max_depth=2,
    objective='reg:squarederror',
    early_stopping_rounds=40,
    random_state=rs,
)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

# Validation RMSE recorded after each boosting round.
rmse = model.evals_result()['validation_0']['rmse']
plt.plot(rmse)
plt.xlabel("iteration")
plt.ylabel("loss")

# best iteration
print('best iteration is', model.best_iteration)

# loss corresponding to best iteration
print('loss corresponding to best iteration is', rmse[model.best_iteration])

In [None]:
# Feature set 4: additionally one-hot encode the engine description.

numerical_data = data[['milage', 'model_year']]
categorical_data = data[['fuel_type', 'accident', 'clean_title', 'brand', 'transmission', 'engine']]
y = data[['price']]

# handle_unknown='ignore' lets this encoder transform categories it was not
# fitted on (they become all-zero rows) instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore')
ohe_data = ohe.fit_transform(categorical_data).toarray()

# Numeric columns first, then the indicator columns (labelled 0..k-1).
encoded_frame = pd.DataFrame(ohe_data)
X = pd.concat([numerical_data, encoded_frame], axis=1)


In [None]:
# Display the assembled feature matrix.
X

In [None]:
# XGBoost on the current feature matrix `X` (max_depth=1).

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=rs)

# Early stopping is configured on the estimator: passing `early_stopping_rounds`
# to `fit()` is deprecated since XGBoost 1.6 and rejected by later releases.
# 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective.
model = XGBRegressor(
    n_estimators=5000,
    learning_rate=0.1,
    max_depth=1,
    objective='reg:squarederror',
    early_stopping_rounds=40,
    random_state=rs,
)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

# Validation RMSE recorded after each boosting round.
rmse = model.evals_result()['validation_0']['rmse']
plt.plot(rmse)
plt.xlabel("iteration")
plt.ylabel("loss")

# best iteration
print('best iteration is', model.best_iteration)

# loss corresponding to best iteration
print('loss corresponding to best iteration is', rmse[model.best_iteration])

In [None]:
# prediction on unlabeled data

unlabeled_data=pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv')

test_ids=unlabeled_data['id']

numerical_data=unlabeled_data[['milage','model_year']]
categorical_data=unlabeled_data[['fuel_type','accident','clean_title','brand','transmission','engine']]


ohe_data = ohe.transform(categorical_data).toarray()

X=pd.concat([numerical_data,pd.DataFrame(ohe_data)],axis=1)

predictions=model.predict(X)


# submission file 

print('Generating submission.csv file...')

# Write the submission file
np.savetxt(
    'submission.csv',
    np.rec.fromarrays([test_ids, predictions]),
    fmt=['%s', '%d'],
    delimiter=',',
    header='id,label',
    comments='',
)

# Look at the first few predictions
!head submission.csv

In [None]:
# Experiment with all categorical features, including model and colours.

# NOTE: this reloads the competition training file only and overwrites `data`,
# so the combined (new + old) frame built earlier is no longer used from here on.
data = pd.read_csv('/kaggle/input/playground-series-s4e9/train.csv')

numerical_data = data[['milage', 'model_year']]
categorical_data = data[['fuel_type', 'accident', 'clean_title', 'brand', 'transmission', 'engine', 'model', 'ext_col', 'int_col']]
y = data[['price']]

ohe = OneHotEncoder(handle_unknown='ignore').fit(categorical_data)
ohe_data = ohe.transform(categorical_data).toarray()

X = pd.concat([numerical_data, pd.DataFrame(ohe_data)], axis=1)

# running xgboost again

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=rs)

# Early stopping is configured on the estimator: passing `early_stopping_rounds`
# to `fit()` is deprecated since XGBoost 1.6 and rejected by later releases.
# 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective.
model = XGBRegressor(
    n_estimators=5000,
    learning_rate=0.5,
    max_depth=1,
    objective='reg:squarederror',
    early_stopping_rounds=80,
    random_state=rs,
)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

# Validation RMSE recorded after each boosting round.
rmse = model.evals_result()['validation_0']['rmse']
plt.plot(rmse)
plt.xlabel("iteration")
plt.ylabel("loss")

# best iteration
print('best iteration is', model.best_iteration)

# loss corresponding to best iteration
print('loss corresponding to best iteration is', rmse[model.best_iteration])

It seems that adding the model type or the colours does not improve the result much.
The best RMSE so far is 68797.47268671425.

# Next possibilities 

1) Try with all features but increase early stopping rounds maybe to 60, 80 or so.

2) Add an extra feature, milage * (2025 - year), so that a smaller value of this feature corresponds to a higher price.
   Result: no difference.
   
3) Try a different learning rate: a higher learning rate speeds up training, but whether it improves accuracy is doubtful.

   Result: the learning rate doesn't make a difference; lr=0.1 is a good estimate.
   
4) Try different algorithms, such as a neural network?

5) Add more data to training: not interested.

And so on.........