In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the vehicle dataset CSV from the notebook's input directory and
# preview its first rows.
DATA_PATH = "../input/vehicle-dataset-from-cardekho/car data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [4]:
# List the distinct values of each categorical column, with a dashed rule
# printed between consecutive columns.
separator = "------------------------------------"
categorical_columns = ['Seller_Type', 'Fuel_Type', 'Transmission', 'Owner']
for position, column in enumerate(categorical_columns):
    if position > 0:
        print(separator)
    print(df[column].unique())

In [5]:
# Bar plot of Selling_Price against each categorical column (bar height is
# seaborn's default estimator, the mean), one panel per column.
categorical_columns = ['Seller_Type', 'Fuel_Type', 'Transmission', 'Owner']

fig, axes = plt.subplots(2, 3, figsize=(16, 9), sharey=True)
fig.suptitle('Categorical Plots')

# Flatten the 2x3 grid so panels are filled row by row.
flat_axes = axes.ravel()
for ax, column in zip(flat_axes, categorical_columns):
    sns.barplot(ax=ax, x=column, y='Selling_Price', data=df)
    ax.set_title(column.upper())

# The grid has six panels but only four columns are plotted; hide the
# leftover axes so they don't render as empty frames.
for unused_ax in flat_axes[len(categorical_columns):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()

In [6]:
# Count the missing entries in every column.
df.isna().sum()

In [7]:
# Summary statistics for the columns that describe() includes by default.
df.describe()

### Handling Year Column
Use the `Year` column to create a new column named `No of Years`: the age of each car in years, measured relative to 2021.

In [8]:
# Column labels of the raw frame, used to pick the modelling columns below.
df.columns

In [9]:
# The car-name column is left out: it would have too many distinct values
# and is not needed for price prediction.
model_columns = ['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
                 'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']
# .copy() makes new_df an independent frame, so adding or dropping columns
# on it later does not raise pandas' SettingWithCopyWarning.
new_df = df[model_columns].copy()

In [10]:
# Reference year against which each car's age is computed.
CURRENT_YEAR = 2021
# assign() builds a new frame rather than writing a column into new_df in
# place; this keeps the cell idempotent and avoids SettingWithCopyWarning
# when new_df originated from a selection of another frame.
new_df = new_df.assign(Current_year=CURRENT_YEAR)
new_df.head()

In [11]:
# Derive the car's age ("No of Years") from the two year columns, then drop
# those helper columns. Done as one chained expression that returns a new
# frame, instead of an in-place column write followed by drop(inplace=True).
# NOTE: this cell consumes 'Year' and 'Current_year', so it can only be run
# once per kernel session without re-running the cells above.
new_df = (
    new_df
    .assign(**{"No of Years": new_df['Current_year'] - new_df['Year']})
    .drop(columns=['Current_year', 'Year'])
)
new_df.head()

### Handling Categorical Data

In [12]:
# One-hot encode the categorical columns. drop_first=True drops the first
# level of each encoded column, leaving k-1 indicator columns for k levels.
# Note: this rebinds new_df to the encoded frame.
new_df = pd.get_dummies(new_df,drop_first=True)
new_df.head()

In [13]:
# Column labels after encoding, including the generated indicator columns.
new_df.columns

In [14]:
# Pairwise relationship plots across the columns of the encoded frame.
sns.pairplot(new_df)

### Correlation

In [15]:
# Pairwise correlation matrix of the encoded frame's columns.
new_df.corr()

In [16]:
# Draw the correlation matrix as an annotated heatmap on a 16x9 figure.
correlation_matrix = new_df.corr()
plt.figure(figsize=(16, 9))
sns.heatmap(correlation_matrix, annot=True)

In [17]:
# Separate the prediction target from the explanatory columns.
target_column = 'Selling_Price'
y = new_df[target_column]
X = new_df.drop(columns=[target_column])

In [18]:
# Dtypes and non-null counts of the feature matrix before modelling.
X.info()

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [32]:
# Report the shape of every train/test piece, framed by dashed rules.
divider = "---------------------------------------"
labelled_splits = (
    ("Shape of X_train: ", X_train),
    ("Shape of X_test: ", X_test),
    ("Shape of y_train: ", y_train),
    ("Shape of y_test: ", y_test),
)
print(divider)
for label, split in labelled_splits:
    print(label, split.shape)
    print(divider)

In [33]:
from xgboost import XGBRegressor
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV  # used by the hyperparameter search cell further down

# Baseline model: an XGBRegressor constructed with no arguments (library
# defaults), fitted on the training split.
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

In [34]:
# Evaluate the baseline model on the held-out split.
pred_xgb = xgb.predict(X_test)

baseline_mae = metrics.mean_absolute_error(y_test, pred_xgb)
baseline_mse = metrics.mean_squared_error(y_test, pred_xgb)

print('MAE:', baseline_mae)
print('MSE:', baseline_mse)
# RMSE is the square root of the MSE computed above.
print('RMSE:', np.sqrt(baseline_mse))

# Hyperparameter tuning

1. max_depth (int) – Maximum tree depth for base learners.
2. learning_rate (float) – Boosting learning rate (xgb’s “eta”)
3. n_estimators (int) – Number of boosted trees to fit.
4. gamma (float) – Minimum loss reduction required to make a further partition on a leaf node of the tree.
5. min_child_weight (int) – Minimum sum of instance weight (hessian) needed in a child.
6. subsample (float) – Subsample ratio of the training instance.
7. colsample_bytree (float) – Subsample ratio of columns when constructing each tree.
8. objective (string or callable) – Specify the learning task and the corresponding learning objective, or a custom objective function to be used (see the XGBoost documentation for the custom-objective signature).
9. nthread (int) – Number of parallel threads used to run xgboost. (Deprecated, please use n_jobs)
10. scale_pos_weight (float) – Balancing of positive and negative weights.

In [23]:
# Candidate values for each hyperparameter; the randomized search samples
# combinations from these lists rather than trying every one.
param_tuning = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7],
    'colsample_bytree': [0.5, 0.7],
    'n_estimators': [100, 200, 500],
    'objective': ['reg:squarederror'],
}

xgb_model = XGBRegressor()

# 5-fold cross-validated randomized search. random_state pins which
# parameter combinations are sampled, so best_params_ and the metrics
# computed from this search are reproducible across runs (42 matches the
# seed used for the train/test split). No scoring is passed, so candidates
# are ranked by the estimator's own default score.
r_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_tuning,
    cv=5,
    n_jobs=1,
    verbose=2,
    random_state=42,
)

r_search.fit(X_train, y_train)

In [24]:
# Hyperparameter combination that scored best in the cross-validated search.
r_search.best_params_

In [35]:
# Evaluate the tuned model (the search object predicts with its best
# refitted estimator) on the same held-out split as the baseline.
new_preds = r_search.predict(X_test)

tuned_mae = metrics.mean_absolute_error(y_test, new_preds)
tuned_mse = metrics.mean_squared_error(y_test, new_preds)

print('MAE:', tuned_mae)
print('MSE:', tuned_mse)
# RMSE is the square root of the MSE computed above.
print('RMSE:', np.sqrt(tuned_mse))