# Decision Tree Regression

## Data Preprocessing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
from google.colab import drive

# Mount Google Drive inside the Colab filesystem; the dataset path used by the
# loading cell below lives under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read cardekho_data.csv from the mounted Google Drive into a DataFrame and
# preview the first rows to confirm the columns parsed as expected.
DATASET_PATH = '/content/drive/My Drive/KaggleDatasets/cardekho_data.csv'
dataset = pd.read_csv(DATASET_PATH)
dataset.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


### Checking for duplicates

In [4]:
# Flag every row that repeats an earlier row across all columns, then count
# the flagged rows.
duplicate_mask = dataset.duplicated()
duplicate_mask.sum()

2

In [5]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# Rebinding the name (instead of `inplace=True`) avoids mutating the frame
# object that earlier cells displayed, and `ignore_index=True` renumbers the
# remaining rows 0..n-1 so the dropped rows leave no gaps in the index.
dataset = dataset.drop_duplicates(ignore_index=True)

### Splitting the Dataset into Features and Target Variable

In [6]:
# Select columns by name rather than by position, so a change in the CSV's
# column order raises a KeyError instead of silently feeding the wrong
# columns to the model.
TARGET_COLUMN = 'Selling_Price'
FEATURE_COLUMNS = [
    'Car_Name', 'Year', 'Present_Price', 'Kms_Driven',
    'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner',
]

# The feature order is significant: the encoding cells below address X by
# position (0 = Car_Name, 4 = Fuel_Type, 5 = Seller_Type, 6 = Transmission).
X = dataset[FEATURE_COLUMNS].values
y = dataset[TARGET_COLUMN].values

### Encoding categorical data

In [7]:
from sklearn.preprocessing import LabelEncoder

# X is a NumPy array here (it was built with `.values`), not a DataFrame, so
# the categorical features are addressed by position. Going by the column
# selection above and the dataset preview, these positions are
# Car_Name, Fuel_Type, Seller_Type and Transmission.
categorical_columns = [0, 4, 5, 6]
label_encoder = LabelEncoder()

# Overwrite each categorical column in place with its integer codes. The same
# encoder object is refitted on every pass, so once the loop ends it only
# holds the classes of the last column processed.
for column in categorical_columns:
    encoded_values = label_encoder.fit_transform(X[:, column])
    X[:, column] = encoded_values

In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Positions in X of the categorical features to expand into indicator
# columns (the same positions that were label-encoded in the previous cell).
column_indices = [0, 4, 5, 6]

# One-hot encode the listed columns; every other column of X is passed
# through untouched.
one_hot_step = ("one_hot", OneHotEncoder(), column_indices)
ct = ColumnTransformer(transformers=[one_hot_step], remainder='passthrough')

# NOTE(review): the transformer is fitted on all of X, i.e. before the
# train/test split that happens later in the notebook.
X_transformed = ct.fit_transform(X)


## Splitting the dataset into the Training set and Test set

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The fixed random_state pins the
# shuffle so the same rows land in each partition on every run.
partitions = train_test_split(
    X_transformed,
    y,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, y_train, y_test = partitions

## Fitting the Decision Tree Regression model to the training set

In [10]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree regressor with library-default hyper-parameters; only the
# random_state is pinned. It is trained on the training partition only.
tree_params = {'random_state': 0}
regressor = DecisionTreeRegressor(**tree_params)
regressor.fit(X_train, y_train)

## Model testing and evaluation

In [11]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predict on the held-out test rows; y_test holds their actual target values.
y_pred = regressor.predict(X_test)

# R^2 plus three error measures; RMSE is derived from MSE so both are
# reported from the same computation.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Print one "label: value" line per metric, in a fixed order.
metric_report = (
    ("R^2 Score", r2),
    ("Mean Absolute Error (MAE)", mae),
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value}")

R^2 Score: 0.9575895149308611
Mean Absolute Error (MAE): 0.5511666666666666
Mean Squared Error (MSE): 0.7418383333333334
Root Mean Squared Error (RMSE): 0.8613003734663844


In [12]:
# Put the actual and predicted target values side by side and preview the
# first rows for a quick visual sanity check.
comparison_columns = {'Real_Values': y_test, 'Predicted_Values': y_pred}
df_pred = pd.DataFrame(comparison_columns)
df_pred.head()

Unnamed: 0,Real_Values,Predicted_Values
0,7.9,7.75
1,0.2,0.12
2,7.5,6.7
3,4.5,3.75
4,2.0,2.7
