In [3]:
import numpy as np
import pandas as pd

# Load the petrol-consumption dataset from the working directory, then print
# a schema summary (column names, non-null counts, dtypes, memory usage).
data = pd.read_csv(filepath_or_buffer="./petrol_consumption.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Petrol_tax                    48 non-null     float64
 1   Average_income                48 non-null     int64  
 2   Paved_Highways                48 non-null     int64  
 3   Population_Driver_licence(%)  48 non-null     float64
 4   Petrol_Consumption            48 non-null     int64  
dtypes: float64(2), int64(3)
memory usage: 2.0 KB


In [4]:
# Number of missing entries in each column (summed down the rows).
data.isna().sum(axis=0)

Petrol_tax                      0
Average_income                  0
Paved_Highways                  0
Population_Driver_licence(%)    0
Petrol_Consumption              0
dtype: int64

In [5]:
# Feature matrix: every column except the regression target.
X = data.drop(columns=['Petrol_Consumption'])
# Target vector: the column the model is asked to predict.
y = data['Petrol_Consumption']
# Preview the first five rows of both to confirm the split looks right.
(X.head(5), y.head(5))

(   Petrol_tax  Average_income  Paved_Highways  Population_Driver_licence(%)
 0         9.0            3571            1976                         0.525
 1         9.0            4092            1250                         0.572
 2         9.0            3865            1586                         0.580
 3         7.5            4870            2351                         0.529
 4         8.0            4399             431                         0.544,
 0    541
 1    524
 2    561
 3    414
 4    410
 Name: Petrol_Consumption, dtype: int64)

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split  # for splitting the data
from sklearn.metrics import mean_squared_error  # for calculating the cost function
from sklearn.tree import DecisionTreeRegressor  # for creating the model

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=59,
    test_size=0.2,
)
# Show the shape of each partition as a sanity check on the split sizes.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((38, 4), (10, 4), (38,), (10,))

In [7]:
# Build a decision-tree regressor with a fixed seed and train it in one step
# (`fit` returns the fitted estimator itself).
regressor = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
# Display the full hyper-parameter set the tree was trained with; everything
# other than `random_state` is left at the library default.
regressor.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 0,
 'splitter': 'best'}

In [8]:
# Evaluate on the held-out rows: root-mean-squared error between the true and
# predicted consumption, kept to three decimal places for display.
y_pred = regressor.predict(X_test)
rmse = float(f'{np.sqrt(mean_squared_error(y_test, y_pred)):.3f}')
print(f'RMSE: {rmse}')

RMSE: 58.072


In [9]:
# Visualizing the decision tree: write the fitted tree to a Graphviz .dot file
# that can be rendered separately (e.g. with the `dot` command-line tool).
from sklearn.tree import export_graphviz

# Pass the feature names as a plain list of strings rather than a pandas
# Index: a list is accepted by every scikit-learn release, whereas other
# array-likes are not guaranteed to pass argument validation everywhere.
export_graphviz(regressor, out_file='RegressionTree.dot', feature_names=list(X.columns))

-----------------------分割線-------------------------

In [10]:
import pickle

# Re-evaluate the fitted tree on the held-out rows: RMSE plus the estimator's
# own `score` on the training and the test partitions.
y_pred = regressor.predict(X_test)
rmse = float(format(np.sqrt(mean_squared_error(y_test, y_pred)), '.3f'))
train_score = regressor.score(X_train, y_train)
test_score = regressor.score(X_test, y_test)

# A training score far above the test score indicates the tree is overfitting.
print(f'RMSE: {rmse}\t train score: {train_score}\t test score: {test_score}')

# Save the model. The context manager guarantees the file is flushed and
# closed before any later cell tries to read it back.
with open("petrol_consumption_DT_model.pkl", "wb") as model_file:
    pickle.dump(regressor, model_file)

RMSE: 58.072	 train score: 1.0	 test score: 0.21695335574833285


In [11]:
# Load the model back from disk. The context manager closes the file handle
# as soon as unpickling finishes.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced by a trusted source (here, this notebook itself).
with open("petrol_consumption_DT_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Test the loaded model: its score on the held-out rows should match the
# score of the in-memory model it was saved from.
result = loaded_model.score(X_test, y_test)
print(result)

0.21695335574833285
