In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the raw dataset from the CSV file next to the notebook.
data = pd.read_csv(filepath_or_buffer="auto-mpg.csv")

In [3]:
# Peek at the first two rows to see the column layout.
data.head(n=2)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320


In [4]:
# Summarize column dtypes and non-null counts of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [5]:
# Remove the free-text "car name" column from the frame.
data = data.drop(columns=["car name"])

In [6]:
# Show three randomly chosen rows (output changes on every run).
data.sample(n=3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
165,20.0,8,262.0,110,3221,13.5,75,1
44,13.0,8,400.0,175,5140,12.0,71,1
355,33.7,4,107.0,75,2210,14.4,81,3


In [7]:
# Look up the storage dtype of the origin column.
data.dtypes["origin"]


dtype('int64')

In [8]:
# Count how many rows carry each origin code.
data["origin"].value_counts()

1    249
3     79
2     70
Name: origin, dtype: int64

In [9]:
# Swap the numeric origin codes for readable region labels. Values that are
# not keys of the mapping are left untouched by replace().
data["origin"] = data["origin"].replace(
    to_replace={1: "america", 2: "europe", 3: "asia"}
)

In [10]:
# Spot-check one random row after relabeling origin.
data.sample(n=1)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
395,32.0,4,135.0,84,2295,11.6,82,america


In [11]:
# List the rows whose horsepower is recorded as the placeholder string "?".
data.loc[data["horsepower"].eq("?")]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
32,25.0,4,98.0,?,2046,19.0,71,america
126,21.0,6,200.0,?,2875,17.0,74,america
330,40.9,4,85.0,?,1835,17.3,80,europe
336,23.6,4,140.0,?,2905,14.3,80,america
354,34.5,4,100.0,?,2320,15.8,81,europe
374,23.0,4,151.0,?,3035,20.5,82,america


In [12]:
# Turn every "?" placeholder in the frame into a proper missing value (NaN).
data = data.replace(to_replace="?", value=np.nan)

In [13]:
# Impute missing horsepower with the column median.
# The column is read as object dtype (numeric strings, with "?" marking the
# missing entries), so it is converted to a numeric dtype first: taking the
# median of string data only worked through implicit coercion in old pandas
# and fails with TypeError on newer versions. errors="coerce" maps any
# non-numeric entry to NaN, so this cell gives the same result whether or not
# the "?" placeholders were already replaced, and it is safe to re-run.
horsepower_numeric = pd.to_numeric(data["horsepower"], errors="coerce")
data["horsepower"] = horsepower_numeric.fillna(horsepower_numeric.median())

In [14]:
# Per-column medians. numeric_only=True is passed explicitly because the frame
# still contains the string-valued origin column; relying on the implicit
# default emits a deprecation warning on older pandas and raises on newer ones.
data.median(numeric_only=True)

  data.median()


mpg               23.0
cylinders          4.0
displacement     148.5
horsepower        93.5
weight          2803.5
acceleration      15.5
model year        76.0
dtype: float64

In [15]:
# Store horsepower as 64-bit integers.
data["horsepower"] = data["horsepower"].astype("int64")

In [16]:
# Re-check column dtypes and non-null counts after the cleaning steps.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    int64  
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(3), int64(4), object(1)
memory usage: 25.0+ KB


In [17]:
# Number of rows that exactly repeat an earlier row.
data.duplicated(keep="first").sum()

0

In [18]:
# One-hot encode the origin column into origin_<label> indicator columns.
# The column is named explicitly via `columns=`: the second positional
# argument of get_dummies is `prefix`, so passing "origin" there encodes every
# object/category column in the frame and only happens to hit origin alone.
# dtype=int keeps the indicators as 0/1 integers regardless of the pandas
# version's default dummy dtype.
data = pd.get_dummies(data, columns=["origin"], dtype=int)

In [19]:
# Spot-check one random row of the encoded frame.
data.sample(n=1)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
165,20.0,8,262.0,110,3221,13.5,75,1,0,0


In [20]:
# Separate the target (mpg) from the predictor columns.
y = data["mpg"]
X = data.drop(columns=["mpg"])

In [21]:
# Show one random row of the feature matrix.
X.sample(n=1)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
195,4,85.0,52,2035,22.2,76,1,0,0


In [22]:
# Show one random value of the target series.
y.sample(n=1)

219    25.5
Name: mpg, dtype: float64

In [23]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [24]:
# First two rows of the training features.
X_train.head(n=2)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
230,8,350.0,170,4165,11.4,77,1,0,0
357,4,119.0,100,2615,14.8,81,0,1,0


In [25]:
# Report the row counts of the full frame and of each split.
print("len(data) : ", data.shape[0])
print("len(X_train) : ", X_train.shape[0])
print("len(X_test) : ", X_test.shape[0])

len(data) :  398
len(X_train) :  278
len(X_test) :  120


In [26]:
# Decision tree regressor with a fixed seed. Without random_state the tree can
# differ between runs (candidate features are shuffled at each split and ties
# are broken randomly), which makes the score and predictions below
# irreproducible. The seed matches the one used for the train/test split.
regressor = DecisionTreeRegressor(random_state=0)

In [27]:
# Train the tree on the training split.
regressor.fit(X=X_train, y=y_train)

DecisionTreeRegressor()

In [28]:
# Predict mpg for every row of the held-out test features.
pred = regressor.predict(X=X_test)

In [29]:
# Display the predicted mpg values for the test split.
pred

array([14. , 28. , 16. , 22. , 18. , 31. , 32. , 22. , 16. , 22. , 34.5,
       39.4, 15. , 26. , 16. , 33.5, 27. , 26.6, 16. , 38. , 16. , 23.8,
       28. , 23. , 32.1, 24.2, 29.5, 31. , 33.7, 16. , 19. , 32.2, 15. ,
       32. , 17.6, 25.1, 19.4, 15. , 33. , 12. , 15. , 17. , 28.4, 33.5,
       32.9, 20. , 19.4, 15. , 20. , 31. , 36. , 25. , 16.5, 26. , 14. ,
        9. , 18. , 30.7, 32. , 16. , 23.9, 27.5, 19. , 18. , 12. , 16. ,
       16. , 15. , 23.8, 15. , 36. , 12. , 25. , 18. , 24. , 30.5, 33.5,
       31.5, 30.5, 15. , 14. , 26. , 34.5, 26. , 29.8, 29.5, 31. , 19.9,
       28.1, 34.4, 26. , 10. , 20.3, 33. , 32.7, 20.2, 19.2, 26. , 19.8,
       13. , 19.8, 43.1, 26.4, 24. , 40.8, 24. , 24.5, 15. , 13. , 25. ,
       25. , 29. , 20.2, 40.8, 28. , 25.1, 36. , 12. , 15. , 39.4])

In [30]:
# Coefficient of determination (R^2) of the predictions on the held-out test
# split. r2_score is imported in the notebook's top import cell rather than
# here, so all dependencies are visible in one place.
r2_score(y_true=y_test, y_pred=pred)

0.8193402209252856

In [31]:
# Pull one random row as a reminder of the column order and value ranges.
data.sample(n=1)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
389,22.0,6,232.0,112,2835,14.7,82,1,0,0


In [32]:
# Predict mpg for one hand-built example:
#   cylinders=9, displacement=234, horsepower=136, weight=2324,
#   acceleration=23.6, model year=45, origin_america=0, origin_asia=1,
#   origin_europe=0
# The example is passed as a one-row DataFrame with named columns, reordered
# to match the training features, so the values cannot silently land in the
# wrong columns and the estimator sees the same feature names it was fitted
# with. A missing or misspelled column fails loudly with a KeyError.
# NOTE(review): cylinders=9 and model year=45 look outside the values shown in
# the data previews above -- confirm this is the intended example.
example = pd.DataFrame(
    [
        {
            "cylinders": 9,
            "displacement": 234,
            "horsepower": 136,
            "weight": 2324,
            "acceleration": 23.6,
            "model year": 45,
            "origin_america": 0,
            "origin_asia": 1,
            "origin_europe": 0,
        }
    ]
)[X_train.columns]
regressor.predict(example)

array([16.2])