In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("auto-mpg.csv")

In [3]:
data.sample(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
20,25.0,4,110.0,87,2672,17.5,70,2,peugeot 504
187,17.5,8,305.0,140,4215,13.0,76,1,chevrolet chevelle malibu classic
364,26.6,8,350.0,105,3725,19.0,81,1,oldsmobile cutlass ls
55,27.0,4,97.0,60,1834,19.0,71,2,volkswagen model 111
110,22.0,4,108.0,94,2379,16.5,73,3,datsun 610


In [4]:
data.drop("car name",axis=1,inplace=True)

In [5]:
data.tail(2)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
396,28.0,4,120.0,79,2625,18.6,82,1
397,31.0,4,119.0,82,2720,19.4,82,1


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
dtypes: float64(3), int64(4), object(1)
memory usage: 25.0+ KB


In [7]:
data[data["horsepower"]=="?"]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
32,25.0,4,98.0,?,2046,19.0,71,1
126,21.0,6,200.0,?,2875,17.0,74,1
330,40.9,4,85.0,?,1835,17.3,80,2
336,23.6,4,140.0,?,2905,14.3,80,1
354,34.5,4,100.0,?,2320,15.8,81,2
374,23.0,4,151.0,?,3035,20.5,82,1


In [8]:
data = data.replace("?",np.nan)

In [9]:
data["horsepower"] = data["horsepower"].fillna(data["horsepower"].median())

In [10]:
data["horsepower"] = data.horsepower.astype("int64")

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    int64  
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
dtypes: float64(3), int64(5)
memory usage: 25.0 KB


In [12]:
data["origin"] = data["origin"].replace({1:"america",2:"europe",3:"asia"})

In [13]:
data = pd.get_dummies(data,"origin")

In [14]:
data.sample(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
232,16.0,8,351.0,149,4335,14.5,77,1,0,0
89,15.0,8,318.0,150,3777,12.5,73,1,0,0
198,33.0,4,91.0,53,1795,17.4,76,0,1,0
310,38.1,4,89.0,60,1968,18.8,80,0,1,0
241,22.0,6,146.0,97,2815,14.5,77,0,1,0


In [15]:
X = data.drop("mpg",axis=1)
y = data["mpg"]

In [16]:
X.sample(2)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
244,4,90.0,48,1985,21.5,78,0,0,1
132,4,140.0,75,2542,17.0,74,1,0,0


In [17]:
y.sample(2)

157    15.0
333    32.7
Name: mpg, dtype: float64

In [19]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=99)

In [55]:
Random_regressor = RandomForestRegressor(n_estimators=7,oob_score=True)

In [56]:
Random_regressor.fit(X_train,y_train)

  warn("Some inputs do not have OOB scores. "


RandomForestRegressor(n_estimators=7, oob_score=True)

In [57]:
pred = Random_regressor.predict(X_test)

In [58]:
pred

array([22.95714286, 28.21428571, 21.34285714, 14.71428571, 25.42857143,
       22.        , 15.57142857, 16.87142857, 19.71428571, 26.84285714,
       18.75714286, 15.42857143, 15.91428571, 29.22857143, 33.62857143,
       29.05714286, 33.14285714, 37.2       , 16.35714286, 29.71428571,
       12.71428571, 19.95714286, 36.51428571, 19.62857143, 29.64285714,
       15.72857143, 17.51428571, 37.47142857, 34.35714286, 17.42857143,
       34.07142857, 19.42857143, 14.42857143, 24.37142857, 25.72857143,
       36.3       , 19.        , 26.28571429, 21.58571429, 14.14285714,
       33.3       , 12.42857143, 35.34285714, 25.42857143, 35.18571429,
       19.55714286, 21.58571429, 31.85714286, 18.        , 34.34285714,
       34.98571429, 21.85714286, 16.28571429, 18.81428571, 23.21428571,
       14.78571429, 23.95714286, 36.85714286, 40.55714286, 28.14285714,
       31.61428571, 34.05714286, 31.01428571, 25.28571429, 20.85714286,
       12.71428571, 28.94285714, 26.78571429, 19.57142857, 31.85

In [59]:
from sklearn.metrics import accuracy_score,r2_score
r2_score(y_test,pred)

0.834398522764083