## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [2]:
dataset=pd.read_csv("MPG.csv")
dataset

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,mpg
0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu,18.0
1,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320,15.0
2,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite,18.0
3,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst,16.0
4,8,302.0,140.0,3449,10.5,70,usa,ford torino,17.0
...,...,...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl,27.0
394,4,97.0,52.0,2130,24.6,82,europe,vw pickup,44.0
395,4,135.0,84.0,2295,11.6,82,usa,dodge rampage,32.0
396,4,120.0,79.0,2625,18.6,82,usa,ford ranger,28.0


## Creating independent and dependent variables

In [3]:
X=dataset.iloc[:,:-1].values
y=dataset.iloc[:,-1].values



In [4]:
X

array([[8, 307.0, 130.0, ..., 70, 'usa', 'chevrolet chevelle malibu'],
       [8, 350.0, 165.0, ..., 70, 'usa', 'buick skylark 320'],
       [8, 318.0, 150.0, ..., 70, 'usa', 'plymouth satellite'],
       ...,
       [4, 135.0, 84.0, ..., 82, 'usa', 'dodge rampage'],
       [4, 120.0, 79.0, ..., 82, 'usa', 'ford ranger'],
       [4, 119.0, 82.0, ..., 82, 'usa', 'chevy s-10']], dtype=object)

## Taking care of missing values if present

In [5]:
dataset.isnull().sum()

cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
name            0
mpg             0
dtype: int64

## Encoding the categorical varible

In [6]:
from sklearn.preprocessing import OrdinalEncoder
encoder=OrdinalEncoder()
X[:,6:8]=encoder.fit_transform(X[:,6:8])

## Dealing with missing values present in horsepower feature

In [7]:
from sklearn.impute import SimpleImputer
impute=SimpleImputer(missing_values=np.nan,strategy="mean")
impute.fit(X[:,2:3])
X[:,2:3]=impute.transform(X[:,2:3])

In [8]:
dataset.isnull().sum()

cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
name            0
mpg             0
dtype: int64

In [9]:
X


array([[8, 307.0, 130.0, ..., 70, 2.0, 49.0],
       [8, 350.0, 165.0, ..., 70, 2.0, 36.0],
       [8, 318.0, 150.0, ..., 70, 2.0, 231.0],
       ...,
       [4, 135.0, 84.0, ..., 82, 2.0, 119.0],
       [4, 120.0, 79.0, ..., 82, 2.0, 159.0],
       [4, 119.0, 82.0, ..., 82, 2.0, 69.0]], dtype=object)

## Splitting the dataset into training set and testing set

In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Training the model on the training set

In [11]:
from sklearn.ensemble import RandomForestRegressor
regressor=RandomForestRegressor(n_estimators=25)
regressor.fit(X_train,y_train)

## Predicting the test set results

In [12]:
y_pred=regressor.predict((X_test))
np.set_printoptions(precision=2)
print(np.concatenate((y_test.reshape(len(y_test),1),y_pred.reshape(len(y_pred),1)),1))


[[14.   14.1 ]
 [25.   24.34]
 [13.   14.38]
 [21.   21.89]
 [18.   17.64]
 [35.   29.68]
 [34.1  33.72]
 [20.   21.68]
 [15.   14.82]
 [23.5  26.4 ]
 [40.9  34.42]
 [37.2  37.17]
 [18.   19.86]
 [23.   25.54]
 [15.5  16.6 ]
 [35.7  32.7 ]
 [31.   27.57]
 [27.   29.18]
 [18.   17.41]
 [37.3  36.45]
 [15.5  15.86]
 [23.   23.54]
 [24.   24.15]
 [18.   20.45]
 [34.5  32.9 ]
 [25.4  29.57]
 [36.1  35.27]
 [34.   30.18]
 [30.   29.98]
 [16.   15.98]
 [18.6  18.72]
 [37.   32.39]
 [15.   20.2 ]
 [33.5  33.38]
 [22.4  21.65]
 [24.   24.62]
 [19.   18.5 ]
 [16.9  16.22]
 [31.9  33.84]
 [12.   11.96]
 [14.   13.4 ]
 [15.   15.69]
 [27.4  26.63]
 [23.7  33.88]
 [32.   29.72]
 [24.   21.74]
 [22.5  19.18]
 [16.5  14.2 ]
 [21.   20.45]
 [26.   30.52]
 [41.5  33.58]
 [29.   25.9 ]
 [15.   16.3 ]
 [30.   27.14]
 [15.   15.85]
 [10.   11.96]
 [18.   19.49]
 [20.2  23.26]
 [36.   30.37]
 [17.   16.12]
 [18.   19.98]
 [26.5  25.27]
 [18.   19.58]
 [22.   21.4 ]
 [12.   12.84]
 [14.   14.32]
 [15.   13

## Evaluating the performance of the model using R-Squared

In [13]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.8869494207726052