In [1]:
import pandas as pd


filepath = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

data = pd.read_fwf(filepath, header=None);

data = data.drop(8, axis=1)

data = data.rename(
  columns={0: 'mpg', 1: 'cylinders', 2: 'displacement', 3: 'hp', 4: 'weight', 5: 'acc', 6: 'year', 7: 'origin'}
)

data

Unnamed: 0,mpg,cylinders,displacement,hp,weight,acc,year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.00,2790.0,15.6,82,1
394,44.0,4,97.0,52.00,2130.0,24.6,82,2
395,32.0,4,135.0,84.00,2295.0,11.6,82,1
396,28.0,4,120.0,79.00,2625.0,18.6,82,1


In [2]:
## data cleansing

import numpy as np

data = data[data.hp != '?']

data['hp'] = data['hp'].astype(float)

data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['hp'] = data['hp'].astype(float)


Unnamed: 0,mpg,cylinders,displacement,hp,weight,acc,year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1
394,44.0,4,97.0,52.0,2130.0,24.6,82,2
395,32.0,4,135.0,84.0,2295.0,11.6,82,1
396,28.0,4,120.0,79.0,2625.0,18.6,82,1


In [3]:
##Split training set and test set

from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)



In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define features and labels
X = train_set.drop('mpg', axis=1)
y = train_set['mpg']

# Identify numerical and categorical columns
num_features = ['cylinders', 'displacement', 'hp', 'weight', 'acc', 'year']
cat_features = ['origin']

# Preprocessing for numerical data (scaling) and categorical data (one-hot encoding)
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(), cat_features)
])

# Combine the preprocessor into a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

# Prepare the training data
X_train_prepared = pipeline.fit_transform(X)


X_test = test_set.drop('mpg', axis=1)
y_test = test_set['mpg']
X_test_prepared = pipeline.transform(X_test)

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

# Train the model
model.fit(X_train_prepared, y)

# The model is now trained and ready for evaluation.

In [6]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Make predictions on the test set
y_pred = model.predict(X_test_prepared)

# Evaluate the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"r^2 score: {r2}" )

Root Mean Squared Error (RMSE): 2.373337814081012
r^2 score: 0.8896420843532333


In [3]:
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1
