# Fuel Consumption Prediction

In [1]:
import pandas as pd

In [2]:
# One-off conversion: read the raw whitespace-delimited "auto-mpg.data" file
# and write it back out as a regular comma-separated file.

# The raw file has no header row, so column labels are supplied explicitly.
column_names = [
    'mpg', 'cylinders', 'displacement', 'horsepower',
    'weight', 'acceleration', 'model_year', 'origin', 'car_name'
]

# Fields are separated by runs of irregular whitespace. sep=r"\s+" is the
# documented replacement for the deprecated delim_whitespace=True flag.
# '?' entries in the file are read as NaN.
df = pd.read_csv("auto-mpg.data", sep=r"\s+", names=column_names, na_values='?')

# Save as CSV (index=False keeps the row index out of the output file)
df.to_csv("auto_mpg.csv", index=False)

print("File converted successfully to CSV.")


File converted successfully to CSV.


In [18]:
# 1. Import libraries

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [19]:
# 2. Load the dataset

In [5]:
# Column labels, in file order; handed to pd.read_csv through names= because
# the raw data file carries no header row.
column_names = [
    'mpg',
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model_year',
    'origin',
    'car_name',
]

In [6]:
# Load the raw whitespace-delimited file. sep=r"\s+" replaces the deprecated
# delim_whitespace=True flag with identical parsing; '?' entries become NaN.
df = pd.read_csv(
    "auto-mpg.data",
    sep=r"\s+",
    names=column_names,
    na_values='?',
)

In [30]:
# Report the (rows, columns) dimensions of the loaded frame.
print(f"Shape of dataset: {df.shape}")

Shape of dataset: (398, 8)


In [31]:
# Preview the first five records under a heading (one print call, newline-separated).
print("\nFirst 5 rows:", df.head(5), sep="\n")


First 5 rows:
    mpg  cylinders  displacement  horsepower  weight  acceleration  \
0  18.0          8         307.0       130.0  3504.0          12.0   
1  15.0          8         350.0       165.0  3693.0          11.5   
2  18.0          8         318.0       150.0  3436.0          11.0   
3  16.0          8         304.0       150.0  3433.0          12.0   
4  17.0          8         302.0       140.0  3449.0          10.5   

   model_year  origin  
0          70       1  
1          70       1  
2          70       1  
3          70       1  
4          70       1  


In [32]:
# DataFrame.info() writes its report directly to stdout and returns None,
# so it must not be wrapped in print() -- doing so appends a stray "None"
# line after the report.
print("\nDataset Info:")
df.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 25.0 KB
None


In [33]:
# Count the NaN entries per column and show them under a heading.
print("\nMissing values in each column:", df.isna().sum(), sep="\n")


Missing values in each column:
mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64


In [34]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the numeric columns.
print(f"\nSummary statistics:\n{df.describe()}")


Summary statistics:
              mpg   cylinders  displacement  horsepower       weight  \
count  398.000000  398.000000    398.000000  392.000000   398.000000   
mean    23.514573    5.454774    193.425879  104.469388  2970.424623   
std      7.815984    1.701004    104.269838   38.491160   846.841774   
min      9.000000    3.000000     68.000000   46.000000  1613.000000   
25%     17.500000    4.000000    104.250000   75.000000  2223.750000   
50%     23.000000    4.000000    148.500000   93.500000  2803.500000   
75%     29.000000    8.000000    262.000000  126.000000  3608.000000   
max     46.600000    8.000000    455.000000  230.000000  5140.000000   

       acceleration  model_year      origin  
count    398.000000  398.000000  398.000000  
mean      15.568090   76.010050    1.572864  
std        2.757689    3.697627    0.802055  
min        8.000000   70.000000    1.000000  
25%       13.825000   73.000000    1.000000  
50%       15.500000   76.000000    1.000000  
75%     

In [20]:
# 3. Drop rows with missing target

In [7]:
# Keep only the rows that have a value for the prediction target 'mpg';
# missing values in other columns are left in place.
df = df.dropna(axis=0, how='any', subset=['mpg'])

In [21]:
# 4. Drop 'car_name'

In [8]:
# Remove the free-text 'car_name' column; it is not used as a model feature.
# errors='ignore' makes this cell safe to re-run: without it, a second
# execution raises KeyError because the column is already gone.
df = df.drop(columns=['car_name'], errors='ignore')

In [22]:
# 5. Separate features and target

In [9]:
# Target vector: the 'mpg' column.
y = df['mpg']
# Feature matrix: every remaining column.
X = df.drop(columns=['mpg'])

In [23]:
# 6. Identify categorical & numerical columns

In [10]:
# 'origin' is handled as a categorical feature; all other feature columns
# are handled as numeric, keeping their order in X.
categorical_cols = ['origin']
numerical_cols = [col for col in X.columns if col != 'origin']

In [24]:
# 7. Preprocessing pipeline

In [11]:
# Numeric branch: fill missing values with the column mean.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical branch: one-hot encode; handle_unknown='ignore' keeps
# transform from raising on categories that were absent during fit.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [25]:
# 8. Build full pipeline with model

In [12]:
# End-to-end estimator: preprocessing followed by ordinary least-squares
# regression, so fit/predict apply both stages in one call.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [26]:
# 9. Train-test split

In [13]:
# Hold out part of the data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # 20% of the rows go to the test set
    random_state=42,  # fixed seed so the split is reproducible
)

In [27]:
# 10. Fit the model

In [14]:
# Fit the preprocessing steps and the regressor on the training split only.
model_pipeline.fit(X=X_train, y=y_train)

In [28]:
# 11. Predict

In [15]:
# Predict mpg for the held-out rows using the fitted pipeline.
y_pred = model_pipeline.predict(X=X_test)

In [29]:
# 12. Evaluate the model

In [16]:
# Score the held-out predictions against the true mpg values.
# R^2: coefficient of determination.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
# RMSE: square root of the mean squared error, in the same units as the target.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

In [17]:
# Show both metrics rounded to two decimals, one per line.
print(
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R-squared (R²) Score: {r2:.2f}",
    sep="\n",
)

Root Mean Squared Error (RMSE): 2.89
R-squared (R²) Score: 0.84
