### Q1. Pandas version

In [27]:
import pandas as pd
import numpy as np
print(pd.__version__)

2.3.1


In [28]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-09-22 15:21:06--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv.1’


2025-09-22 15:21:06 (102 MB/s) - ‘car_fuel_efficiency.csv.1’ saved [874188/874188]



In [29]:

data = pd.read_csv("car_fuel_efficiency.csv")
df = data
df.shape

(9704, 11)

In [30]:
df.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')

In [31]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


### Q2. Records count

In [32]:
# 9704 records in the dataset
print(len(df))

num_records = df.shape[0]
print("The no. of records:", num_records)

9704
The no. of records: 9704


In [33]:
df.index.size

9704

In [34]:
df.shape[0]

9704

In [35]:
df.shape

(9704, 11)

### Q3. Fuel types

In [36]:
# two types of fuel 
df['fuel_type'].unique()

array(['Gasoline', 'Diesel'], dtype=object)

In [37]:
unique_fuel_types = df['fuel_type'].unique()
num_fuel_types = df['fuel_type'].nunique()

print("Fuel types:", unique_fuel_types)
print("Number of fuel types:", num_fuel_types)

Fuel types: ['Gasoline' 'Diesel']
Number of fuel types: 2


### Q4. Missing values

In [38]:
# 4 columns with missing values
df.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [39]:
no_columns = df.isnull().any().sum()
print("The numbers of columns with missing values:", no_columns)

The numbers of columns with missing values: 4


In [40]:
missing_columns = data.columns[data.isnull().any()]
print(missing_columns)

Index(['num_cylinders', 'horsepower', 'acceleration', 'num_doors'], dtype='object')


### Q5. Max fuel efficiency

In [41]:
df['origin'].unique()

array(['Europe', 'USA', 'Asia'], dtype=object)

In [42]:
# the maximum fuel efficiency of cars from Asia: 23.76
df.groupby('origin').max('fuel_efficiency_mpg').round(2)

Unnamed: 0_level_0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,num_doors,fuel_efficiency_mpg
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Asia,370,11.0,245.0,4661.14,22.7,2023,4.0,23.76
Europe,380,13.0,271.0,4726.98,23.5,2023,3.0,25.97
USA,380,13.0,252.0,4739.08,24.3,2023,3.0,24.97


In [43]:
asian_cars_only = df[df["origin"]  == 'Asia']
print("the maximum fuel efficiency of cars from Asia: ", asian_cars_only["fuel_efficiency_mpg"].max().round(2))

the maximum fuel efficiency of cars from Asia:  23.76


### Q6. Median value of horsepower

In [44]:
data['horsepower'].isnull().sum()

np.int64(708)

In [45]:
df['horsepower'].isnull().sum()

np.int64(708)

In [46]:
print("the median value of horsepower before filling the nulls:",df['horsepower'].median().round(2))

most_freq_hp = df['horsepower'].value_counts().idxmax()
print("the most frequent value of horsepower:", most_freq_hp)

the median value of horsepower before filling the nulls: 149.0
the most frequent value of horsepower: 152.0


In [47]:
df = df.fillna({"horsepower": most_freq_hp})
print("the median value of horsepower after filling the nulls:",df['horsepower'].median().round(2))

the median value of horsepower after filling the nulls: 152.0


### Yes, it increased

In [48]:
df['horsepower'].isnull().sum()

np.int64(0)

### Q7. Sum of weights

In [49]:
asian_cars = df[df['origin']  == 'Asia']
asian_cars

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
8,250,1.0,174.0,2714.219310,10.3,2016,Asia,Diesel,Front-wheel drive,-1.0,16.823554
12,320,5.0,145.0,2783.868974,15.1,2010,Asia,Diesel,All-wheel drive,1.0,16.175820
14,200,6.0,160.0,3582.687368,14.9,2007,Asia,Diesel,All-wheel drive,0.0,11.871091
20,150,3.0,197.0,2231.808142,18.7,2011,Asia,Gasoline,Front-wheel drive,1.0,18.889083
21,160,4.0,133.0,2659.431451,,2016,Asia,Gasoline,Front-wheel drive,-1.0,16.077730
...,...,...,...,...,...,...,...,...,...,...,...
9688,260,4.0,152.0,3948.404625,15.5,2018,Asia,Diesel,All-wheel drive,-1.0,11.054830
9692,180,3.0,188.0,3680.341381,18.0,2016,Asia,Gasoline,Front-wheel drive,1.0,11.711653
9693,280,2.0,148.0,2545.070139,15.6,2012,Asia,Diesel,All-wheel drive,0.0,17.202782
9698,180,1.0,131.0,3107.427820,13.2,2005,Asia,Gasoline,Front-wheel drive,-2.0,13.933716


In [19]:
first_7 = asian_cars[['vehicle_weight', 'model_year']].head(7)
first_7

Unnamed: 0,vehicle_weight,model_year
8,2714.21931,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
34,2844.227534,2014
38,3761.994038,2019


In [20]:
X = first_7.values
X

array([[2714.21930965, 2016.        ],
       [2783.86897424, 2010.        ],
       [3582.68736772, 2007.        ],
       [2231.8081416 , 2011.        ],
       [2659.43145076, 2016.        ],
       [2844.22753389, 2014.        ],
       [3761.99403819, 2019.        ]])

In [21]:
X.T

array([[2714.21930965, 2783.86897424, 3582.68736772, 2231.8081416 ,
        2659.43145076, 2844.22753389, 3761.99403819],
       [2016.        , 2010.        , 2007.        , 2011.        ,
        2016.        , 2014.        , 2019.        ]])

In [22]:
def vector_vector_multiplication(u, v):
    assert u.shape[0] == v.shape[0]
    
    n = u.shape[0]
    
    result = 0.0

    for i in range(n):
        result = result + u[i] * v[i]
    
    return result

def matrix_vector_mult(u, v):
    assert u.shape[1] == v.shape[0]
    
    rows_no = u.shape[0]

    result = np.zeros(rows_no)
    
    for i in range(rows_no):
        result[i] = vector_vector_multiplication(u[i], v)

    return result
    
def matrix_matrix_mult(u, m):
    assert u.shape[1] == m.shape[0],"no_rows != no_cols"
    
    rows_no = u.shape[0]
    cols_no = m.shape[1]

    result = np.zeros([rows_no, cols_no])
    
    for i in range(rows_no):
        for j in range(cols_no):
            # result[i, j] = np.vstack([result[i, j], vector_vector_multiplication(u[i], m[:, i])])
            result[i, j] = vector_vector_multiplication(u[i], m[:, j])
    return result

XTX = matrix_matrix_mult(X.T, X)
XTX

array([[62248334.33150762, 41431216.50732678],
       [41431216.50732678, 28373339.        ]])

In [23]:
XTX_inverse = np.linalg.inv(XTX)
XTX_inverse

array([[ 5.71497081e-07, -8.34509443e-07],
       [-8.34509443e-07,  1.25380877e-06]])

In [24]:
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])
y

array([1100, 1300,  800,  900, 1000, 1100, 1200])

In [25]:
sub_w = XTX_inverse.dot(X.T)
sub_w

array([[-1.31202622e-04, -8.63909858e-05,  3.72634923e-04,
        -4.02726650e-04, -1.62513724e-04, -5.52342829e-05,
         4.65094049e-04],
       [ 2.62636846e-04,  1.96990690e-04, -4.73392228e-04,
         6.58944477e-04,  3.08357831e-04,  1.51636137e-04,
        -6.07979633e-04]])

In [26]:
w = sub_w.dot(y)
w

array([0.01386421, 0.5049067 ])

In [32]:
w.sum()

np.float64(0.5187709081074003)

In [28]:
XTX_2 = (X.T).dot(X)

XTX_2_inverse = np.linalg.inv(XTX_2)

In [29]:
sub_w_2 = XTX_2_inverse.dot(X.T)
sub_w_2

array([[-1.31202622e-04, -8.63909858e-05,  3.72634923e-04,
        -4.02726650e-04, -1.62513724e-04, -5.52342829e-05,
         4.65094049e-04],
       [ 2.62636846e-04,  1.96990690e-04, -4.73392228e-04,
         6.58944477e-04,  3.08357831e-04,  1.51636137e-04,
        -6.07979633e-04]])

In [30]:
w2 = sub_w_2.dot(y)
w2

array([0.01386421, 0.5049067 ])

In [31]:
w2.sum()

np.float64(0.5187709081074016)