In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the raw car fuel-efficiency CSV that the whole notebook works on
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'

# Read the CSV straight from the URL into a DataFrame
df = pd.read_csv(DATA_URL)

In [3]:
# Preview the first rows of the loaded DataFrame (head() shows 5 rows by default)
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


## Q1. Pandas Version

In [4]:
# Version string of the pandas library imported as `pd`
pd.__version__

'2.3.1'

## Q2. Records Count

In [5]:
# Q2: the number of records equals the number of rows in the DataFrame
record = len(df.index)

print(f"There are {record} records in the dataset.")

There are 9704 records in the dataset.


## Q3. Fuel Types

In [6]:
# How many fuel types are presented in the dataset?
#
# Count the distinct values of the column directly. The previous
# `value_counts().nunique()` counted the distinct *frequencies* instead, which
# under-counts whenever two fuel types occur the same number of times.
# Like value_counts(), nunique() ignores missing values by default.

fuel_type = df.fuel_type.nunique()

print(f"There are {fuel_type} unique fuel types in the dataset.")

There are 2 unique fuel types in the dataset.


## Q4. Missing Values

In [7]:
# Q4: count the missing (NaN) entries in every column;
# any column with a non-zero count has missing values

df.isna().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [8]:
# A column "has missing values" if at least one of its entries is NaN;
# summing the per-column boolean flags gives the number of such columns
num_columns = int(df.isna().any().sum())

print(f"The are {num_columns} columns with missing values.")

The are 4 columns with missing values.


## Q5. Maximum Fuel Efficiency

In [9]:
# Q5: highest fuel_efficiency_mpg among the rows whose origin is 'Asia'

asia_mask = df['origin'] == 'Asia'
max_fuel_eff_asia = df.loc[asia_mask, 'fuel_efficiency_mpg'].max()

print(f"The maximum fuel efficiency of cars from Asia is {round(max_fuel_eff_asia, 3)}")

The maximum fuel efficiency of cars from Asia is 23.759


## Q6. Median Value of Horsepower

In [10]:
# Q6, step 1: horsepower median while the column still contains its missing values
# (median() skips NaN entries by default)

median_before = df['horsepower'].median()
print(f"Median before filling: {median_before}")

Median before filling: 149.0


In [11]:
# Q6, step 2: find the most frequent horsepower value and how often it occurs.
# The frequency table is built once and queried twice.

horsepower_counts = df.horsepower.value_counts()

most_frequent = horsepower_counts.idxmax()  # the value with the highest frequency
count = horsepower_counts.max()  # how many times that value appears

print(f"{most_frequent} is the value with the highest frequency, appearing {count} times.")

152.0 is the value with the highest frequency, appearing 142 times.


In [12]:
# Q6, step 3: replace every missing horsepower entry with the most frequent value.
# NOTE: this overwrites the column in `df`, so re-running the "median before"
# cell afterwards will no longer see the original missing values.

horsepower_filled = df['horsepower'].fillna(value=most_frequent)
df['horsepower'] = horsepower_filled


In [13]:
# Q6, step 4: horsepower median once the missing values have been filled

median_after = df['horsepower'].median()
print(f"Median after filling: {median_after}")

Median after filling: 152.0


##### _The median increased from 149.0 to 152.0 after filling the missing values in the horsepower column with the most frequent value (152.0)_

## Q7. Sum of Weights

In [14]:
# Q7, step 1: keep only the cars from Asia, restrict them to the
# vehicle_weight and model_year columns, and take the first 7 rows

df_asia = df.loc[df['origin'] == 'Asia', ['vehicle_weight', 'model_year']].head(7)
df_asia

Unnamed: 0,vehicle_weight,model_year
8,2714.21931,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
34,2844.227534,2014
38,3761.994038,2019


In [15]:
# Q7, step 2: extract the two selected columns as a plain NumPy array
# (one row per car, columns in the order listed below)

feature_columns = ['vehicle_weight', 'model_year']
X = df_asia[feature_columns].to_numpy()
X

array([[2714.21930965, 2016.        ],
       [2783.86897424, 2010.        ],
       [3582.68736772, 2007.        ],
       [2231.8081416 , 2011.        ],
       [2659.43145076, 2016.        ],
       [2844.22753389, 2014.        ],
       [3761.99403819, 2019.        ]])

In [16]:
# Q7, step 3: Gram matrix X^T X (transpose of X multiplied by X)

XTX = X.T.dot(X)

XTX

array([[62248334.33150762, 41431216.5073268 ],
       [41431216.5073268 , 28373339.        ]])

In [17]:
# Compute the inverse of XTX
# np.linalg.inv raises LinAlgError if XTX is singular

XTX_inv = np.linalg.inv(XTX)

# print() shows the plain-text form of the array rather than its repr
print(XTX_inv)

[[ 5.71497081e-07 -8.34509443e-07]
 [-8.34509443e-07  1.25380877e-06]]


In [18]:
# Q7, step 5: target vector y with one value for each of the 7 selected rows

y_values = [1100, 1300, 800, 900, 1000, 1100, 1200]
y = np.array(y_values)
y

array([1100, 1300,  800,  900, 1000, 1100, 1200])

In [19]:
# Q7, step 6: w = (X^T X)^-1 X^T y
# X^T y is formed first, then multiplied on the left by the inverse of XTX

XTy = X.T.dot(y)
w = XTX_inv.dot(XTy)
print("w =", w)

w = [0.01386421 0.5049067 ]
