In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Q1. Pandas version
**What's the version of Pandas that you installed?**

In [7]:
# Q1: bare last expression, so the notebook echoes the installed pandas version string.
pd.__version__

'2.2.2'

In [13]:
# Load the car fuel-efficiency dataset.
# Prefer the local copy so the notebook runs offline; if the file has not been
# downloaded yet, read it straight from the upstream repository instead of
# failing, so "Restart & Run All" works on a fresh checkout.
DATA_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
LOCAL_CSV = 'car_fuel_efficiency.csv'

try:
    df = pd.read_csv(LOCAL_CSV)
except FileNotFoundError:
    # No local copy available: fall back to the remote source.
    df = pd.read_csv(DATA_URL)

In [15]:
# Quick look at the data.
# A notebook cell only renders its *last* bare expression, so a bare
# `df.head()` in the middle of the cell is computed and thrown away.
# `display` (provided by the IPython kernel) renders it explicitly.
display(df.head())  # first rows of the frame
df.info()           # prints dtypes and non-null counts to stdout
df.describe()       # last expression: summary statistics, rendered as the cell output

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   num_cylinders        9222 non-null   float64
 2   horsepower           8996 non-null   float64
 3   vehicle_weight       9704 non-null   float64
 4   acceleration         8774 non-null   float64
 5   model_year           9704 non-null   int64  
 6   origin               9704 non-null   object 
 7   fuel_type            9704 non-null   object 
 8   drivetrain           9704 non-null   object 
 9   num_doors            9202 non-null   float64
 10  fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(6), int64(2), object(3)
memory usage: 834.1+ KB


Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,num_doors,fuel_efficiency_mpg
count,9704.0,9222.0,8996.0,9704.0,8774.0,9704.0,9202.0,9704.0
mean,199.708368,3.962481,149.657292,3001.280993,15.021928,2011.484027,-0.006412,14.985243
std,49.455319,1.999323,29.879555,497.89486,2.510339,6.659808,1.048162,2.556468
min,10.0,0.0,37.0,952.681761,6.0,2000.0,-4.0,6.200971
25%,170.0,3.0,130.0,2666.248985,13.3,2006.0,-1.0,13.267459
50%,200.0,4.0,149.0,2993.226296,15.0,2012.0,0.0,15.006037
75%,230.0,5.0,170.0,3334.957039,16.7,2017.0,1.0,16.707965
max,380.0,13.0,271.0,4739.077089,24.3,2023.0,4.0,25.967222


### Q2. Records count
**How many records are in the dataset?**

In [22]:
# Q2: each row is one record, so the first entry of the shape is the answer.
num_records = df.shape[0]
print("Number of records:", num_records)

Number of records: 9704


### Q3. Fuel types
**How many fuel types are presented in the dataset?**

In [30]:
# Q3: collect the distinct non-missing labels in `fuel_type` and count them.
fuel_type_labels = df['fuel_type'].dropna().unique()
num_fuel_types = len(fuel_type_labels)
print("Number of fuel types:", num_fuel_types)

Number of fuel types: 2


### Q4. Missing values
**How many columns in the dataset have missing values?**

In [35]:
# Q4: tally missing entries per column, then keep only the columns
# that have at least one.
missing_values_per_column = df.isna().sum()
has_missing = missing_values_per_column > 0
columns_with_missing = missing_values_per_column[has_missing]

print("\nNumber of columns with missing values:", columns_with_missing.shape[0])


Number of columns with missing values: 4


### Q5. Max fuel efficiency
**What's the maximum fuel efficiency of cars from Asia?**

In [41]:
# Q5: keep only the rows whose origin is Asia.
is_asian = df['origin'] == 'Asia'
asia_cars = df[is_asian]

# Highest mpg among those cars, rounded to two decimals for reporting.
best_asia_mpg = asia_cars['fuel_efficiency_mpg'].max()
max_fuel_efficiency = round(best_asia_mpg, 2)

print(f"Maximum fuel efficiency of cars from Asia: {max_fuel_efficiency} mpg")


Maximum fuel efficiency of cars from Asia: 23.76 mpg


### Q6. Median value of horsepower
**Find the median value of horsepower column in the dataset.
Next, calculate the most frequent value of the same horsepower column.
Use fillna method to fill the missing values in horsepower column with the most frequent value from the previous step.
Now, calculate the median value of horsepower once again.
Has it changed?**

In [45]:
# Q6, step 1: median horsepower before any imputation.
horsepower_raw = df['horsepower']
initial_median = horsepower_raw.median()
print(f"Initial median horsepower: {initial_median}")

Initial median horsepower: 149.0


In [47]:
# Q6, step 2: most frequent horsepower value.
# `mode()` returns a Series (there may be ties), so take its first entry.
horsepower_modes = df['horsepower'].mode()
mode_hp = horsepower_modes.iloc[0]
print(f"Most frequent horsepower value: {mode_hp}")

Most frequent horsepower value: 152.0


In [49]:
# Q6, step 3: replace missing horsepower entries with the mode.
# NOTE: this overwrites the column in `df`, so cells above that read
# `horsepower` give different results if they are re-run after this one.
horsepower_filled = df['horsepower'].fillna(mode_hp)
df['horsepower'] = horsepower_filled

In [51]:
# Q6, step 4: median horsepower now that the gaps have been filled.
horsepower_now = df['horsepower']
new_median = horsepower_now.median()
print(f"New median horsepower after filling missing values: {new_median}")

New median horsepower after filling missing values: 152.0


In [55]:
# Q6, step 5: compare the median after the fill with the one from before it.
changed = new_median != initial_median
print(f"Has the median changed? {'Yes' if changed else 'No'}")

Has the median changed? Yes


Yes, it increased (from 149.0 to 152.0).

### Q7. Sum of weights
1. Select all the cars from Asia.
2. Select only the columns `vehicle_weight` and `model_year`.
3. Select the first 7 values.
4. Get the underlying NumPy array. Let's call it `X`.
5. Compute the matrix-matrix multiplication between the transpose of `X` and `X`. To get the transpose, use `X.T`. Let's call the result `XTX`.
6. Invert `XTX`.
7. Create an array `y` with values `[1100, 1300, 800, 900, 1000, 1100, 1200]`.
8. Multiply the inverse of `XTX` with the transpose of `X`, and then multiply the result by `y`. Call the result `w`.

**What's the sum of all the elements of the result?**

In [62]:
# Q7: closed-form least-squares weights w = (X.T X)^(-1) X.T y on a 7-row sample.
#
# Reuses the `df` loaded at the top of the notebook rather than re-importing
# pandas/numpy and downloading the CSV a second time. Only `origin`,
# `vehicle_weight` and `model_year` are read here, so the horsepower fill
# done in Q6 does not influence the result.

# Step 1: Select all cars from Asia
df_asia = df[df['origin'] == 'Asia']

# Step 2: Select only 'vehicle_weight' and 'model_year' columns
df_selected = df_asia[['vehicle_weight', 'model_year']]

# Step 3: Select the first 7 values
df_7 = df_selected.head(7)

# Step 4: Get the underlying NumPy array
X = df_7.to_numpy()

# Step 5: Compute XTX = X.T @ X
XTX = X.T.dot(X)

# Step 6: Invert XTX
XTX_inv = np.linalg.inv(XTX)

# Step 7: Create y
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])

# Guard: X needs one row per target value and exactly the two selected columns;
# fail with a readable message instead of a shape error inside the product.
assert X.shape == (len(y), 2), f"expected X of shape ({len(y)}, 2), got {X.shape}"

# Step 8: Compute w = (XTX)^(-1) * X.T * y
w = XTX_inv.dot(X.T).dot(y)

# Step 9: Sum of all elements in w
sum_w = w.sum()

print(f"Sum of elements in w: {sum_w}")


Sum of elements in w: 0.5187709081074008
