<mark> ########################### 07/19/2025 ############################### </mark>


<mark> ############################ Saturday ################################## </mark>

In [None]:
# Load the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the mtcars CSV into a DataFrame, then show its (rows, columns) dimensions
cars = pd.read_csv(filepath_or_buffer='mtcars2.csv')
cars.shape

(32, 13)

In [None]:
# Inspect the loaded frame to confirm the columns and values were read as expected
cars

Unnamed: 0,S.No,Unnamed: 1,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,1,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,2,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,3,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,4,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,5,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
5,6,Valiant,18.1,6,225.0,105,2.76,3.46,,1,0,3,1
6,7,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
7,8,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
8,9,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
9,10,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


**Handling Missing Values**

In [None]:
# An earlier session showed that qsec (seconds taken to cover a quarter mile) contains nulls.
# Rule of thumb used here: if dropping the affected rows would lose 5% or more of the data,
# dropping is not acceptable and the missing values should be filled instead.
# ---------------- Steps to take before modifying the data ----------------
# 1. Get permission before changing any values.
# 2. Agree on the replacement value to use.
# Summary statistics of qsec; `count` covers only the non-null entries.
cars.qsec.describe()

Unnamed: 0,qsec
count,29.0
mean,17.674828
std,1.780394
min,14.5
25%,16.87
50%,17.42
75%,18.6
max,22.9


In [None]:
# The summary above puts the mean at about 17.67 s, close to the median (17.42 s),
# so the mean is taken as the replacement value for the missing qsec entries.
# Step 1: compute the mean of qsec (nulls are skipped) and keep it for the fill.
qsec_avg = cars['qsec'].mean()
qsec_avg

np.float64(17.6748275862069)

In [None]:
# Preview only: fillna returns a new Series and leaves `cars` untouched
cars['qsec'].fillna(value=qsec_avg)

Unnamed: 0,qsec
0,16.46
1,17.02
2,18.61
3,19.44
4,17.02
5,17.674828
6,15.84
7,20.0
8,22.9
9,18.3


In [None]:
# The fill above was never assigned back, so the frame still holds its null values
cars.qsec.isnull().sum()

np.int64(3)

In [None]:
# Now replace the nulls permanently. There are two approaches.
# Approach 1: assign the filled Series back to the column.

# Do NOT combine the assignment with inplace=True, e.g.
#              cars['qsec'] = cars.qsec.fillna(cars.qsec.mean(), inplace=True)
# With inplace=True, fillna returns None, so that assignment would overwrite the column with None.
# Correct form:
cars['qsec'] = cars.qsec.fillna(cars['qsec'].mean())

In [None]:
# Confirm that no nulls remain in qsec after the assignment
cars['qsec'].isna().sum()

np.int64(0)

In [None]:
# Approach 2: inplace parameter.
# Calling fillna(..., inplace=True) on the column (cars.qsec.fillna(qsec_avg, inplace=True))
# acts on an intermediate object: pandas emits a FutureWarning for it and the call will
# stop updating `cars` in pandas 3.0. Fill through the DataFrame with a {column: value}
# mapping instead, so the original object is the one that gets modified.
qsec_avg = cars['qsec'].mean()
cars.fillna({'qsec': qsec_avg}, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cars.qsec.fillna(qsec_avg, inplace=True)


In [None]:
# Still zero nulls in qsec after the inplace approach
cars['qsec'].isnull().sum()

np.int64(0)

**Drop the unwanted Data**

In [None]:
# Show the first five rows to decide which columns are not needed
cars.head(n=5)

Unnamed: 0,S.No,Unnamed: 1,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,1,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,2,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,3,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,4,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,5,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


Drop S.No column


In [None]:
# Preview of the frame without S.No; the result is not stored, so `cars` keeps the column
cars.drop('S.No', axis='columns')

Unnamed: 0,Unnamed: 1,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
5,Valiant,18.1,6,225.0,105,2.76,3.46,17.674828,1,0,3,1
6,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
7,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
8,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
9,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [None]:
# Remove S.No for good. Two ways to do it:
#   Approach 1 - reassign the returned frame:
#                        cars = cars.drop(columns = ['S.No'])
#   Approach 2 - modify the existing frame in place (used here):
cars.drop('S.No', axis=1, inplace=True)
cars

Unnamed: 0,Unnamed: 1,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
5,Valiant,18.1,6,225.0,105,2.76,3.46,17.674828,1,0,3,1
6,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
7,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
8,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
9,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [None]:
# List the remaining column labels to confirm S.No is gone
cars.columns

Index(['Unnamed: 1', 'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs',
       'am', 'gear', 'carb'],
      dtype='object')

Drop the rows

In [None]:
# Preview the frame without the rows labelled 17, 25, 28 and 31
# (per the original note these are the Fiat, Ford and Volvo entries); `cars` is not modified.
# Equivalent spellings: cars.drop(index=[17, 25, 28, 31]) or cars.drop([17, 25, 28, 31])
cars.drop(labels=[17, 25, 28, 31], axis=0)

Unnamed: 0,Unnamed: 1,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
5,Valiant,18.1,6,225.0,105,2.76,3.46,17.674828,1,0,3,1
6,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
7,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
8,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
9,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [None]:
# Example: combined weight (wt) of the first 10 cars
cars['wt'].head(10).sum()

np.float64(31.28)

In [None]:
# Row labels to exclude
cancelled = [2, 5, 6]
# Total wt of every row in `cars` except the cancelled ones (the frame itself is not modified).
# NOTE(review): this sums over the whole frame, not only the first 10 cars of the previous example - confirm that is intended.
cars.drop(index=cancelled)['wt'].sum()

np.float64(93.602)

Renaming columns

In [None]:
# Preview: give the 'Unnamed: 1' column a meaningful name (result is not stored)
cars.rename({'Unnamed: 1': 'Model'}, axis='columns')

Unnamed: 0,Model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
5,Valiant,18.1,6,225.0,105,2.76,3.46,17.674828,1,0,3,1
6,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
7,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
8,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
9,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [None]:
# Apply the rename to `cars` itself, then list the labels to confirm it took effect
cars.rename({'Unnamed: 1': 'Model'}, axis=1, inplace=True)
cars.columns

Index(['Model', 'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am',
       'gear', 'carb'],
      dtype='object')

Slicing and Indexing

iloc - access rows/columns by integer position (0-based)

loc - access rows/columns by label (index label / column name)

In [None]:
#