In [1]:
# Imports for this setup cell, standard library first.
import warnings

from google.colab import drive

# Mount Google Drive at /content/gdrive so files stored there can be read.
drive.mount('/content/gdrive')

# Full path of the dataset inside the mounted Drive.
path_to_csv = '/content/gdrive/My Drive/data.csv'

# No category or module filter is given, so this hides every warning
# emitted from here on.
warnings.filterwarnings("ignore")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import pandas as pd

# Load the provided CSV file 'data.csv' into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer=path_to_csv)

# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4


In [3]:
# Basic statistical description of the data: count, mean, std, min,
# quartiles and max for each numeric column.
summary_stats = df.describe()
summary_stats

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
count,169.0,169.0,169.0,164.0
mean,63.846154,107.461538,134.047337,375.790244
std,42.299949,14.510259,16.450434,266.379919
min,15.0,80.0,100.0,50.3
25%,45.0,100.0,124.0,250.925
50%,60.0,105.0,131.0,318.6
75%,60.0,111.0,141.0,387.6
max,300.0,159.0,184.0,1860.4


In [5]:
# Check if the data has null values.
print('Are there any null values: ',df.isnull().values.any())

# Replace the null values in each column with that column's mean.
# - numeric_only=True restricts the mean to numeric columns, so the cell does
#   not fail if the CSV ever contains a text column.
# - The filled frame is rebound to `df` instead of using inplace=True, which
#   pandas discourages and which cannot be chained.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

# Re-check so the output shows that no nulls remain.
print('Are there any null values after using fillna: ',df.isnull().values.any())

Are there any null values:  True
Are there any null values after using fillna:  False


In [6]:
# Aggregate the data: group the rows by the 'Duration' column and summarise
# the 'Calories' column of each group with mean, min, max and count.
calorie_stats = ['mean', 'min', 'max', 'count']
aggre = df.groupby('Duration').agg({'Calories': calorie_stats})

# Display the aggregated table (one row per distinct 'Duration' value).
aggre

Unnamed: 0_level_0,Calories,Calories,Calories,Calories
Unnamed: 0_level_1,mean,min,max,count
Duration,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
15,87.35,50.5,124.2,2
20,151.6,50.3,229.4,9
25,244.2,244.2,244.2,1
30,192.125,86.2,319.2,16
45,279.096585,100.7,406.0,35
60,341.046465,215.2,486.0,79
75,325.4,320.4,330.4,2
80,643.1,643.1,643.1,1
90,541.8,466.4,700.0,8
120,666.833333,500.0,1000.1,3


In [7]:
# Select the rows whose 'Calories' value lies between 500 and 1000,
# with both bounds included.
calories_in_range = df['Calories'].between(500, 1000)
df[calories_in_range]

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
51,80,123,146,643.1
62,160,109,135,853.0
65,180,90,130,800.4
66,150,105,135,873.4
67,150,107,130,816.0
72,90,100,127,700.0
73,150,97,127,953.2
75,90,98,125,563.2
78,120,100,130,500.4
83,120,100,130,500.0


In [8]:
# Select the rows that satisfy both conditions: 'Calories' strictly above 500
# and 'Pulse' strictly below 100.
high_calories = df['Calories'] > 500
low_pulse = df['Pulse'] < 100
df.loc[high_calories & low_pulse]

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
65,180,90,130,800.4
70,150,97,129,1115.0
73,150,97,127,953.2
75,90,98,125,563.2
99,90,93,124,604.1
103,90,90,100,500.4
106,180,90,120,800.3
108,90,90,120,500.3


In [9]:
# Create a new "df_modified" dataframe that contains all the columns from df
# except for "Maxpulse".
# drop() returns a new frame holding every remaining column, so no column
# list has to be kept in sync with the CSV by hand. errors='ignore' keeps the
# cell runnable even when 'Maxpulse' has already been removed from `df`.
df_modified = df.drop(columns=['Maxpulse'], errors='ignore')
df_modified

Unnamed: 0,Duration,Pulse,Calories
0,60,110,409.1
1,60,117,479.0
2,60,103,340.0
3,45,109,282.4
4,45,117,406.0
...,...,...,...
164,60,105,290.8
165,60,110,300.0
166,60,115,310.2
167,75,120,320.4


In [10]:
# Delete the "Maxpulse" column from the main df dataframe.
# errors='ignore' makes the cell safe to re-run: once the column is gone, a
# plain drop() would raise KeyError on the second execution.
df = df.drop(columns='Maxpulse', errors='ignore')
df

Unnamed: 0,Duration,Pulse,Calories
0,60,110,409.1
1,60,117,479.0
2,60,103,340.0
3,45,109,282.4
4,45,117,406.0
...,...,...,...
164,60,105,290.8
165,60,110,300.0
166,60,115,310.2
167,75,120,320.4


In [11]:
# Convert the datatype of the 'Calories' column to int64.
# NOTE: casting floats with astype drops the fractional part instead of
# rounding, and it raises if the column still contains NaN.
calories_as_int = df['Calories'].astype('int64')
df['Calories'] = calories_as_int

# Show the resulting column dtypes to confirm the conversion.
df.dtypes

Duration    int64
Pulse       int64
Calories    int64
dtype: object