<h3>Loading data</h3>

In [61]:
import pandas as pd
import numpy as np
import math 
from IPython.display import display



global pk
dataset = pd.read_csv("../Data/Practice.csv")

dataset.head(5)

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0


<h2> Locate/Location Function</h2>

In [62]:
#we can use the loc function to locate/access elements the data structure by labels 
print(float(dataset.loc[0,'Pulse']))

#we can also use the loc func to select specific parts of the dataStructure 
dataset.loc[5:10,['Date','Maxpulse']]

#loc supports boolean indexing for conditional selecting 
dataset.loc[dataset['Maxpulse'] > 130]

#boolean indexing with limited column 
dataset.loc[dataset['Maxpulse'] > 135,["Calories",'Date']]


110.0


Unnamed: 0,Calories,Date
1,479.0,'2020/12/02'
3,282.4,'2020/12/04'
4,406.0,'2020/12/05'
6,374.0,'2020/12/07'
10,329.3,'2020/12/11'


In [63]:
#iloc is similar to loc but uses indexes instead of labels
print(float(dataset.iloc[2,3]))

#returns dataStructures for more than 1 values
dataset.iloc[5:12,:4]         

#does not support boolean indexing 

135.0


Unnamed: 0,Duration,Date,Pulse,Maxpulse
5,60,'2020/12/06',102,127
6,60,'2020/12/07',110,136
7,450,'2020/12/08',104,134
8,30,'2020/12/09',109,133
9,20,'2020/12/10',98,124
10,60,'2020/12/11',103,147
11,60,'2020/12/12',100,120


<h3>at and iat</h3>

In [64]:
#at is used to access or change single values 
print(float(dataset.at[9,'Duration']))


dataset.at[7,'Duration'] = 120
display(dataset.loc[7])

#iat is similar to at but uses index addressing instead of label addressing 
print(int(dataset.iat[7,0]))

#modifying values with iat 
dataset.iat[10,0] = 55
display((dataset.loc[10]))

20.0


Duration             120
Date        '2020/12/08'
Pulse                104
Maxpulse             134
Calories           253.3
Name: 7, dtype: object

120


Duration              55
Date        '2020/12/11'
Pulse                103
Maxpulse             147
Calories           329.3
Name: 10, dtype: object

In [65]:
#query is used for more readable and sql like filtering but is slightly slower
dataset.query('Duration < 60')

#is more readable for more complex queries 
dataset.query('Duration < 60 and Maxpulse > 120 or Calories > 400')                                    #query

#Same query but using loc 
dataset.loc[(dataset['Duration'] < 60) & (dataset['Maxpulse'] > 120) | (dataset['Calories'] > 400)]    #loc



Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
8,30,'2020/12/09',109,133,195.1
9,20,'2020/12/10',98,124,269.0
10,55,'2020/12/11',103,147,329.3
20,45,'2020/12/20',97,125,243.0
24,45,'2020/12/24',105,132,246.0


<h3>Rename Function</h3>

In [66]:
#Renaming columns 
dataset.rename(columns={"Duration":"Time","Calories":"Cal"}).head(3)

#Rename indexes
dataset.rename(index={0:'row1',1:'row2'}).head(3)

#Copies are returned 


Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
row1,60,'2020/12/01',110,130,409.1
row2,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0


<h3>Replacing values</h3>

In [67]:
#replacing values over a dataframe 
display(dataset.head(2))
display(dataset.replace(130,23).head(2))

#replacing more than 1 values with more than 1 values can be acheived by using of dictionaries
display(dataset.iloc[:,0].head(4))
dataset.iloc[:,0].replace({60:52,45:32}).head(4)



Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0


Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,23,409.1
1,60,'2020/12/02',117,145,479.0


0    60
1    60
2    60
3    45
Name: Duration, dtype: int64

0    52
1    52
2    52
3    32
Name: Duration, dtype: int64

<h3>AsType Function</h3>

In [68]:
#AsType is used to change or modify datatypes
#generating copy to modify
duration = dataset.loc[:,'Duration'].copy()

#printing dtype before and after to verify change 
print(duration.dtype,duration.memory_usage(deep=True),'bytes')
#change
duration = duration.astype(np.int16)
#after
print(duration.dtype,duration.memory_usage(deep=True),'bytes')


int64 388 bytes
int16 196 bytes


<h3>Apply function</h3> 

In [69]:
#Generating a copy to modify 
dataset_copy = dataset.copy()

#creating function to apply
def to_seconds(x):
    return x * 60 

#using apply() to apply the function
dataset_copy.iloc[:,0].apply(to_seconds)

#using lambda functions as a shorthand 
dataset_copy.iloc[:,0] = dataset_copy.iloc[:,0].apply(lambda x : x * 60)
display(dataset_copy.head(3))


Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,3600,'2020/12/01',110,130,409.1
1,3600,'2020/12/02',117,145,479.0
2,3600,'2020/12/03',103,135,340.0


<h3>Duplicate Functions</h3>

In [70]:
#Generating copy for altering 
dataset_copy = dataset.copy()

#.duplicated() returns true for every row that is duplicated
display(dataset_copy.loc[dataset.duplicated() == True])

#removing duplicated rows 
dataset_copy = dataset_copy.drop_duplicates()

#after removing 
dataset_copy.loc[dataset.duplicated() == True]


Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
12,60,'2020/12/12',100,120,250.7


Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories


<h3>Clip() function</h3>

In [77]:
#Generating copy for altering 
dataset_copy = dataset.copy()

#clip is used to limit the values within a range 
#good to manage outliers 
display(dataset.loc[6:10,:])

#here 120 and 20 are outliers 
#we can use clip to limit them in 30-60 range 
dataset_copy.iloc[:,0] = dataset_copy.iloc[:,0].clip(lower=30,upper=60)
dataset.copy().iloc[6:11,:]

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
6,60,'2020/12/07',110,136,374.0
7,120,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,20,'2020/12/10',98,124,269.0
10,55,'2020/12/11',103,147,329.3


Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
6,60,'2020/12/07',110,136,374.0
7,120,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,20,'2020/12/10',98,124,269.0
10,55,'2020/12/11',103,147,329.3
