# GETTING STARTED

 ### Analysing/Understanding the Data

In [35]:
#importing libraries
#pandas and numpy are the libraries, pd and np are simply references to those libraries
import pandas as pd
import numpy as np

#opening the file - df is a common variable name used for holding data extracted from a data file - abbreviated form of DataFrame
#you can name your DataFrame anything you like
#'dirtydata.csv' is a relative path, so the file is looked up in the folder the notebook is running from
  #if this ipynb document has been created in the same location as your data file, you should be able to access the file as follows
df = pd.read_csv('dirtydata.csv')

In [36]:
#to view the DataFrame type df - if there are too many records it will not display them all
  #if it is a small DataFrame like this one, the entire contents will be displayed
  #if it is an extremely large DataFrame you will only see some of the top rows and bottom rows
  #Notice the indexing on the left is in bold - these are row references to make it easier for you to access data
    # - they start at 0
  #They are the DataFrame's row index - they are not one of the columns read from the data file
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


**Important observations from the dataset above**  
  
From this simple command we can make several interesting observations about our data

The data set contains some empty cells ("Date" in row 22, and "Calories" in rows 18 and 28).

The data set contains data that is in the wrong format ("Date" in row 26).

The data set contains incorrect data ("Duration" in row 7).

The data set contains duplicates (rows 11 and 12).

**Remember** - your datasets will not always be this short, so you may not be able to 'see' potential issues. We will need to use other techniques to determine issues with the data. 

In [37]:
#print the shape of the data - a tuple of (number of rows, number of columns)
#here: 32 rows and 5 columns
df.shape

(32, 5)

In [38]:
#print the column headings - the row index is not listed because it is not a column
df.columns

Index(['Duration', 'Date', 'Pulse', 'Maxpulse', 'Calories'], dtype='object')

In [39]:
#print the data type of each column - many analysis and modelling routines expect numerical data
  #We can see here that one of our columns (Date) has a type Object, so it is not being treated as a number or a date
df.dtypes

Duration      int64
Date         object
Pulse         int64
Maxpulse      int64
Calories    float64
dtype: object

In [40]:
#head() shows the first rows of the table - 5 is also what you get when the brackets are left empty
  #change the number in the brackets to display a different number of rows
  #the column on the left (without a heading) is an index reference to each row. It does not exist in the actual data file.
df.head(5)

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0


In [41]:
#tail() is the counterpart of head(): it shows the last rows of the dataframe
  #5 is also what you get when the brackets are left empty
df.tail(5)

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
27,60,'2020/12/27',92,118,241.0
28,60,'2020/12/28',103,132,
29,60,'2020/12/29',100,132,280.0
30,60,'2020/12/30',102,129,380.3
31,60,'2020/12/31',92,115,243.0


In [42]:
#this code provides lots of useful summary statistics about the data -
  #for example, the count for Calories is 30 although there are 32 rows, so the Calories column has 2 missing values
  #by default this just describes the numerical data
df.describe()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
count,32.0,32.0,32.0,30.0
mean,68.4375,103.5,128.5,304.68
std,70.039591,7.832933,12.998759,66.003779
min,30.0,90.0,101.0,195.1
25%,60.0,100.0,120.0,250.7
50%,60.0,102.5,127.5,291.2
75%,60.0,106.5,132.25,343.975
max,450.0,130.0,175.0,479.0


In [43]:
#you can explicitly ask for the numerical columns - for this dataset it produces the same output as the cell above
df.describe(include=[np.number])

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
count,32.0,32.0,32.0,30.0
mean,68.4375,103.5,128.5,304.68
std,70.039591,7.832933,12.998759,66.003779
min,30.0,90.0,101.0,195.1
25%,60.0,100.0,120.0,250.7
50%,60.0,102.5,127.5,291.2
75%,60.0,106.5,132.25,343.975
max,450.0,130.0,175.0,479.0


In [44]:
#you can also retrieve information about the Object data ("O" selects the object columns)
  #the count for Date is 31 although there are 32 rows, so we are missing a Date value
df.describe(include = "O" )

Unnamed: 0,Date
count,31
unique,30
top,'2020/12/12'
freq,2


### Dealing with Null Values

In [45]:
#If you have a large dataset you will have to use code to check for null values
  #isnull() marks every null cell as True; sum(axis = 0) adds those up down each column
  #so this code counts the Null values in each column
df.isnull().sum(axis = 0)

Duration    0
Date        1
Pulse       0
Maxpulse    0
Calories    2
dtype: int64

In [46]:
#Do a google search to see if there are any other ways to count null values
  #isna() is one of them - it produces exactly the same output as the cell above
df.isna().sum()

Duration    0
Date        1
Pulse       0
Maxpulse    0
Calories    2
dtype: int64

In [47]:
#any() gives True for a column with at least one null value and False for a column with none
  #it does not tell you how many values are null
df.isnull().any()

Duration    False
Date         True
Pulse       False
Maxpulse    False
Calories     True
dtype: bool

In [48]:
#Total number of null values within the whole data set (all columns added together)
df.isnull().values.sum()

3

In [49]:
#Total number of cells in the data set that are NOT null - the counterpart of the cell above
df.notnull().values.sum()

157

In [50]:
#This code will count the Null values in each row - sum(axis = 1) adds up across the columns of a row
df.isnull().sum(axis = 1)

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    1
19    0
20    0
21    0
22    1
23    0
24    0
25    0
26    0
27    0
28    1
29    0
30    0
31    0
dtype: int64

In [51]:
#dropna() removes every row that contains at least one null value

#for demonstration purposes a fresh copy of the data file is read and cleaned in one chained expression
  #the result goes into a temporary DataFrame - our original df will not be changed
temp_df = pd.read_csv('dirtydata.csv').dropna()

#rows and columns that are left after dropping the rows with null values
temp_df.shape

(29, 5)

In [52]:
#rows 18 and 28 in the dataset have null values in the Calories column - many algorithms cannot handle null values
  #rather than simply drop rows and potentially lose other valuable data,
  #data scientists employ various techniques to handle missing values
#here the null values in one specific column are replaced with one specific value

#to test this read in a fresh copy of your data file
temp_df = pd.read_csv('dirtydata.csv')

#replace null values in the Calories column with a specific value
  #the filled column is assigned back to the DataFrame
  #calling fillna(..., inplace = True) on temp_df['Calories'] acts on an intermediate object:
  #pandas 2.x warns about it and with Copy-on-Write temp_df itself is left unchanged
temp_df['Calories'] = temp_df['Calories'].fillna(130)

#display temp_df
temp_df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [53]:
#Count the Null values in each column again to see that the Calories column no longer contains null values
  #the Date column was not filled, so its null value is still counted
temp_df.isnull().sum(axis = 0)

Duration    0
Date        1
Pulse       0
Maxpulse    0
Calories    0
dtype: int64

In [54]:
#Or display the contents of temp_df and look at the Calories value in rows 18 and 28
temp_df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [55]:
#You could also display the contents of just one column in the DataFrame
#When accessing columns in your code:
  #make sure the name is in quotes and matches the exact spelling (including capital letters) of the column name in the dataset
temp_df['Calories']

0     409.1
1     479.0
2     340.0
3     282.4
4     406.0
5     300.0
6     374.0
7     253.3
8     195.1
9     269.0
10    329.3
11    250.7
12    250.7
13    345.3
14    379.3
15    275.0
16    215.2
17    300.0
18    130.0
19    323.0
20    243.0
21    364.2
22    282.0
23    300.0
24    246.0
25    334.5
26    250.0
27    241.0
28    130.0
29    280.0
30    380.3
31    243.0
Name: Calories, dtype: float64

In [56]:
#Another way of accessing a single column is as follows:
temp_df.Calories
  #NOTE:This technique will not work if your column name has a space in it - square brackets and quotes always work

0     409.1
1     479.0
2     340.0
3     282.4
4     406.0
5     300.0
6     374.0
7     253.3
8     195.1
9     269.0
10    329.3
11    250.7
12    250.7
13    345.3
14    379.3
15    275.0
16    215.2
17    300.0
18    130.0
19    323.0
20    243.0
21    364.2
22    282.0
23    300.0
24    246.0
25    334.5
26    250.0
27    241.0
28    130.0
29    280.0
30    380.3
31    243.0
Name: Calories, dtype: float64

In [57]:
#It is more common to use either mean, median or mode to fill in a missing value

#MEAN
#to test this read in a fresh copy of your data file
temp_df = pd.read_csv('dirtydata.csv')

#calculate the mean of the calories column - 304.68
x = temp_df['Calories'].mean()

#Print the value of x - just so you can see it
#print(x)

#If you want to print the contents of a variable that holds a numeric value and some text,
  #you need to cast your numeric value to a string
print("Mean:" + str(x))

#Use x to fill null values and assign the filled column back to the DataFrame
  #fillna(..., inplace = True) on temp_df['Calories'] acts on an intermediate object:
  #pandas 2.x warns about it and with Copy-on-Write temp_df itself is left unchanged
temp_df['Calories'] = temp_df['Calories'].fillna(x)

#display temp_df to check rows 18 and 28
temp_df

Mean:304.68


Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [58]:
#MEDIAN
#to test this read in a fresh copy of your data file
temp_df = pd.read_csv('dirtydata.csv')

#calculate the median of the Calories column - 291.2
x = temp_df['Calories'].median()
print("Median:" + str(x))

#Use x to fill null values and assign the filled column back to the DataFrame
  #fillna(..., inplace = True) on temp_df['Calories'] acts on an intermediate object:
  #pandas 2.x warns about it and with Copy-on-Write temp_df itself is left unchanged
temp_df['Calories'] = temp_df['Calories'].fillna(x)

#display temp_df - check rows 18 and 28
temp_df

Median:291.2


Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [59]:
#MODE
#to test this read in a fresh copy of your data file
temp_df = pd.read_csv('dirtydata.csv')

#calculate the mode of the Calories column - 300.0
  #mode() returns a Series because several values can be equally common - [0] takes the first one
x = temp_df['Calories'].mode()[0]
print("Mode:" + str(x))

#Use x to fill null values and assign the filled column back to the DataFrame
  #fillna(..., inplace = True) on temp_df['Calories'] acts on an intermediate object:
  #pandas 2.x warns about it and with Copy-on-Write temp_df itself is left unchanged
temp_df['Calories'] = temp_df['Calories'].fillna(x)

#display temp_df - check rows 18 and 28
temp_df

Mode:300.0


Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [60]:
#MAX - to calculate this use max()

#read in a fresh copy of your data file to temp_df
temp_df = pd.read_csv('dirtydata.csv')

#calculate the maximum value of the Calories column - 479.0
x = temp_df['Calories'].max()
#print the maximum value
print("Max:" + str(x))

#Use the maximum value (x) to fill null values and assign the filled column back to the DataFrame
  #fillna(..., inplace = True) on temp_df['Calories'] acts on an intermediate object:
  #pandas 2.x warns about it and with Copy-on-Write temp_df itself is left unchanged
temp_df['Calories'] = temp_df['Calories'].fillna(x)

#display temp_df - check rows 18 and 28
temp_df

Max:479.0


Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [61]:
#MIN - to calculate this use min()

#read in a fresh copy of your data file to temp_df
temp_df = pd.read_csv('dirtydata.csv')

#calculate the minimum value in the Calories column - 195.1
x = temp_df['Calories'].min()

#print the minimum value
print("min:" + str(x))

#Use the minimum value (x) to fill null values and assign the filled column back to the DataFrame
  #fillna(..., inplace = True) on temp_df['Calories'] acts on an intermediate object:
  #pandas 2.x warns about it and with Copy-on-Write temp_df itself is left unchanged
temp_df['Calories'] = temp_df['Calories'].fillna(x)

#display temp_df - check rows 18 and 28
temp_df

min:195.1


Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [62]:
#Choose either mean, median, mode, max or min
  #and replace the null values in the Calories column of our original dataframe - df

#the median is the value chosen here
  #it is calculated from df itself, because df is the dataframe that is cleaned and saved further down
median_calories = df['Calories'].median()
print("Median:" + str(median_calories))

#fill the null values in the Calories column and assign the filled column back to df
  #fillna(..., inplace = True) on df['Calories'] would act on an intermediate object and may leave df unchanged
df['Calories'] = df['Calories'].fillna(median_calories)

#display rows 18 and 28 of df (the rows that had no Calories value) to check the changes
df.loc[[18, 28]]


Median:291.2
    Duration          Date  Pulse  Maxpulse  Calories
18        45  '2020/12/18'     90       112     291.2
28        60  '2020/12/28'    103       132     291.2


In [63]:
#display the original dataframe
  #if we look at the Date column we can see an issue in row 22 (no value) and row 26 (different format)
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [76]:
#row 22 has no value in the Date column - it shows as NaN, or as NaT once the column has been converted to dates
#a missing date can be handled like any other NULL value, so we can remove the row by using the dropna() method.
#Rather than just drop all rows with null values - we can explicitly state to drop rows where there is a null value in Date
    #This means that rows whose only null values are in other columns are not deleted

#the result is assigned back to df instead of using inplace = True
df = df.dropna(subset=['Date'])
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60.0,'2020/12/01',110,130,409.1
1,60.0,'2020/12/02',117,145,479.0
2,60.0,'2020/12/03',103,135,340.0
3,45.0,'2020/12/04',109,175,282.4
4,45.0,'2020/12/05',117,148,406.0
5,60.0,'2020/12/06',102,127,300.0
6,60.0,'2020/12/07',110,136,374.0
8,30.0,'2020/12/09',109,133,195.1
9,60.0,'2020/12/10',98,124,269.0
10,60.0,'2020/12/11',103,147,329.3


In [None]:
#check the data types in the dataframe again
  #Date only stops being type Object once the pd.to_datetime conversion has been run on it
  #if Date still shows as object here, run the conversion cell and then re-run this one
df.dtypes

In [81]:
#when we run the following code it will attempt to convert the values in the Date column to real dates
  #pandas is already imported as pd at the top of the notebook, so it is not imported again here
  #format='mixed' lets pandas work out the format of each value individually,
  #so a value that is written differently from the others (such as row 26) can still be converted
  #the pandas documentation warns that 'mixed' is risky for ambiguous dates -
  #give an explicit format instead whenever all of your values share one
df['Date'] = pd.to_datetime(df['Date'], format='mixed')
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60.0,2020-12-01,110,130,409.1
1,60.0,2020-12-02,117,145,479.0
2,60.0,2020-12-03,103,135,340.0
3,45.0,2020-12-04,109,175,282.4
4,45.0,2020-12-05,117,148,406.0
5,60.0,2020-12-06,102,127,300.0
6,60.0,2020-12-07,110,136,374.0
8,30.0,2020-12-09,109,133,195.1
9,60.0,2020-12-10,98,124,269.0
10,60.0,2020-12-11,103,147,329.3


### Handling Wrong Data 

In [None]:
#"Wrong data" does not have to be "empty cells" or "wrong format",
#it can just be wrong, like if someone registered "199" instead of "1.99"
#The duration value in row 7 is evidently wrong (450, while the other rows shown are between 30 and 60)

#We could change the value in the field with a specific value, as follows:
  #loc[row label, column name] addresses one single cell
df.loc[7,'Duration'] = 45
df

In [65]:
#We will now change it back to the wrong value so we can try something else
df.loc[7,'Duration'] = 450
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [66]:
#If the data set is too large it would not be possible to manually locate errors
  #To check which rows in a particular column contain incorrect data,
  #we can walk through the column one value at a time

#items() hands us each row label together with its "Duration" value
  #if the value is higher than 120, print the row label and the value - just so we can see where they are
for row_label, duration in df['Duration'].items():
  if duration > 120:
    print(row_label, duration)

7 450


In [67]:
#cap the "Duration" column: every value that is higher than 120 is set to 120

#over_limit holds True for the rows that are higher than 120 and False for all other rows
over_limit = df['Duration'] > 120
#only the rows marked True are overwritten
df.loc[over_limit, 'Duration'] = 120
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,120,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [68]:
#We could replace values with a mean value
#reset the value again
df.loc[7,'Duration'] = 450

#the mean is usually not a whole number, so convert the column to float before writing it in
  #writing a float into an int64 column relies on silent upcasting, which pandas 2.1+ deprecates
df['Duration'] = df['Duration'].astype(float)

#calculate the mean value of the duration column
  #note: the wrong value (450) is itself included in this mean
  #the exact result depends on which rows are still in df when this cell runs
m = df['Duration'].mean()

#You could round the result if you needed to - try this
#m = round(m, 0)

#select every row where "Duration" is higher than 120 and set it to m (the mean)
df.loc[df['Duration'] > 120, 'Duration'] = m

df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60.0,'2020/12/01',110,130,409.1
1,60.0,'2020/12/02',117,145,479.0
2,60.0,'2020/12/03',103,135,340.0
3,45.0,'2020/12/04',109,175,282.4
4,45.0,'2020/12/05',117,148,406.0
5,60.0,'2020/12/06',102,127,300.0
6,60.0,'2020/12/07',110,136,374.0
7,68.4375,'2020/12/08',104,134,253.3
8,30.0,'2020/12/09',109,133,195.1
9,60.0,'2020/12/10',98,124,269.0


In [69]:
#alternatively we could drop the row if it exceeds a certain value in a specified column
#reset the value again
df.loc[7,'Duration'] = 450

#collect the row labels of every row where "Duration" is higher than 120 and drop them in one step
  #this avoids removing rows from df while a loop is still walking over its index
too_long = df.index[df['Duration'] > 120]
df = df.drop(index=too_long)

df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60.0,'2020/12/01',110,130,409.1
1,60.0,'2020/12/02',117,145,479.0
2,60.0,'2020/12/03',103,135,340.0
3,45.0,'2020/12/04',109,175,282.4
4,45.0,'2020/12/05',117,148,406.0
5,60.0,'2020/12/06',102,127,300.0
6,60.0,'2020/12/07',110,136,374.0
8,30.0,'2020/12/09',109,133,195.1
9,60.0,'2020/12/10',98,124,269.0
10,60.0,'2020/12/11',103,147,329.3


### Removing Duplicates

In [70]:
#look at the dataframe above, rows 11 & 12 are duplicated

#This code returns True for every row that repeats an earlier row, otherwise it returns False
  #the first occurrence is not marked - only the later copy is (see row 12 in the output)
df.duplicated()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
8     False
9     False
10    False
11    False
12     True
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
dtype: bool

In [71]:
#drop_duplicates() returns the dataframe without the duplicated rows
  #the result is stored back in df, so from here on df no longer contains them
df = df.drop_duplicates()
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60.0,'2020/12/01',110,130,409.1
1,60.0,'2020/12/02',117,145,479.0
2,60.0,'2020/12/03',103,135,340.0
3,45.0,'2020/12/04',109,175,282.4
4,45.0,'2020/12/05',117,148,406.0
5,60.0,'2020/12/06',102,127,300.0
6,60.0,'2020/12/07',110,136,374.0
8,30.0,'2020/12/09',109,133,195.1
9,60.0,'2020/12/10',98,124,269.0
10,60.0,'2020/12/11',103,147,329.3


In [72]:
#Save the dataframe as a new dataset - 'cleandata.csv' is written next to the notebook (relative path)
#index = False is used to ensure that the row index values are not included in the dataset
df.to_csv('cleandata.csv', index = False)