# GETTING STARTED

### Analysing/Understanding the Data

In [None]:
#importing libraries
#pandas and numpy are the libraries, pd and np are simply references to those libraries
import pandas as pd
import numpy as np

#open the data file - df (short for DataFrame) is a common name for the variable that holds the loaded data
#you can name your DataFrame anything you like
#the file name below has no folder in front of it, so this works when
  #this ipynb document has been created in the same location as your data file
df = pd.read_csv('dirtydata.csv')

In [None]:
#to view the DataFrame type df - if there are too many records it will not display them all
  #if it is a small DataFrame like this one, the entire contents will be displayed
  #if it is an extremely large DataFrame you will only see some of the top rows and bottom rows
  #Notice the numbers in bold on the left - these are row references (the index) to make it easier for you to access data
    # - they start at 0
  #They are not a column of your data - they do not come from the data file
df

**Important observations from the dataset above**  
  
From this simple command we can make several interesting observations about our data

The data set contains some empty cells ("Date" in row 22, and "Calories" in rows 18 and 28).

The data set contains data that is in the wrong format ("Date" in row 26).

The data set contains incorrect data ("Duration" in row 7).

The data set contains duplicates (rows 11 and 12).

**Remember** - your datasets will not always be this short, so you may not be able to 'see' potential issues. We will need to use other techniques to determine issues with the data. 

In [None]:
#show the shape of the data as (number of rows, number of columns)
#for this dataset: 32 rows and 5 columns
df.shape

In [None]:
#show the column headings (the column labels of the DataFrame)
df.columns

In [None]:
#show the data type of each column - calculations such as mean or max need numerical columns
  #We can see here that one of our columns has the type object (it is not stored as numbers)
df.dtypes

In [None]:
#head() lets you have a look at the top 5 rows of the table
  #to display a different number of rows, type that number in the brackets, e.g. df.head(10)
  #the column on the left (without a heading) is an index reference to each row. It does not exist in the actual data file.
df.head()

In [None]:
#Look at the last 5 rows of the dataframe - do a google search for 'pandas view the last n rows in dataframe'


In [None]:
#describe() provides lots of useful summary information about the data -
  #for example, you can see that the Calories column has 2 missing values
  #(its count is 2 lower than the count of the other columns)
  #by default this just describes the numerical data
df.describe()

In [None]:
#you can explicitly ask for the numerical columns - it will produce the same output as the cell above
df.describe(include=[np.number])

In [None]:
#you can also retrieve information about the object (non-numerical) columns by passing "O"
  #here you can see that we are missing a Date value
df.describe(include = "O" )

### Dealing with Null Values

In [None]:
#With a large dataset you cannot spot null values by eye, so use code to find them
  #isnull() marks every empty cell as True; summing down the index gives one total per column
df.isnull().sum(axis='index')

In [None]:
#Do a google search to see if there are any other ways to count null values


In [None]:
#Summing across the columns instead gives the number of null values in each row
df.isnull().sum(axis='columns')

In [None]:
#if you simply want to drop any rows that contain null values use dropna()

#for demonstration purposes read in a fresh copy of the data file to a new temporary DataFrame
  #Our original df will not be changed
temp_df = pd.read_csv('dirtydata.csv')

#drop the rows with null values from the temporary copy
  #dropna() returns a new DataFrame, so the result is assigned back to temp_df
temp_df = temp_df.dropna()

#Check the shape of the DataFrame after dropping rows with null values
temp_df.shape

In [None]:
#rows 18 and 28 in the dataset have null values in the Calories column - algorithms cannot handle null values
  #rather than simply drop rows and potentially lose other valuable data,
  #data scientists employ various techniques to handle missing values
#if you want to replace null values in a specific column with a specific value, use fillna()
    #fillna() returns a new column, so assign it back to the DataFrame to keep the change
    #(calling fillna(..., inplace = True) on temp_df['Calories'] is "chained assignment":
    # newer pandas versions warn about it and it may leave the DataFrame unchanged)

#to test this read in a fresh copy of your data file
temp_df = pd.read_csv('dirtydata.csv')

#replace null values in the Calories column with a specific value
temp_df['Calories'] = temp_df['Calories'].fillna(130)

#display temp_df
temp_df

In [None]:
#Total the null values per column once more - Calories should now show 0
temp_df.isnull().sum(axis='index')

In [None]:
#Or display the contents of temp_df and check the Calories value in rows 18 and 28
temp_df

In [None]:
#You could also display the contents of just one column in the DataFrame
#When accessing columns in your code:
  #put the name in quotes inside square brackets and match the exact spelling of the column name in the dataset
temp_df['Calories']

In [None]:
#Another way of accessing a single column is to write its name after a dot:
temp_df.Calories
  #NOTE: This technique will not work if your column name has a space in it - square brackets and quotes always work

In [None]:
#It is more common to use either mean, median or mode to fill in a missing value

#MEAN
#to test this read in a fresh copy of your data file
temp_df = pd.read_csv('dirtydata.csv')

#calculate the mean of the Calories column - 304.68
x = temp_df['Calories'].mean()

#Print the value of x - just so you can see it
#print(x)

#If you want to print the contents of a variable that holds a numeric value and some text,
  #you need to cast your numeric value to a string
print("Mean:" + str(x))

#Use x to fill null values
  #fillna() returns a new column - assign it back to the DataFrame to keep the change
  #(fillna(..., inplace = True) on a single selected column may not update the DataFrame in newer pandas versions)
temp_df['Calories'] = temp_df['Calories'].fillna(x)

#display temp_df to check rows 18 and 28
temp_df

In [None]:
#MEDIAN
#to test this read in a fresh copy of your data file
temp_df = pd.read_csv('dirtydata.csv')

#calculate the median of the Calories column - 291.2
x = temp_df['Calories'].median()
print("Median:" + str(x))

#Use x to fill null values
  #fillna() returns a new column - assign it back to the DataFrame to keep the change
  #(fillna(..., inplace = True) on a single selected column may not update the DataFrame in newer pandas versions)
temp_df['Calories'] = temp_df['Calories'].fillna(x)

#display temp_df - check rows 18 and 28
temp_df

In [None]:
#MODE
#to test this read in a fresh copy of your data file
temp_df = pd.read_csv('dirtydata.csv')

#calculate the mode of the Calories column - 300.0
  #mode() returns a list of values because several values can be equally common, so [0] takes the first one
x = temp_df['Calories'].mode()[0]
print("Mode:" + str(x))

#Use x to fill null values
  #fillna() returns a new column - assign it back to the DataFrame to keep the change
  #(fillna(..., inplace = True) on a single selected column may not update the DataFrame in newer pandas versions)
temp_df['Calories'] = temp_df['Calories'].fillna(x)

#display temp_df - check rows 18 and 28
temp_df

In [None]:
#MAX - to calculate this use max()
#EXERCISE: write your own code under each of the empty comment steps below

#read in a fresh copy of your data file to temp_df
temp_df = pd.read_csv('dirtydata.csv')

#calculate the maximum value of the Calories column - 479.0

#print the maximum value


#Use the maximum value to fill null values

#display temp_df - check rows 18 and 28
temp_df

In [None]:
#MIN - to calculate this use min()
#EXERCISE: write your own code under each of the empty comment steps below

#read in a fresh copy of your data file to temp_df
temp_df = pd.read_csv('dirtydata.csv')

#calculate the minimum value in the Calories column - 195.1

#print the minimum value

#Use the minimum value to fill null values

#display temp_df - check rows 18 and 28
temp_df

In [None]:
#Choose either mean, median, mode, max or min 
  #and replace the null values in the Calories column of our original dataframe - df

    

### Dealing with Date Issues - Nulls & Date Formats

In [None]:
#display the original dataframe
  #if we look at the Date column we can see an issue in row 22 (empty) and row 26 (different format)
df

In [None]:
#when we run the following code it will attempt to convert the values in the Date column to dates
  #the converted column replaces the original Date column in df
#NOTE(review): this column mixes date formats (see row 26) - pandas 2.x is documented to need
  #format='mixed' in to_datetime for such columns; confirm against the installed pandas version
df['Date'] = pd.to_datetime(df['Date'])
df

In [None]:
#check the data types in the dataframe again and you will notice that Date no longer has the type object
df.dtypes

In [None]:
#from the data above we can see the issue with the date in row 26 was resolved
#the conversion above gave us a NaT value in row 22, which can be handled as a NULL value
#we can remove the row by using the dropna() method.
#Rather than just drop all rows with null values - we can explicitly state to drop rows where there is a null value in Date
    #This means that rows with null values in other columns are not deleted
    #inplace = True changes df itself, so the result does not need to be assigned to a variable

df.dropna(subset=['Date'], inplace = True)
df

### Handling Wrong Data 

In [None]:
#"Wrong data" does not have to be "empty cells" or "wrong format",
#it can just be wrong, like if someone registered "199" instead of "1.99"
#The duration value in row 7 is evidently wrong

#We could replace the value in the field with a specific value, as follows:
  #df.loc[row, column] refers to a single cell - here row 7 of the Duration column
df.loc[7,'Duration'] = 45
df

In [None]:
#We will now change row 7 back to the wrong value (450) so we can try something else
df.loc[7,'Duration'] = 450
df

In [None]:
#If the data set is too large it would not be possible to manually locate errors
  #To check which rows in a particular column contain incorrect data,
  #we can loop through all values in that column

#walk through the "Duration" column one (row number, value) pair at a time
  #whenever the value is higher than 120, print the row number and the value
for row_number, duration in df['Duration'].items():
    if duration > 120:
        #print the row and value - just so we can see where they are
        print(row_number, duration)

In [None]:
#if the value is higher than 120, set it to 120
  #df['Duration'] > 120 is True for every row that is too high;
  #using it inside df.loc changes all of those rows in one step, so no loop is needed
df.loc[df['Duration'] > 120, 'Duration'] = 120
df

In [None]:
#We could replace values with a mean value
#reset the value again
df.loc[7,'Duration'] = 450

#the mean is usually not a whole number, so store Duration as decimal numbers before writing the mean into it
  #(newer pandas versions warn when a decimal value is written into a whole-number column)
df['Duration'] = df['Duration'].astype(float)

#calculate the mean value of the duration column - 69.19354838709677
m = df['Duration'].mean()

#You could round the result if you needed to - try this
#m = round(m, 0)

#if the value is higher than 120, set it to m (the mean)
  #df['Duration'] > 120 selects every row that is too high, so they are all replaced in one step
df.loc[df['Duration'] > 120, 'Duration'] = m

df

In [None]:
#alternatively we could drop the row if it exceeds a certain value in a specified column
#reset the value again
df.loc[7,'Duration'] = 450

#collect the row numbers of every row where Duration is higher than 120 ...
rows_to_drop = df[df['Duration'] > 120].index

#... and drop them all in one step - this avoids deleting rows from df while looping over it
df.drop(rows_to_drop, inplace = True)

df

### Removing Duplicates

In [None]:
#look at the dataframe above, rows 11 & 12 are duplicated

#This code returns True for every row that is a repeat of an earlier row, otherwise it returns False
  #(by default the first occurrence is not marked - only the later copies are)
df.duplicated()

In [None]:
#This code will drop duplicated rows
  #inplace = True changes df itself, so the result does not need to be assigned to a variable
df.drop_duplicates(inplace = True)
df

In [None]:
#Save the dataframe as a new dataset called cleandata.csv
#index = False is used to ensure that the row index values are not included in the dataset
df.to_csv('cleandata.csv', index = False)