In [1]:
import pandas as pd
import numpy as np

In [2]:
# Session 3 worksheet notes

# In this worksheet, we'll go through loops and conditionals. Loops allow us to go through things which are 'iterable' and
# perform repeated actions on each element of the iterable. The definition for iterable is a bit circular because
# an iterable is something we can loop over, but maybe an example will helps. We can iterate over elements in a list,
# keys/values/items in a dictionary, rows/columns in a dataframe etc.

# We will also be looking at conditionals. Conditionals allow us to execute something based on a set of conditions.
# To keep things simple, we'll largely be thinking of conditionals in terms of using them to make selections from dataframes,
# and using them to determine what actions to apply in certain instances. For instance, in a dataframe of children
# in need, we might want to select only rows where the age is above, say 16. We can do this with a conditional.
# We might also want to make a new column to say yes if a child is over 16, and no if they are under.
# We can use a conditional for this too.

# We'll look at loops first. We start a loop using 'for', then we give a name for the iterable, this can be anything but
# it's good practice to use something that makes sense in context as, just like the argument of a function,
# when it's used in the loop, it will be called by the name we give it, we then use 'in' and specify what object
# want to iterate over, and follow this up with a colon to start the loop.
# Everything we want inside the loop should be indented, when the indent stops, the loop stops.

# Let's look at a few examples.

In [22]:
# Lets make a list
numbers = [1, 2, 3, 4, 5]
# when we iterate over a list, for each element in the list, we run the loop once.
# Here we are saying 'for' to tell Python we want to start a loop, we are
# saying we will call the iterables number, and we will take them from 'numbers'.

# for something in thing:
#   do process on something
# for number in numbers:
#   do something to number
for number in numbers:
    print(number)
# the loop will go through each item in the loop and print them one by one

1
2
3
4
5


In [25]:
import time
# numbers = list(range(10)) gives list of 0-9, next row gives list of 1-10
numbers = list(range(1, 11))
for number in numbers:
    print(number)
    # time.sleep(1)

1
2
3
4
5
6
7
8
9
10


In [32]:
alphabet = ['a','b','c','d','e']
for letter in alphabet:
    print(letter)

a
b
c
d
e


In [30]:
# Character range function
def range_char(start, stop):
    return (chr(n) for n in range(ord(start), ord(stop) + 1))
        
# Example run
for character in range_char("a", "g"):
    print(character)

a
b
c
d
e
f
g


In [43]:
import time
word = 'abcdefghijk'
for character in word:
    print(character)
    time.sleep(0.5)
    # the 'time' line above slows down the iterations to 0.5 seconds per loop

a
b
c
d
e
f
g
h
i
j
k


In [39]:
for number in numbers:
    print(1*number)
    print(2*number)

1
2
2
4
3
6
4
8
5
10
6
12
7
14
8
16
9
18
10
20


In [40]:
for number in numbers:
    for letter in alphabet:
        print(f'{number} {letter}')

1 a
1 b
1 c
1 d
1 e
2 a
2 b
2 c
2 d
2 e
3 a
3 b
3 c
3 d
3 e
4 a
4 b
4 c
4 d
4 e
5 a
5 b
5 c
5 d
5 e
6 a
6 b
6 c
6 d
6 e
7 a
7 b
7 c
7 d
7 e
8 a
8 b
8 c
8 d
8 e
9 a
9 b
9 c
9 d
9 e
10 a
10 b
10 c
10 d
10 e


In [51]:
numbers = [1, 2, 3, 4, 5]
letters = ['a','b','c']
letter_numbers = []
for number in numbers:
    for letter in letters:
        letter_numbers.append(f'{number}{letter}')
print(letter_numbers)


['1a', '1b', '1c', '2a', '2b', '2c', '3a', '3b', '3c', '4a', '4b', '4c', '5a', '5b', '5c']


In [4]:
names = ['Moloch', 'Belial', 'Mulciber']
# We can also use more than one iterable, for going through a list 'enumerate' makes this easy as it 'enumerates'
# or gives a number to list values in order so we can access them by index (recall back to the worksheet on lists),
# and also allows us to use the value inside the list.
# Here I've used it to iterate through values in the list by index position, and use the original value to create a new value.
for i, name in enumerate(names):
# It is customary to use i, j, k as the names for iterables when iterating through numbers, just as we use them in Pysics for vectors.
    names[i] = f'{name} has fallen.'
# This says, in essence the value of the list names with index i is updated to be the original string, plus some extras.
print(names)

['Moloch has fallen.', 'Belial has fallen.', 'Mulciber has fallen.']


In [5]:
# In this cell, I've initialised a list.
# Write two loops now, one that prints each element from the list, and one that uses enumerate
# to update the string in each position to read...
# '<original value> is in index position <n>'
# Ensure you choose a suitable name for your iterators

loop_me = ['Beatrice', 'Virgil', 'Minos']
for i, name in enumerate(loop_me):
    loop_me[i] = f'{name} is in index position {loop_me.index(name)}'
print(loop_me)

['Beatrice is in index position 0', 'Virgil is in index position 1', 'Minos is in index position 2']


In [6]:
# If we are working with a lot of data form different sources in the form of dataframes,
# it can be really useful to to store them in dictionaries.
# If you use the pandas read_excel method on an excel workbook with multiple sheets,
# it'll store them in a dicitonary, for instance. This means that it can be really useful to
# be able to iterate through dictionaries. Let's have a look:

import pandas as pd

df_1 = pd.DataFrame({'col 1': ['foo', 'bar', 'baz'],
                     'col 2': [1, 2, 3]})

df_2 = pd.DataFrame({'col 1': ['Thing 1', 'Thing 2', 'Thing 3'],
                     'col 2': ['Green eggs', 'Ham', 'Sam I am']})

df_dict = {'df_1' : df_1,
           'df_2' : df_2,}

# A really useful way to iterate through dictionaries is to access the items,
# allowing us to use both the key and value in our loops.
for key, value in df_dict.items():
    print(key)
    print(value)
print("  ") # just putting an empty row in output for clarity
# We can also iterate through rows in a dataframe, although this should be avoided where
# possible as it's very inefficient and can make your code run very slowly.
for row in df_1.index:
    value = df_1['col 2'][row] * 3.5
    print(value)

df_1
  col 1  col 2
0   foo      1
1   bar      2
2   baz      3
df_2
     col 1       col 2
0  Thing 1  Green eggs
1  Thing 2         Ham
2  Thing 3    Sam I am
  
3.5
7.0
10.5


In [7]:
# Using df_dict, defined earlier, write a loop that iterates through the dictionary, printing out the first row of each dataframe.
# NEEDS COMPLETING
for key, value in df_dict.items():
    print(key)
    print(value.iloc[0])

df_1
col 1    foo
col 2      1
Name: 0, dtype: object
df_2
col 1       Thing 1
col 2    Green eggs
Name: 0, dtype: object


In [8]:
# There also might be times where it's useful to make a dataframe or a new column in a dataframe by
# making a dictionary in a loop. The 2 examples here would not be used in the real world as there arebetter methods, but
# they provide a useful illustration.

# Example 1
x = 1
number_list = []
for i in list(range(10)):
    print(f'{i}, {x}') # we can print out the i,x pairs using an f-string, like this
    number_list.append(x)
    x = (x + number_list[i-1])
# or we can construct the dictionary {'numbers':number_list},
# then use pd.Dataframe to construct a dataframe from the dictionary {'numbers':number_list},
# then print our dataframe.
dict1 = {'numbers':number_list}
df1 = pd.DataFrame(dict1)
print(df1)
# print(pd.DataFrame({'numbers':number_list}))   puts the last 3 code lines into 1 line of code.

0, 1
1, 2
2, 3
3, 5
4, 8
5, 13
6, 21
7, 34
8, 55
9, 89
   numbers
0        1
1        2
2        3
3        5
4        8
5       13
6       21
7       34
8       55
9       89


In [9]:
# Example 2
answers = []
times_tables = {}
# We can do a loop INSIDE another loop! Don't think too hard about it, its the same,
# but it runs through and finishes, then the outside loop moves on.
for i in list(range(1,11)):
    for j in list(range(1,11)):
        answers.append(i * j)
    times_tables[i] = answers
    answers = []

multiplication_table = pd.DataFrame(times_tables) # Making the dictionary into a dataframe.
multiplication_table.index += 1 # shifting the index up by 1 to start at row 1 (not row 0) so the table is cleaner.
multiplication_table # In Jupyter we don't HAVE to use print to display a variable.

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
1,1,2,3,4,5,6,7,8,9,10
2,2,4,6,8,10,12,14,16,18,20
3,3,6,9,12,15,18,21,24,27,30
4,4,8,12,16,20,24,28,32,36,40
5,5,10,15,20,25,30,35,40,45,50
6,6,12,18,24,30,36,42,48,54,60
7,7,14,21,28,35,42,49,56,63,70
8,8,16,24,32,40,48,56,64,72,80
9,9,18,27,36,45,54,63,72,81,90
10,10,20,30,40,50,60,70,80,90,100


In [124]:
# Using the examples above, in this cell use a loop to populate a dictionary that you'll then use to make a dataframe of
# all the possible combinations of sandwiches from the following two lists. Don't think too hard, it doesn't matter
# if combinations are repeated. You'll want the following three lines of the code at the end to format the dataframe:
# combinations_df = (pd.DataFrame(combinations)) 
# combinations_df['topping'] = toppings
# combinations_df.set_index('topping')

spreads = ['butter', 'jam', 'peanut butter']
toppings = ['ham', 'cheese', 'sardines']
filling = []
combinations = {}

In [89]:
# Using the examples above, in this cell use a loop to populate a dictionary that you'll then use to make a dataframe of
# all the possible combinations of sandwiches from the following two lists. Don't think too hard, it doesn't matter
spreads = ['butter', 'jam', 'peanut butter']
toppings = ['ham', 'cheese', 'sardines']
for spread in spreads:
    for topping in toppings:
        print(spread,topping)

butter ham
butter cheese
butter sardines
jam ham
jam cheese
jam sardines
peanut butter ham
peanut butter cheese
peanut butter sardines


In [135]:
# Using the examples above, in this cell use a loop to populate a dictionary that you'll then use to make a dataframe of
# all the possible combinations of sandwiches from the following two lists. Don't think too hard, it doesn't matter
# if combinations are repeated.
# At the end, use the following three lines of code  to format the dataframe
# combinations_df = (pd.DataFrame(combinations)) 
# combinations_df['topping'] = toppings
# combinations_df.set_index('topping')

spreads = ['butter', 'jam', 'peanut butter']
toppings = ['ham', 'cheese', 'sardines']
filling = []
# filling is an empty list at this point, we are going to populate it
for spread in spreads:
    for topping in toppings:
        filling.append(f'{spread} & {topping}')
print(type(filling))
# we now turn filling from a list into a Dataframe
combinations = pd.DataFrame(filling)
combinations

<class 'list'>


Unnamed: 0,0
0,butter & ham
1,butter & cheese
2,butter & sardines
3,jam & ham
4,jam & cheese
5,jam & sardines
6,peanut butter & ham
7,peanut butter & cheese
8,peanut butter & sardines


In [11]:
# start of notes taken by MN during Session
## see screenshots taken by MN from Session 3, they will help generally
import pandas as pd
import numpy as np
import os

filename = 'https://raw.githubusercontent.com/data-to-insight/ERN-sessions/main/data/1980%202023%20average%20house%20prices.csv'
df = pd.read_csv(filename)
print(df)

               Name   Period  House price index All property types  \
0    United Kingdom  1980-01                                 10.11   
1    United Kingdom  1980-02                                 10.11   
2    United Kingdom  1980-03                                 10.11   
3    United Kingdom  1980-04                                 10.51   
4    United Kingdom  1980-05                                 10.51   
..              ...      ...                                   ...   
518  United Kingdom  2023-03                                148.20   
519  United Kingdom  2023-04                                148.90   
520  United Kingdom  2023-05                                149.50   
521  United Kingdom  2023-06                                151.20   
522  United Kingdom  2023-07                                152.00   

     Average price All property types  \
0                               19273   
1                               19273   
2                               1927

In [12]:
import pandas as pd
import numpy as np

# 01/01/2000
# %d/%m/%Y
# strftime
# %b month but with local shortening e.g. jan
# %B full month name e.g. January
# N.B. coerce function is used here
# I think Will said the 'normalize' function puts all date/time values to 00:00 (12am midnight)
df['Period'] = pd.to_datetime(df['Period'], format='%Y-%m', errors='coerce')

df['Age of Data (Years)'] = pd.to_datetime('today').normalize() - df['Period']

df['Age of Data (Years)'] = (df['Age of Data (Years)'] / pd.Timedelta('365 days')).astype('int')
# in other versions of Python this would work here instead:
# df['Age of Data (Years)'] = (df['Age of Data (Years)'] / np.timedelta64(1, 'Y')).astype('int')
print(df.head(1))

             Name     Period  House price index All property types  \
0  United Kingdom 1980-01-01                                 10.11   

   Average price All property types  \
0                             19273   

   Percentage change (monthly) All property types  \
0                                            3.94   

   Percentage change (yearly) All property types  Age of Data (Years)  
0                                          28.59                   44  


In [13]:
import pandas as pd
import numpy as np
fileraw = 'https://raw.githubusercontent.com/data-to-insight/ERN-sessions/main/data/ChildIdentifiers.csv'
df2 = pd.read_csv(fileraw)
print(df2)

     Unnamed: 0        LAchildID            UPN  FormerUPN  UPNunknown  \
0             0  RND000215205141  A850728973744        NaN         NaN   
1             1  RND000824303014  A141396438491        NaN         NaN   
2             2  RND000750143123  A929946861554        NaN         NaN   
3             3  RND000909164501  A612330267292        NaN         NaN   
4             4  RND000382171815  A604459366806        NaN         NaN   
..          ...              ...            ...        ...         ...   
327         327  RND000112711501  A465246916125        NaN         NaN   
328         328  RND000513120794  A540014111973        NaN         NaN   
329         329  RND000541643134  A549582689058        NaN         NaN   
330         330  RND000404939452  A889492349196        NaN         NaN   
331         331  RND000589802835  A877624860226        NaN         NaN   

    PersonBirthDate ExpectedPersonBirthDate  GenderCurrent PersonDeathDate  
0        2019-12-06               

In [14]:
# this code posted in chat by will, it relates to the ChildIdentifiers.csv dataset
df2 = df2[['LAchildID', 'PersonBirthDate', 'GenderCurrent']]
df2['PersonBirthDate'] = pd.to_datetime(df2['PersonBirthDate'], format='%Y-%m-%d', errors='coerce')
df2['Age'] = pd.to_datetime('today').normalize() - df2['PersonBirthDate']
df2['Age'] = (df2['Age'] / pd.Timedelta('365 days')).astype('int')
print(df2)

           LAchildID PersonBirthDate  GenderCurrent  Age
0    RND000215205141      2019-12-06              1    4
1    RND000824303014      2011-04-27              9   12
2    RND000750143123      2017-06-06              1    6
3    RND000909164501      2014-10-03              0    9
4    RND000382171815      2019-09-25              2    4
..               ...             ...            ...  ...
327  RND000112711501      2010-07-07              2   13
328  RND000513120794      2018-08-14              2    5
329  RND000541643134      2021-12-09             51    2
330  RND000404939452      2013-07-23              2   10
331  RND000589802835      2021-10-25              9    2

[332 rows x 4 columns]


In [15]:
gender_map = {1:'Male',
              2:'Female',
              0:'unknown',
              9:'unknown',}
df2['GenderCurrent'] = df2['GenderCurrent'].map(gender_map)
print(df2)

           LAchildID PersonBirthDate GenderCurrent  Age
0    RND000215205141      2019-12-06          Male    4
1    RND000824303014      2011-04-27       unknown   12
2    RND000750143123      2017-06-06          Male    6
3    RND000909164501      2014-10-03       unknown    9
4    RND000382171815      2019-09-25        Female    4
..               ...             ...           ...  ...
327  RND000112711501      2010-07-07        Female   13
328  RND000513120794      2018-08-14        Female    5
329  RND000541643134      2021-12-09           NaN    2
330  RND000404939452      2013-07-23        Female   10
331  RND000589802835      2021-10-25       unknown    2

[332 rows x 4 columns]


In [16]:
# introducing more operators
# ==  equal to
# !=  not equal to
# ~   NOT
# &   AND
# |   OR
# > greater than
# < less than
# >= greater than or equal to
# <= less than or equal to
# % is a modulo operator, it gives the remainder when i is divided by e.g. 2 in this example

over_18_cond = df2['Age'] > 18
sliced_df2 = df2[over_18_cond]
# i.e. sliced_df2 = df2[df2['Age'] > 18]  if doing in 1 step
print(sliced_df2)

           LAchildID PersonBirthDate GenderCurrent  Age
23   RND000576956056      2004-11-12       unknown   19
45   RND000676691998      2004-12-05       unknown   19
46   RND000702784676      2004-09-15       unknown   19
61   RND000235844862      2004-08-23          Male   19
68   RND000075046261      2004-10-07       unknown   19
69   RND000241874800      2004-08-26          Male   19
71   RND000989010565      2005-02-09       unknown   19
106  RND000859630907      2004-09-10          Male   19
107  RND000751235542      2004-10-28       unknown   19
139  RND000972628432      2004-08-20       unknown   19
196  RND000813949639      2005-01-26        Female   19
197  RND000784759216      2004-09-01          Male   19
227  RND000482971048      2004-11-04        Female   19
261  RND000190463499      2004-12-01        Female   19


In [17]:
# using the new operators introduced above
over_18_or_male_condition = df2['Age'] > 18 | (df2['GenderCurrent'] == 'Male')
over_18_or_male = df2[over_18_or_male_condition]
over_18_male_condition = (df2['Age'] > 18) & (df2['GenderCurrent'] == 'Male')
over_18_male = df2[over_18_male_condition]
print(over_18_male)
print(over_18_or_male)

           LAchildID PersonBirthDate GenderCurrent  Age
61   RND000235844862      2004-08-23          Male   19
69   RND000241874800      2004-08-26          Male   19
106  RND000859630907      2004-09-10          Male   19
197  RND000784759216      2004-09-01          Male   19
           LAchildID PersonBirthDate GenderCurrent  Age
0    RND000215205141      2019-12-06          Male    4
1    RND000824303014      2011-04-27       unknown   12
2    RND000750143123      2017-06-06          Male    6
3    RND000909164501      2014-10-03       unknown    9
4    RND000382171815      2019-09-25        Female    4
..               ...             ...           ...  ...
327  RND000112711501      2010-07-07        Female   13
328  RND000513120794      2018-08-14        Female    5
329  RND000541643134      2021-12-09           NaN    2
330  RND000404939452      2013-07-23        Female   10
331  RND000589802835      2021-10-25       unknown    2

[325 rows x 4 columns]


In [18]:
# Exercise
# print the dataframe where
# a) everyone is 15 or under OR male
# b) everyone is 15 or over OR female
condition_1 = (df2['Age'] <= 15 | (df2['GenderCurrent'] == 'Male'))
df2_1 = df2[condition_1]
condition_2 = (df2['Age'] >= 15 | (df2['GenderCurrent'] == 'Female'))
df2_2 = df2[condition_2]
condition_3 = ~(df2['Age'] >= 15)

df2['GenderCurrent'] = df2['GenderCurrent'].str.lower()
condition_4 = df2['GenderCurrent'].str.lower() == 'male'
condition_5 = (df2['GenderCurrent'] == 'male') | (df2['GenderCurrent'] == 'female')
condition_6 = df2['GenderCurrent'].isin(['male','female'])
condition_7 = df2[df2['GenderCurrent'].str.contains('m') & df2['GenderCurrent'].notna()]
print(condition_7)
               

           LAchildID PersonBirthDate GenderCurrent  Age
0    RND000215205141      2019-12-06          male    4
2    RND000750143123      2017-06-06          male    6
4    RND000382171815      2019-09-25        female    4
6    RND000852873211      2013-01-19        female   11
7    RND000756924984      2021-04-04        female    2
..               ...             ...           ...  ...
323  RND000044483554      2020-06-16          male    3
324  RND000110305262      2022-06-13          male    1
327  RND000112711501      2010-07-07        female   13
328  RND000513120794      2018-08-14        female    5
330  RND000404939452      2013-07-23        female   10

[153 rows x 4 columns]


In [19]:
# Session 3 Group work
# Work out how to use zip() to collect elements from two lists together and use a loop to print them
# element-wise, so, if I had 1, 2, 3, 4, 5 and a, b, c, d, e stored as seprate lists, my loop would 
# print a1, b2, etc. Initialise two variables, one a list of first names and one a list of surnames of
# everyone in the group. Next, use nested loops to print all the different first/last name combinations.
# Write a for loop that takes a list of numbers, finds the square of each, and adds the squares to a new
# list that starts empty. Write a loop that sums all the numbers up to 100, and assigns the answer to a variable.