In [1]:
import pandas as pd
import numpy as np

In [2]:
# Session 3 worksheet notes

# In this worksheet, we'll go through loops and conditionals. Loops allow us to go through things which are 'iterable' and
# perform repeated actions on each element of the iterable. The definition for iterable is a bit circular because
# an iterable is something we can loop over, but maybe an example will help. We can iterate over elements in a list,
# keys/values/items in a dictionary, rows/columns in a dataframe etc.

# We will also be looking at conditionals. Conditionals allow us to execute something based on a set of conditions.
# To keep things simple, we'll largely be thinking of conditionals in terms of using them to make selections from dataframes,
# and using them to determine what actions to apply in certain instances. For instance, in a dataframe of children
# in need, we might want to select only rows where the age is above, say 16. We can do this with a conditional.
# We might also want to make a new column to say yes if a child is over 16, and no if they are under.
# We can use a conditional for this too.

# We'll look at loops first. We start a loop using 'for', then we give a name for the loop variable; this can be anything, but
# it's good practice to use something that makes sense in context because, just like the argument of a function,
# it is the name each element will be called by inside the loop. We then use 'in', specify which object
# we want to iterate over, and follow this up with a colon to start the loop.
# Everything we want inside the loop should be indented; when the indent stops, the loop stops.

# Let's look at a few examples.

In [None]:
# Let's make a list to practise on.
numbers = [1, 2, 3, 4, 5]
# When we iterate over a list, the loop body runs once for each element in the list.
# 'for' tells Python we want to start a loop, 'number' is the name each element
# goes by inside the loop, and 'numbers' is the list the elements are taken from.

# General pattern:
# for something in thing:
#   do process on something
# Applied to this list:
# for number in numbers:
#   do something to number
for number in numbers:
    print(number)
# The loop goes through each item in the list and prints them one by one, in list order.

In [None]:
import time
# list(range(10)) would give 0-9; list(range(1, 11)) below gives 1-10,
# because range includes its start value but excludes its end value.
numbers = list(range(1, 11))
for number in numbers:
    print(number)
    # Pause for one second after each print so the iterations can be watched one at a time.
    time.sleep(1)

In [None]:
# The first five lowercase letters, one single-character string per element.
alphabet = list('abcde')
for letter in alphabet:
    # Runs once per letter, printing each on its own line.
    print(letter)

In [None]:
# Character range function
def range_char(start, stop):
    """Return a generator of every character from `start` to `stop`, inclusive.

    Both arguments are converted with `ord`, so each must be a single
    character. Characters are produced in code-point order; when `start`
    comes after `stop` the generator is empty.
    """
    first_code = ord(start)
    last_code = ord(stop)
    # `+ 1` because range() excludes its end value, but `stop` itself should be produced.
    return (chr(code) for code in range(first_code, last_code + 1))


# Example run
for character in range_char("a", "g"):
    print(character)

In [None]:
import time
# A string is iterable too: looping over it yields one character at a time.
word = 'abcdefghijk'
for character in word:
    print(character)
    time.sleep(0.5)
    # The time.sleep line above pauses for half a second after each print, slowing the loop down.

In [None]:
# A loop body can hold more than one action: for every number,
# print it multiplied by 1 and then multiplied by 2.
for number in numbers:
    for factor in (1, 2):
        print(factor * number)

In [None]:
# Nested loops: the inner loop runs through the whole alphabet for each number,
# so every number is printed next to every letter.
for number in numbers:
    for letter in alphabet:
        pairing = f'{number} {letter}'
        print(pairing)

In [None]:
numbers = [1, 2, 3, 4, 5]
letters = ['a','b','c']
# Every number joined to every letter. The first `for` is the outer loop and the
# second is the inner loop, so the order is 1a, 1b, 1c, 2a, ...
letter_numbers = [
    f'{number}{letter}'
    for number in numbers
    for letter in letters
]
print(letter_numbers)


In [None]:
names = ['Moloch', 'Belial', 'Mulciber']
# A loop can unpack more than one name per pass. For a list, 'enumerate' makes this easy: it numbers
# the list values in order, so we get the index position (recall the worksheet on lists)
# and the value stored at that position at the same time.
# Here it is used to walk through the list by index position and build a new value from the original one.
# It is customary to use i, j, k as the names for loop counters, just as they are used in Physics for vectors.
for i, name in enumerate(names):
    names[i] = f'{name} has fallen.'
# In essence: the entry of names at index i is replaced by the original string plus some extra text.
print(names)

In [None]:
# In this cell, I've initialised a list.
# Write two loops now, one that prints each element from the list, and one that uses enumerate
# to update the string in each position to read...
# '<original value> is in index position <n>'
# Ensure you choose a suitable name for your loop variables.

loop_me = ['Beatrice', 'Virgil', 'Minos']

# Loop 1: print each element exactly as it was initialised.
for name in loop_me:
    print(name)

# Loop 2: enumerate already hands us the position as `i`, so there is no need to
# search the list again with loop_me.index(name) on every pass.
for i, name in enumerate(loop_me):
    loop_me[i] = f'{name} is in index position {i}'
print(loop_me)

In [None]:
# If we are working with a lot of data from different sources in the form of dataframes,
# it can be really useful to store them in dictionaries.
# If you use the pandas read_excel method on an Excel workbook with multiple sheets
# (passing sheet_name=None), it returns them in a dictionary, for instance. This means it can be
# really useful to be able to iterate through dictionaries. Let's have a look:

import pandas as pd

df_1 = pd.DataFrame({'col 1': ['foo', 'bar', 'baz'],
                     'col 2': [1, 2, 3]})

df_2 = pd.DataFrame({'col 1': ['Thing 1', 'Thing 2', 'Thing 3'],
                     'col 2': ['Green eggs', 'Ham', 'Sam I am']})

df_dict = {'df_1' : df_1,
           'df_2' : df_2,}

# A really useful way to iterate through dictionaries is to access the items,
# allowing us to use both the key and value in our loops.
for key, value in df_dict.items():
    print(key)
    print(value)
print("  ") # just putting an empty row in output for clarity
# We can also iterate through rows in a dataframe, although this should be avoided where
# possible as it's very inefficient and can make your code run very slowly.
for row in df_1.index:
    # .loc[row, column] reads the cell in a single lookup instead of chaining
    # df_1['col 2'][row], which selects the column first and then indexes into it.
    value = df_1.loc[row, 'col 2'] * 3.5
    print(value)

In [None]:
# Using df_dict, defined earlier, loop over the dictionary and print the first row of each dataframe.
for name, frame in df_dict.items():
    print(name)
    # .iloc[0] selects by position, so this is the first row whatever the index labels are.
    first_row = frame.iloc[0]
    print(first_row)

In [None]:
# Sometimes it is handy to build a dataframe (or a new dataframe column) from a
# dictionary that was filled inside a loop. The two examples here would not be used in
# the real world, as there are better methods, but they provide a useful illustration.

# Example 1
x = 1
number_list = []
for i in range(10):
    # An f-string prints each i, x pair as the loop runs.
    print(f'{i}, {x}')
    number_list.append(x)
    # Add the entry before the one just appended. On the first pass i - 1 is -1,
    # which points at the last entry, i.e. the value that was just appended.
    x += number_list[i - 1]

# Wrap the list in a dictionary whose key becomes the column name,
# build a dataframe from that dictionary with pd.DataFrame, then print it.
dict1 = {'numbers': number_list}
df1 = pd.DataFrame(dict1)
print(df1)
# print(pd.DataFrame({'numbers': number_list})) would do the last three lines in one.

In [None]:
# Example 2
answers = []
times_tables = {}
# A loop can sit INSIDE another loop. Here the inner loop is written as a list
# comprehension: for each i it runs j from 1 to 10 and collects i * j, and only
# when it has finished does the outer loop move on to the next i.
for i in range(1, 11):
    times_tables[i] = [i * j for j in range(1, 11)]

# Each dictionary key becomes a column of the dataframe.
multiplication_table = pd.DataFrame(times_tables)
# Shift the row labels up by one so the rows run 1-10 rather than 0-9.
multiplication_table.index = multiplication_table.index + 1
# In Jupyter the last expression in a cell is displayed without needing print.
multiplication_table

In [17]:
# Exercise: using the examples above, use a loop to populate a dictionary, then turn that
# dictionary into a dataframe of every possible sandwich made from the two lists below.
# Don't think too hard; it doesn't matter if combinations are repeated.
# These three lines at the end format the dataframe:
# combinations_df = (pd.DataFrame(combinations))
# combinations_df['topping'] = toppings
# combinations_df.set_index('topping')

spreads = ['butter', 'jam', 'peanut butter']
toppings = ['ham', 'cheese', 'sardines']
# Empty containers for the loop to fill in.
filling, combinations = [], {}

In [None]:
# First step towards the sandwich exercise: print every spread/topping pairing
# with a nested loop, before storing anything.
spreads = ['butter', 'jam', 'peanut butter']
toppings = ['ham', 'cheese', 'sardines']
for spread in spreads:
    for topping in toppings:
        print(f'{spread} {topping}')

In [None]:
# Using the examples above, in this cell use a loop to populate a dictionary that you'll then use to make a dataframe of
# all the possible combinations of sandwiches from the following two lists. Don't think too hard, it doesn't matter
# if combinations are repeated.
# At the end, three lines of code format the dataframe: build it from the dictionary,
# add the toppings as a column, then make that column the row index.

spreads = ['butter', 'jam', 'peanut butter']
toppings = ['ham', 'cheese', 'sardines']
# filling collects every sandwich in one flat list; combinations maps each spread
# to the list of sandwiches made with it (one dataframe column per spread).
filling = []
combinations = {}
for spread in spreads:
    sandwiches = []
    for topping in toppings:
        sandwiches.append(f'{spread} & {topping}')
    combinations[spread] = sandwiches
    filling.extend(sandwiches)

# Each dictionary key becomes a column; the rows line up with `toppings`.
combinations_df = pd.DataFrame(combinations)
combinations_df['topping'] = toppings
# set_index returns a new dataframe, so the result has to be assigned back.
combinations_df = combinations_df.set_index('topping')
combinations_df

In [None]:
# Start of notes taken by MN during the session.
## See the screenshots taken by MN from Session 3; they will help generally.
import pandas as pd
import numpy as np
import os

# read_csv accepts a URL as well as a local path; this reads the CSV straight from GitHub.
filename = 'https://raw.githubusercontent.com/data-to-insight/ERN-sessions/main/data/1980%202023%20average%20house%20prices.csv'
df = pd.read_csv(filename)
print(df)

In [None]:
import pandas as pd
import numpy as np

# Date-format notes from the session:
#   01/01/2000 is described by the format string %d/%m/%Y
#   strftime turns a date back into text using the same codes
#   %b  month name, abbreviated, e.g. Jan
#   %B  full month name, e.g. January
# N.B. errors='coerce' is used here: a value that does not match the format becomes NaT
# instead of raising an error.
df['Period'] = pd.to_datetime(df['Period'], format='%Y-%m', errors='coerce')

# I think Will said 'normalize' puts the time part of a date/time value to 00:00 (midnight).
data_age = pd.to_datetime('today').normalize() - df['Period']

# Dividing by a 365-day Timedelta gives the age in fractional years (leap days are ignored,
# so this is an approximation). np.trunc drops the fraction just as astype('int') did, and the
# nullable 'Int64' dtype lets rows whose Period was coerced to NaT become <NA>;
# a plain astype('int') raises as soon as one such row exists.
df['Age of Data (Years)'] = np.trunc(data_age / pd.Timedelta('365 days')).astype('Int64')
# Session note: with some other library versions dividing by np.timedelta64(1, 'Y') works here instead.
print(df.head(1))

In [None]:
import pandas as pd
import numpy as np
# Read the ChildIdentifiers CSV straight from its GitHub URL into a second dataframe.
fileraw = 'https://raw.githubusercontent.com/data-to-insight/ERN-sessions/main/data/ChildIdentifiers.csv'
df2 = pd.read_csv(fileraw)
print(df2)

In [None]:
# This code was posted in the chat by Will; it relates to the ChildIdentifiers.csv dataset.
# Keep only the three columns we need. .copy() makes df2 an independent dataframe,
# so the column assignments below are plainly not writes into a slice of the original.
df2 = df2[['LAchildID', 'PersonBirthDate', 'GenderCurrent']].copy()
# errors='coerce' turns any birth date that does not match the format into NaT.
df2['PersonBirthDate'] = pd.to_datetime(df2['PersonBirthDate'], format='%Y-%m-%d', errors='coerce')
age = pd.to_datetime('today').normalize() - df2['PersonBirthDate']
# Age in whole years, approximated with 365-day years (leap days ignored). np.trunc drops the
# fraction just as astype('int') did; the nullable 'Int64' dtype keeps rows with a missing
# birth date as <NA>, where a plain astype('int') would raise on them.
df2['Age'] = np.trunc(age / pd.Timedelta('365 days')).astype('Int64')
print(df2)

In [None]:
# Lookup from the numeric gender code to a readable label; codes 0 and 9 share one label.
gender_map = {1: 'Male', 2: 'Female', 0: 'unknown', 9: 'unknown'}
# map() swaps each code for its label. Values that are not keys of the dictionary become NaN,
# so running this cell a second time (when the column already holds labels) blanks the column.
recoded_gender = df2['GenderCurrent'].map(gender_map)
df2['GenderCurrent'] = recoded_gender
print(df2)

In [25]:
# introducing more operators
# ==  equal to
# !=  not equal to
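# ~   NOT  (element-wise, for pandas conditions)
# &   AND  (element-wise; wrap each comparison in parentheses, e.g. (a > 1) & (b < 2))
# |   OR   (element-wise; wrap each comparison in parentheses, e.g. (a > 1) | (b < 2))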
# > greater than
# < less than
# >= greater than or equal to
# <= less than or equal to

# % is a modulo operator. The % gives the remainder after division, e.g.
# 21 % 4 gives output 1.  Mathematically this is saying:   21 ≡ 1 (mod 4)

# // is the floor division operator.  The // rounds the result down to the nearest whole number, e.g.
# 21 // 4 gives output 5.  Mathematically this is saying:   floor(21 / 4) = 5 (the remainder, 1, is discarded)

In [None]:
# A condition on a column gives one True/False per row ...
over_18_cond = df2['Age'].gt(18)
# ... and indexing with it keeps only the rows marked True (strictly older than 18).
# In one step this would be: sliced_df2 = df2[df2['Age'] > 18]
sliced_df2 = df2.loc[over_18_cond]
print(sliced_df2)

In [None]:
# Using the new operators introduced above.
# Each comparison is evaluated on its own first. & and | bind more tightly than > and ==,
# so writing df2['Age'] > 18 | (...) without parentheses would compute 18 | (...) first
# and then compare Age against that result.
is_over_18 = df2['Age'] > 18
is_male = df2['GenderCurrent'] == 'Male'

# OR: rows where at least one of the two conditions holds.
over_18_or_male_condition = is_over_18 | is_male
over_18_or_male = df2[over_18_or_male_condition]
# AND: rows where both conditions hold.
over_18_male_condition = is_over_18 & is_male
over_18_male = df2[over_18_male_condition]
print(over_18_male)
print(over_18_or_male)

In [None]:
# Exercise
# print the dataframe where
# a) everyone is 15 or under OR male
# b) everyone is 15 or over OR female

# Lower-case the gender labels first, so every comparison below uses the same spelling
# and the cell gives the same answer when it is run a second time.
df2['GenderCurrent'] = df2['GenderCurrent'].str.lower()
is_male = df2['GenderCurrent'] == 'male'
is_female = df2['GenderCurrent'] == 'female'

# Each comparison gets its own parentheses: | binds more tightly than <= and >=,
# so df2['Age'] <= 15 | (...) would evaluate 15 | (...) before the comparison.
condition_1 = (df2['Age'] <= 15) | is_male
df2_1 = df2[condition_1]
print(df2_1)
condition_2 = (df2['Age'] >= 15) | is_female
df2_2 = df2[condition_2]
print(df2_2)
# ~ inverts a condition: rows that are NOT 15 or over.
condition_3 = ~(df2['Age'] >= 15)

# Other ways of writing conditions on a text column.
condition_4 = df2['GenderCurrent'].str.lower() == 'male'
condition_5 = is_male | is_female
condition_6 = df2['GenderCurrent'].isin(['male','female'])
# na=False counts a missing label as "no match" rather than leaving a missing value in the mask.
# NOTE(review): 'female' also contains the letter 'm', so this keeps both male and female rows;
# confirm that is what was intended. The result is a filtered dataframe, not a condition.
condition_7 = df2[df2['GenderCurrent'].str.contains('m', na=False)]
print(condition_7)
               

In [None]:
# Session 3 Group Work
# Work out how to use zip() to collect elements from two lists together and use a loop to print them
# element-wise, so, if I had 1, 2, 3, 4, 5 and a, b, c, d, e stored as separate lists, my loop would print a1, b2, etc.

numbers = [1, 2, 3, 4, 5]
letterz = ['a', 'b', 'c', 'd', 'e']
# zip pairs the lists up by position: (1, 'a'), (2, 'b'), ...
# It can be looped over directly, so there is no need to turn it into a list first.
zippo = zip(numbers, letterz)
for number, letter in zippo:
    # Letter first, then number, to match the a1, b2, ... format asked for.
    label = f"{letter}{number}"
    print(label)

In [None]:
# Initialise two lists, one of first names and one of surnames, for everyone in the group,
# then use nested loops to print every first name / surname combination.

fore = ['Fred', 'Clive', 'Paul', 'Tim']
sur = ['Smith', 'Jones', 'Taylor', 'Dawson']
# zip only pairs names that share a position (four pairs) ...
full = zip(fore, sur)
print(list(full))
# ... whereas nested loops pair every first name with every surname (sixteen lines).
for first_name in fore:
    for surname in sur:
        print(f'{first_name} {surname}')

In [None]:

# Take a list of numbers, square each one, and collect the squares in a new list.
numbers = [4, 5, 7, 11]
# The comprehension is the loop: it visits each number in order and stores number ** 2.
squares = [number ** 2 for number in numbers]
print(squares)

In [None]:
# Add up all the whole numbers from 1 to 100 with a loop and keep the answer in a variable.
MN_list = list(range(1, 101))  # range stops before 101, so the last value is 100
MN_sum = 0
for value in MN_list:
    # Running total: add the current number to everything added so far.
    MN_sum += value
print(MN_sum)