# I/O (input / output)

Here we are going to introduce some basic ways of reading and writing
files. This is useful when we need a large input for our program
or when we want to save the results of our calculations to disk.

In [None]:
# We can open a file in Python with the open() function, passing
# the file name as a string as the first argument. In many cases we
# also need to specify the mode in which we want to open the file. For
# now we will only focus on read, write and append, but you are free to
# read about the others.

# We create a new file called temporary_file.txt and specify
# we want to write to it.
new_file = open('temporary_file.txt', 'w')
print(new_file) # files have their own type (object) the _io.TextIOWrapper

In [None]:
# More correctly, our variable is an object (an instance of a class).
# We will not go into the details at the moment; however, we can see
# the functions (methods) available on this object by typing the
# object name followed by a dot and then pressing tab.
# Try this below.

# press tab after the dot
new_file.

In [None]:
# One posibility is the write method lets try it

write_this = "I am writing to a file\n" # the \n is used to create a new line
new_file.write(write_this)

# If you check the file you will see that there is nothing written to it.
# For efficiency reasons, when writing to a file the data is first stored
# in a buffer; only when the buffer limit is reached is everything in
# the buffer written to the file.

In [None]:
# we can force it to write by flushing the buffer
new_file.flush()
# If we inspect the file we will now see that our line has been written.
# The buffer size can be modified when creating the file.

In [None]:
# Finally lets close our file since we are done writting to it
# Closing the file also flushes the buffer automatically
new_file.close()

In [None]:
new_file.write(write_this) # As you can see we can no longer write to it

In [None]:
# Alright lets put everything together
log_file = open('temporary_file.txt', 'w')
for i in range(101):
    log_file.write(str(i) + '\n')
log_file.close()

# We have written the numbers from 0 to 100 to a file, each on a separate line.
# As you can see, I named this file the same as in our previous example; however,
# the line we had before, "I am writing to a file", disappeared. This
# is because the 'w' mode (writing mode) replaces any existing file with
# this name when it opens it.

In [None]:
# But let's say we wrote to a file at some point in our program, we closed it,
# and we now want to write to it again without losing our previous data.
# The way to do this is with the 'a' mode (append).

# We have the same code as before but now I am opening and closing
# the file within the loop

for i in range(101):
    log_file = open('temporary_file.txt', 'w')
    log_file.write(str(i) + '\n')
    log_file.close()
    
# As expected we only get the last number in our file

In [None]:
# We have now change the mode from 'w' to 'a'

for i in range(101):
    log_file = open('temporary_file.txt', 'a')
    log_file.write(str(i) + '\n')
    log_file.close()

# As you can see, even though we are closing the file we are not losing
# what we wrote before. However, it is important to be careful:
# we get the numbers from 0 to 100, but we also get
# an extra 100 at the beginning that was left over from the previous cell.

In [None]:
# Let's now read the content of our file

reading = open('temporary_file.txt', 'r')
lines = reading.readlines()
reading.close() # we are done reading, so we close the file to release it
print(lines)

# With readlines() we get a list of all our lines. However, for large
# files readlines() is not the best choice because it loads the whole
# file into memory; in many cases it is better to read line by line.

In [None]:
# We can also read and write to files using the with keyword

lines = []
with open('temporary_file.txt', 'r') as f:
    for line in f:
        lines.append(line)
print(lines)

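# Here we are accomplishing the same as before; the benefit is that the
# file is read one line at a time, so each line can be processed as it
# arrives instead of loading the whole file into memory at once. (In this
# example we still keep every line in a list only so we can print them.)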

In [None]:
# Since we are reading line by line we can also perform actions on each
# line. For example, the data we are getting is not completely useful: each
# line is a string that still has the '\n' at the end, and we also lost our
# integer type since the numbers are now strings.

# So lets do some operations to treat the data

lines = []
with open('temporary_file.txt', 'r') as f:
    for line in f:
        # The split method is very important when reading a file
        # specially when we have more than one column as it
        # creates a list of values separated by whitespace
        data = line.split()  
        lines.append(int(data[0])) # We take the only element in the list and convert back to int
print(lines)


In [None]:
# With the with keyword the file gets also closed automatically. Let
# see an example of writing to a file

with open('temporary_file.txt', 'w') as f:
    for i in range(101):
        f.write(str(i) + '\n')

### Exercise 1: Using the files inside of data called classroom_1.txt and classroom_2.txt find the average GPA of classroom 1 and classroom 2 and the average age of students with GPA above or equal to 3.8

# String formatting

In [None]:
# Many files have a strict formatting this means that the fields in the
# text file have specific lengths and other programs expect this lengths
# when reading the file so they have to be respected how can we 
# accomplish this?

# We will start with simple strings

word = 'formatting'
print(word)
print(f'{word}') # This is how we specify a formatted string without a format

# Now lets add the format
print(f'{word:>15s}') # the 15 refers to the # of colums we assign
# The > means that we want it right aligned

print(f'{word:<15s}') # Left aligned compared to the first ones it has white spaces to the right up to 15
print(f'{word:^15s}') # Centered

In [None]:
names = ['Michelle', 'Jacob', 'Mary']
last_names = ['Browning', 'Brown', 'Myers']

# Without formatting the last names are not aligned.
# If we save it to a file and try to add a comma in between the name
# and lastname it would not be as easy.

# Also imagine there is a program that reads names and lastnames and it 
# expects that the names from column 1 to 20 and last names from 21 to 40
# in our case say for the first one it would actually read:

# name = 'MichelleBrowning     '
# last_name = '                   '

for i in range(len(names)):
    print(names[i], last_names[i])

In [None]:
# Lets fix this issue by giving 20 spaces to each string
for i in range(len(names)):
    print(f'{names[i]:20s}{last_names[i]:20s}')

In [None]:
# We can also use string formatting for numeric values. For integers there
# is nothing additional to learn: we just use the letter d instead of s
# to indicate that it is an integer value.

for i in range(len(names)):
    print(f'{i:10d}{names[i]:20s}') # by default numeric values are aligned right
# while strings are aligned left lets modify this to look better    

In [None]:
for i in range(len(names)):
    print(f'{i:10d}{names[i]:>20s}')

In [None]:
# In the case of floats, we can also specify the number of decimal places
# using a dot after the specified number of columns and we use an f
import numpy as np

for i in range(10):
    rand_num = 1000000 * np.random.random()
    print(rand_num)

In [None]:
for i in range(10):
    rand_num = 1000000 * np.random.random()
    print(f'{rand_num:12.2f}') # We are specifying it is a float of 12 colunms and 2 of them are decimal places

In [None]:
# Another way of formating is using the format() function

# We define our particular format before
fmt = '{:20s}{:20s}'
for i in range(len(names)):
    print(fmt.format(names[i], last_names[i])) # we call the format function 

In [None]:
# One final note useful for string formatting but that can also be used for 
# other purposes is the unpack operator (*)

# For example we have a list of 3 items and we want to
# create a variable for each item in the list

fruits = ['apple', 'pear', 'orange']

fruit1 = fruits[0]
fruit2 = fruits[1]
fruit3 = fruits[2]
print(fruit1, fruit2, fruit3)

# we can accomplish this in on line
fruit1, fruit2, fruit3 = fruits
print(fruit1, fruit2, fruit3)

# The equal sign unpacks our list as long as we have enough variables to unpack to
# when we are not using the equal sign but instead formatting we can unpack by ourselves with (*)

fmt = '{:10s}{:10s}{:10s}'
print(fmt.format(fruits[0], fruits[1], fruits[2]))

# Now that we have written all the items individually, let's see how unpacking makes it easier

fmt = '{:10s}{:10s}{:10s}'
print(fmt.format(*fruits)) # The star unpacks our list into separate arguments; the list needs at least as many items as the format has fields

### Exercise 2: The PDB file format is a type of format used by the Protein Data Bank to specify atomic coordinates. Read about this type of file https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html. The idea is to create a fake macromolecule with only carbon atoms and then to set the positions of these atoms based on the output of a given 3d function.

In [None]:
# Here we create our x,y and z positions of our saddle function
 
x = np.linspace(-1, 1, 50) # Creates 50 numbers from -1 to 1
y = np.linspace(-1, 1, 50) # Creates 50 numbers from -1 to 1
X, Y = np.meshgrid(x, y) # Creates a 2d grid based on x and y
z = X ** 2 - Y ** 2 # Our output is a 2d array with a z value for every (x, y) pair

# Create the appropriate formatting string 
fmt = ""
# Create the a list of fields with the values that wont change and placeholders for
# the ones that will be modified in the loop
fields = []

# Iterate over the data and print to a file called function.pdb 

<img src="images/pymol_plot.png" width=800 height=800/>

# More ways of looping

In [None]:
# We have seen some ways on how to repeate an operation inside a loop
# now lets go into some details and talk about other ways of looping

animals = ['lion', 'dog', 'cat', 'horse', 'pig']

# We know we can iterate by looping over the list item by item doing

for animal in animals:
    print(animal)

In [None]:
# We can also iterate by index

for i in range(len(animals)):
    print(animals[i])

In [None]:
# What happens if we want to count the animals and output the name as well
# then we can do

for i in range(len(animals)):
    print(i + 1, animals[i]) # The plus one is so it starts at 1 and not at zero

In [None]:
# There is an alternative for doing this using the enumerate() function
# that enumerate the items of our list as it iterates

for i, animal in enumerate(animals): # enumerate returns 2 values so we nee to unpack to 2 variables
    print(i + 1, animal)

In [None]:
# Enumerate might not seem that useful, but it is very useful
# in cases where we do not know the size in advance

# For example lets read the classroom_1.txt and count how many lines it contains
# as well as outputting the line number and GPA as we go along

counter = 0
with open('./data/classroom_1.txt', 'r') as f:
    for line in f: # In principle we do not know the number of lines
        data = line.split()
        print(counter, data[4])
        counter += 1
print('Number of lines: ', counter) # we have 1002 lines

In [None]:
# Let's use enumerate to obtain the same

i = -1 # so that an empty file gives i + 1 == 0 instead of an undefined i
with open('./data/classroom_1.txt', 'r') as f:
    for i, line in enumerate(f): # We use enumerate to give an index to our lines
        data = line.split()
        print(i, data[4])
print('Number of lines: ', i + 1) # we have to add one since enumerate starts at 0

In [None]:
# Lets look at the zip() function now

fruits = ['orange', 'banana', 'pear', 'grape']
colors = ['orange', 'yellow', 'green', 'purple']

zipped = zip(fruits, colors)
print(zipped) # We created a zip object, with the combination of both lists
# we can iterate over it or convert it back to a list

fruits_colors = list(zipped)
print(fruits_colors)

In [None]:
# We added a fruit to our list but not to our colors
# so the sizes do not match
fruits.append('apple')
print(list(zip(fruits, colors))) # We zipped the list but did not get apple because it could not be
# zipped since there was no element in the colors list

In [None]:
# In many cases this can be unwanted behaviour and instead of checking 
# the size of the lists manually we can use the following argument
print(list(zip(fruits, colors, strict=True))) # This will only work in python 3.10 and up

In [None]:
# We can therefore use the zip function to loop

fruits = ['orange', 'banana', 'pear', 'grape']
colors = ['orange', 'yellow', 'green', 'purple']

for fruit, color in zip(fruits, colors):
    print(fruit, color)

In [None]:
# Finally lets look at while loops
# A while loop would look something like this if written as a for


for i in range(1000):
    if i > 100:
        break # break is use to break out of the loop
    else:
        print(i)

In [None]:
# Lets accomplish the same using a while loop

i = -1
while i < 100: # As long as i is less than 100 if it is greater it does not go in
    i += 1
    print(i)

In [None]:
# while loop can really be helpful when we in principle do not
# know how long to iterate for

# Imagine we want the first number whose digits add up to
# more than 50

for i in range(1000): # lets try looping up to 1000
    num = str(i)
    sum_ = 0
    for digit in num:
        sum_ += int(digit)
    if sum_ > 50:
        break
print(i) # as you can see i is 999 as we reach the end of the loop
# but the digits add up to 9 + 9 + 9 = 27 which is less than 50

In [None]:
# As we can see we reached a limitation. In principle we could put
# a very large number in range(), but a while loop is better suited for this:
# it keeps going until the condition we care about is met.
i = 0
sum_ = 0
while sum_ <= 50: # keep looking until the digits add up to MORE than 50
    sum_ = 0
    num = str(i)
    for digit in num:
        sum_ += int(digit)
    i += 1
print(i - 1) # We subtract one since i was already increased after the number we want was checked

### Exercise 3: Using the classroom_1.txt and the classroom_2.txt. Create a new file called all_below_3.txt. The file needs to contain the name and last name (field of 20 columns right aligned) the GPA with 2 decimals (field of 4 columns) and a new column of the class number either 1 or 2 of all the students with GPA less than 3.

# Other data types dictionaries or hash tables

In [None]:
# So far we have dealt with data types such as lists and arrays
# that are indexed by an integer that goes from 0 to the size - 1 of
# the list. However, it would be great if we had a data type
# that can be indexed by a key, meaning that we could have
# something like fruits['red'] and it would give us the red fruits.
# This type of data structure in Python is called a dictionary.

dic = {}
print(type(dic)) # We created an empty dictionary

In [None]:
fruits_dic = {'red' : 'apple', 'green' : 'pear', 'purple' : 'grape'} # Here we have defined 3 keys that map to 3 values

# we can access the values using the keys

print(fruits_dic['red'])

# we can also access all the keys by typing 

keys = list(fruits_dic.keys()) # we use list to convert to a list
print(keys)

# we can also access all the values by typing

values = list(fruits_dic.values())
print(values)

# we can also accces the pair of key and values as follows

items = list(fruits_dic.items())
print(items) # this is very similar to zip and we can use any of this to iterate over the elements

In [None]:
for key, value in fruits_dic.items():
    print(key, value)

In [None]:
len(fruits_dic) # the size of a dictionary is the same as the number of keys

In [None]:
# We can also create a key that maps to a list of values, for example

fruits_dic = {'red' : ['apple', 'cherry'], 'green' : ['pear', 'apple'], 'purple' : ['grape']} # each fruit is its own string in the list
print(fruits_dic)

In [None]:
# We can now use this for example to count the number of fruits of a given color

for key in fruits_dic.keys():
    print(key, len(fruits_dic[key]))

In [None]:
# We can also append values to a given list, for example

new_purple = ['eggplant', 'blackberry']
for new_fruit in new_purple:
    fruits_dic['purple'].append(new_fruit)
print(fruits_dic)

In [None]:
# One useful application of a dictionary is to for example to count

np.random.seed(0) # sets a random seed to ensure we always get the same random numbers
rand_ints = np.random.randint(1, 10, 1000) # Creates a list of 1000 random integers 1 to 9
print(rand_ints)

In [None]:
# Before we go into how to count we have to make a short digression and
# look at the try and except keywords. There are many uses for these
# keywords and I recommend reading about them; however, for now we will
# see how to use them in this context.

# If we have an empty dictionary and try to reference a key
# we should get an error as the key does not exist

dic = {}
dic['apple']

In [None]:
# As expected we get an error now lets say for example we were attemting to do this in a loop

dic = {1 : 'one', 2 : 'two', 3 : 'three'} # This maps a number to its word

for i in range(4):
    print(dic[i]) # We get an error straight away as key 0 does not exists

In [None]:
# But we want our program to keep running regardless:
# we want to know the words for the values for which we do have a key.
# To achieve this we have to handle our error.

for i in range(4):
    try: # we try the following line; if it works it just executes
        print(dic[i]) # if this fails then we go to except and we handle it
    except KeyError: # a missing key raises KeyError; any other kind of error is not hidden
        print(f'We got an error when attempted to referece {i}')
# As we can see the program executes without error and we
# get our message when it fails.

In [None]:
# If we remember our list of random integers, the idea is to count how many
# times each one occurs in the list. In principle we do not know how many distinct
# elements we have, so a dictionary is ideal.


# Lets first try it using arrays

count = np.zeros(10)
for num in rand_ints:
    count[num] += 1
print(count[1:]) # Since we do not have 0 we ignore the element 0

# This approach has some problems

# What would happen if we have a number equal to or larger than 10?
# If we do not have an idea of the range of numbers we are counting, is it a good idea to use np.zeros(100000)?
# What if we are not counting numbers but words instead?

In [None]:
# Let's now look at how to do it with a dictionary

count = {} # Create an empty dictionary
for num in rand_ints:
    try: # we see if the key exists
        count[num] += 1 # if the key exists we add one to the value
    except KeyError: # raised only when the key does not exist yet
        count[num] = 1 # when the key does not exist we set the value to one
print(count)

### Exercise 4: The hbonds.txt file in the data folder contains information about a Molecular Dynamics simulation. We are interested in the second field that has the 3 letter representation of an aminoacid. Count how many times each amino acid occurs in the file.

In [None]:
# Dictionaries are also extremely useful for optimizing algorithms,
# as they can be used to save a history of previous calculations
# (a memory of the program). If you have to compute a new value that at some point
# in the computation reduces to something you already know the answer
# for, then you go straight to it without further computation.

# To look at this in more detail we first need to introduce recursion.
# Again, this is one of those topics that are very wide, but the simple
# idea is to have a function that, to compute a given value, needs to call itself again.

# For example lets look at a factorial


# 5! = 5 * 4 * 3 * 2 * 1 = 120

# A naive implementation would be

def factorial_naive(n):
    """Return n! by multiplying the integers 2, 3, ..., n in a loop.

    For n below 2 there is nothing to multiply and the result is 1.
    """
    result = 1
    for factor in range(2, n + 1):
        result = result * factor
    return result

print(factorial_naive(5))

In [None]:
# We are going to introduce now the time module that help us time our function

from time import time

t0 = time() # We save the current time to t0
result = factorial_naive(2500) # we call our function
took = time() - t0 # We save the elapsed time (current time - t0) to took
print(f'It took {took:6.4f} seconds to run')

In [None]:
# Below is the recursive approach. We first define our base case,
# which is reached when n = 1. When this happens
# we simply return 1, since 1! = 1.
# If we are not in the base case then we do recursion:
# for the factorial it means n * (n-1)!


# 5! = 5 * 4! = 5 * 4 * 3! basically to compute 5!
# we need to compute 4! and to compute 4! we need to compute 3!
# and so on until we reach 1


# Recursion is not simple to understand at first 

def factorial_recursive(n):
    """Return n! computed recursively as n * (n - 1)!.

    n must be a non-negative integer; 0! and 1! are both 1.
    Raises ValueError if n is negative, since the factorial is not
    defined there and the recursion would never reach its base case.

    Each level of recursion uses a stack frame, so a large n can exceed
    the interpreter's recursion limit (see sys.getrecursionlimit()).
    """
    if n < 0:
        raise ValueError('factorial is not defined for negative numbers')
    if n <= 1: # base case: covers both 0! = 1 and 1! = 1
        return 1
    return n * factorial_recursive(n - 1)
print(factorial_recursive(5))

In [None]:
t0 = time()
result = factorial_recursive(2500)
took = time() - t0
print(f'It took {took:6.4f} seconds to run') # We can see that our recursive attempt takes almost
# twice as long as our naive approach
# in many cases recursion is useful but in many others
# although elegant it is not efficient

In [None]:
def factorial_dyn(n):
    """Build a table of factorials from 1! up to n! and return it.

    The returned dictionary maps each k in 1..n to k!. Every entry is
    obtained from the previous one, so nothing is computed twice.
    For n below 2 only the starting entry {1: 1} is returned.
    """
    table = {1: 1} # starting entry: 1! = 1
    previous = 1 # factorial of the last key stored so far
    for k in range(2, n + 1):
        previous *= k # k! = (k - 1)! * k
        table[k] = previous
    return table
print(factorial_dyn(5)[5]) # Since we return the entire dictionary we index element 5 

In [None]:
t0 = time()
result = factorial_dyn(2500)
took = time() - t0
print(f'It took {took:6.4f} seconds to run') # The dynamic approach is similar to recursion but it is faster

### Exercise 5: Go over to the dynamic.py to learn more about dynamic programming