### Data
United Kingdom: Countries and Major Cities
https://www.citypopulation.de/en/uk/cities/

What you need:
`./data/uk_countries.csv`

### File open and close

In [1]:
# CSV stands for Comma Separated Values: a plain-text file format in which
# each line is one record and the fields are separated by commas.

f = open(file='./data/uk_countries.csv', mode='r', encoding='utf-8')

# 'try ... finally' guarantees that the file handler is closed even if
# something goes wrong while we are reading from it.
try:
    # Iterating over a file object gives one line at a time; each line
    # still ends with its 'new line character'.
    for line in f:
        print(line)
finally:
    # Always remember to close the file handler
    f.close()

name,capital,area_km2,population_1991,population_2001,population_2011,population_2021

England,London,130278,47055205,49138831,53012456,56490048

Northern Ireland,Belfast,13562,1577836,1685267,1810863,1903175

Scotland,Edinburgh,77925,4998567,5062011,5295403,5418400

Wales,Cardiff,20735,2835073,2903085,3063456,3107494



In [2]:
# The 'with' statement is an alternative to calling close() ourselves:
# the file is open only inside the indented block and is closed
# automatically as soon as the block ends.

with open('./data/uk_countries.csv', 'r', encoding='utf-8') as csv_file:
    for text_line in csv_file:
        print(text_line)

# This won't work: the name 'csv_file' still exists here, but the file behind
# it has already been closed, so reading from it raises
# "ValueError: I/O operation on closed file."
# for text_line in csv_file:
#     print(text_line)

name,capital,area_km2,population_1991,population_2001,population_2011,population_2021

England,London,130278,47055205,49138831,53012456,56490048

Northern Ireland,Belfast,13562,1577836,1685267,1810863,1903175

Scotland,Edinburgh,77925,4998567,5062011,5295403,5418400

Wales,Cardiff,20735,2835073,2903085,3063456,3107494



#### Why use 'with'?
- It makes sure the file is properly closed, even if an error occurs inside the block
- When writing to a file, data may not be written until you explicitly close the file

#### Why do we have extra blank lines?

Because each line read from the file already ends with a 'new line character' (`\n`), and print() adds another one of its own, every line of content is effectively followed by two 'new line characters'. To avoid the extra blank lines, use `print(line, end='')` or remove the trailing newline first with `print(line.strip())`.

### Working on a simple dataset (UK population)

#### Task: get the sum of UK population across the countries, by year

In [3]:
# It's always good practice to initialise the result like this, so that you
# have a concrete understanding of what you are going to produce.
# The layout mirrors the CSV columns: name, capital, area_km2, then the four
# population columns (positions 3 to 6). Only the population columns are
# summed below; the area (position 2) is left at 0.
result = ['UK', 'London', 0, 0, 0, 0, 0]

with open("./data/uk_countries.csv", mode="r", encoding="utf-8") as f:
    # [1:] skips the header line, which holds column names rather than numbers
    for line in f.readlines()[1:]:
        # A blank line (e.g. left at the end of the file after editing it by
        # hand) has no columns to add up, so skip it instead of crashing
        if not line.strip():
            continue
        row = line.strip().split(",")
# We could do this, but if we have say 100+ columns then that would be a hassle, so we use a 'for' loop
#         result[3] = result[3] + int(row[3])
#         result[4] = result[4] + int(row[4])
#         result[5] = result[5] + int(row[5])
#         result[6] = result[6] + int(row[6])
#         print(result)
        for i in [3, 4, 5, 6]:
            # Values read from a file are text, so convert to int before adding
            result[i] += int(row[i])

print(result)

# Turn every item back into text and join with commas to form one CSV line
result_str = ",".join([str(r) for r in result])

result_str

['UK', 'London', 0, 56466681, 58789194, 63182178, 66919117]


'UK,London,0,56466681,58789194,63182178,66919117'

In [4]:
# Work on a copy of the data file, because the next section still needs the
# original (without the sum row).
import shutil

source_path = "./data/uk_countries.csv"
sum_path = "./data/uk_countries_sum.csv"
shutil.copyfile(source_path, sum_path)

# Append the result to the copy.
# Note that the mode is now 'a' (append), while previously we used 'r' (read).
with open(sum_path, mode="a", encoding="utf-8") as sum_file:
    sum_file.write(result_str + "\n")

# NOTE: copyfile() replaces the copy every time this cell runs, so re-running
# the cell will not duplicate the sum row. If you append to a file WITHOUT
# re-creating it first, do it only once, otherwise you will have duplicate
# lines. To remove a line you added, use any text editor. Just remember to
# keep a 'new line character' at the end of the last line.

### Re-organise data

We need to re-organise the data into the JSON format, which will be used later

In [5]:
# Three parallel lists: position k in each list describes the same
# (country, year, population) observation.
output = {key: [] for key in ('country', 'year', 'population')}

with open("./data/uk_countries.csv", mode="r", encoding="utf-8") as csv_file:
    # The first line holds the column names, e.g. "population_1991"
    fields = csv_file.readline().strip().split(",")
    # Every remaining line is one country
    for record in csv_file:
        values = record.strip().split(",")
        # Columns 3 to 6 are the four population columns; each one becomes
        # its own entry in the three lists
        for col in range(3, 7):
            output['country'].append(values[0])
            # "population_1991" -> "1991"
            output['year'].append(fields[col].split("_")[1])
            # Kept as text, exactly as read from the file
            output['population'].append(values[col])
output

{'country': ['England',
  'England',
  'England',
  'England',
  'Northern Ireland',
  'Northern Ireland',
  'Northern Ireland',
  'Northern Ireland',
  'Scotland',
  'Scotland',
  'Scotland',
  'Scotland',
  'Wales',
  'Wales',
  'Wales',
  'Wales'],
 'year': ['1991',
  '2001',
  '2011',
  '2021',
  '1991',
  '2001',
  '2011',
  '2021',
  '1991',
  '2001',
  '2011',
  '2021',
  '1991',
  '2001',
  '2011',
  '2021'],
 'population': ['47055205',
  '49138831',
  '53012456',
  '56490048',
  '1577836',
  '1685267',
  '1810863',
  '1903175',
  '4998567',
  '5062011',
  '5295403',
  '5418400',
  '2835073',
  '2903085',
  '3063456',
  '3107494']}

### Save file

In [6]:
import json

# Write the re-organised data as JSON. The encoding is given explicitly, as
# for the other files opened above, so that the result does not depend on the
# platform's default encoding.
with open('./data/uk_population.json', 'w', encoding='utf-8') as f:
    json.dump(output, f)