In [21]:
# Sample data: a list of lists, one inner list of times per athlete
# (units are not stated in this notebook -- presumably seconds; confirm).
track_times = [
    [13.1, 13.59, 13.44],
    [13.93, 13.85, 13.47],
    [14.12, 14.41, 13.89],
    [14.42, 13.55, 13.43],
]
track_times

[[13.1, 13.59, 13.44],
 [13.93, 13.85, 13.47],
 [14.12, 14.41, 13.89],
 [14.42, 13.55, 13.43]]

In [22]:
# Serialize by hand: turn the list of lists into one CSV-formatted string.
# Each inner list becomes one line whose values are separated by commas.
csv_lines = []
for athlete_times in track_times:
    csv_lines.append(','.join(str(value) for value in athlete_times))

# Joining the lines with '\n' places a newline between rows but none after
# the last row, so the resulting text has no trailing newline.
track_times_csv = '\n'.join(csv_lines)
print(track_times_csv)

13.1,13.59,13.44
13.93,13.85,13.47
14.12,14.41,13.89
14.42,13.55,13.43


In [24]:
# Save the CSV text to disk; mode 'w' creates the file or overwrites an
# existing one, and the with-block closes it afterwards.
with open('track_times.csv', mode='w') as csv_file:
    csv_file.write(track_times_csv)

In [25]:
# Load the entire file back into a single string so it can be compared with
# (and parsed like) the text that was written above.
with open('track_times.csv', mode='r') as csv_file:
    track_times_csv_from_disk = csv_file.read()
print(track_times_csv_from_disk)

13.1,13.59,13.44
13.93,13.85,13.47
14.12,14.41,13.89
14.42,13.55,13.43


In [31]:
# Deserialize the CSV text back into a list of lists of floats.
track_times_from_disk = []
# splitlines() does not yield a trailing empty string when the text ends with
# a newline, unlike split('\n').
for row in track_times_csv_from_disk.splitlines():
    # Skip blank lines: float('') would otherwise raise ValueError
    if not row.strip():
        continue
    times = [float(value) for value in row.split(',')]
    track_times_from_disk.append(times)
track_times_from_disk

[[13.1, 13.59, 13.44],
 [13.93, 13.85, 13.47],
 [14.12, 14.41, 13.89],
 [14.42, 13.55, 13.43]]

In [32]:
# Round-trip check: if we did everything correctly, the list of lists parsed
# back from disk should contain exactly the same data as the original.
track_times_from_disk == track_times

True

### csv Module
That wasn't too bad, but in general you don't actually need to serialize and deserialize your data "by hand" like that. Instead, there is a module built in to Python called csv that can do the same thing using fewer lines of code! It also has helpful functionality for dealing with:

- CSV files with headers that indicate what each column represents
- More kinds of plain text delimited data, such as tab-separated values (TSV) files (where the delimiter is a tab `\t` rather than a comma)
- Properly handling text data inside a CSV, e.g. if your data contains the text "Hello, World!" you want to make sure that the `,` is treated as part of the contents of that cell, not treated as a delimiter separating the columns
- Reading and writing CSV files that are compatible with spreadsheet software such as Excel

You can find full documentation for this module at https://docs.python.org/3/library/csv.html.

To use the csv module, start by importing it:

In [39]:
import csv
# csv.reader
# Replicate the previous example of loading the track times CSV, this time
# using the csv module instead of splitting strings by hand.
# newline='' leaves line-ending handling to the csv module, as its
# documentation recommends for files passed to a reader.
with open('track_times.csv', newline='') as f:
    # QUOTE_NONNUMERIC makes the reader convert every unquoted field
    # (i.e. all values in this dataset) to float.
    reader = csv.reader(f, quoting=csv.QUOTE_NONNUMERIC)
    # Pull all rows out of the reader while the file is still open
    track_times_with_csv_module = list(reader)
track_times_with_csv_module

[[13.1, 13.59, 13.44],
 [13.93, 13.85, 13.47],
 [14.12, 14.41, 13.89],
 [14.42, 13.55, 13.43]]

In [108]:
# Try csv.reader on a dataset that has a header row.
# encoding='utf-8' prevents decoding errors; newline='' leaves line-ending
# handling to the csv module, as its documentation recommends.
with open('olympic_medals.csv', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    # Print only the header and the first 5 rows of data. zip() stops at the
    # shorter input, so a file with fewer than 6 lines simply prints what it
    # has instead of raising StopIteration from next().
    for _, row in zip(range(6), reader):
        print(row)


['Gender', 'Event', 'Location', 'Year', 'Medal', 'Name', 'Nationality', 'Result']
['M', '10000M Men', 'Rio', '2016', 'G', 'Mohamed FARAH', 'GBR', '25:05.17']
['M', '10000M Men', 'Rio', '2016', 'S', 'Paul Kipngetich TANUI', 'KEN', '27:05.64']
['M', '10000M Men', 'Rio', '2016', 'B', 'Tamirat TOLA', 'ETH', '27:06.26']
['M', '10000M Men', 'Beijing', '2008', 'G', 'Kenenisa BEKELE', 'ETH', '27:01.17']
['M', '10000M Men', 'Beijing', '2008', 'S', 'Sileshi SIHINE', 'ETH', '27:02.77']


Then we could use list indexing to access the gender field of a given row using [0] or the nationality field using [-2]. But you can see how that would create some hard-to-read and error-prone code!

Fortunately we aren't limited to just using the list data structure — we can use a dict instead, so that we could look up the gender field using ["Gender"] or the nationality field using ["Nationality"].

In order to read the data in as a list of dictionaries rather than a list of lists, we can use csv.DictReader:

In [56]:
# csv.DictReader yields one dict per data row, keyed by the header names.
# newline='' leaves line-ending handling to the csv module, as its
# documentation recommends.
with open('olympic_medals.csv', encoding='utf-8', newline='') as f:
    olympics_data = list(csv.DictReader(f))

# Print the first 5 rows of data. The list is fully loaded, so the file no
# longer needs to be open, and slicing (unlike indexing) does not raise
# IndexError when there are fewer than 5 rows.
for row in olympics_data[:5]:
    print(row)

{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Rio', 'Year': '2016', 'Medal': 'G', 'Name': 'Mohamed FARAH', 'Nationality': 'GBR', 'Result': '25:05.17'}
{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Rio', 'Year': '2016', 'Medal': 'S', 'Name': 'Paul Kipngetich TANUI', 'Nationality': 'KEN', 'Result': '27:05.64'}
{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Rio', 'Year': '2016', 'Medal': 'B', 'Name': 'Tamirat TOLA', 'Nationality': 'ETH', 'Result': '27:06.26'}
{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Beijing', 'Year': '2008', 'Medal': 'G', 'Name': 'Kenenisa BEKELE', 'Nationality': 'ETH', 'Result': '27:01.17'}
{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Beijing', 'Year': '2008', 'Medal': 'S', 'Name': 'Sileshi SIHINE', 'Nationality': 'ETH', 'Result': '27:02.77'}


In [57]:
# The number of rows in the dataset is just the length of the resulting list
# (one dict per data row, as loaded by csv.DictReader above).
len(olympics_data)

2394

### Now we can perform data analysis and cleaning tasks in a neater, clearer way.

##### 1. For example, if our task was to filter the data so that it only includes gold medals, that logic would look something like this:

In [73]:
# Keep only the rows whose 'Medal' column is 'G' (the gold medals).
gold_medals = []
for medal in olympics_data:
    if medal['Medal'] == 'G':
        gold_medals.append(medal)

# Adjacent string literals are concatenated, so the message can span two
# source lines. A backslash continuation inside the string would instead
# embed the next line's indentation in the printed text.
print(
    f'out of {len(olympics_data)} total medals, this dataset contains '
    f'information about {len(gold_medals)} gold medals'
)

out of 2394 total medals, this dataset contains      information about 799 gold medals


In [74]:
# Same gold-medal filter, written as a list comprehension.
gold_medals2 = [medal for medal in olympics_data if medal['Medal'] == 'G']

# Adjacent string literals are concatenated, so the message can span two
# source lines. A backslash continuation inside the string would instead
# embed the next line's indentation in the printed text.
print(
    f'out of {len(olympics_data)} total medals, this dataset contains '
    f'information about {len(gold_medals2)} gold medals'
)

out of 2394 total medals, this dataset contains      information about 799 gold medals


#### 2. Or, if the task was to find the events and athlete names for all USA gold medals in 2016, that logic would look something like this:

In [100]:
# List-comprehension version: for every USA gold medal from 2016, keep only
# the event and the athlete name. 'Year' is compared against the string
# '2016' because the rows loaded by csv.DictReader hold string values.
usa_2016_gold_medals = [
    {'Event': medal['Event'], 'Name': medal['Name']}
    for medal in olympics_data
    if medal['Medal'] == 'G'
    and medal['Year'] == '2016'
    and medal['Nationality'] == 'USA'
]
usa_2016_gold_medals

[{'Event': '1500M Men', 'Name': 'Matthew CENTROWITZ'},
 {'Event': '400M Hurdles Men', 'Name': 'Kerron CLEMENT'},
 {'Event': '4X400M Relay Men', 'Name': 'null'},
 {'Event': 'Decathlon Men', 'Name': 'Ashton EATON'},
 {'Event': 'Long Jump Men', 'Name': 'Jeff HENDERSON'},
 {'Event': 'Shot Put Men', 'Name': 'Ryan CROUSER'},
 {'Event': 'Triple Jump Men', 'Name': 'Christian TAYLOR'},
 {'Event': '100M Hurdles Women', 'Name': 'Brianna ROLLINS'},
 {'Event': '400M Hurdles Women', 'Name': 'Dalilah MUHAMMAD'},
 {'Event': '4X100M Relay Women', 'Name': 'null'},
 {'Event': '4X400M Relay Women', 'Name': 'null'},
 {'Event': 'Long Jump Women', 'Name': 'Tianna BARTOLETTA'},
 {'Event': 'Shot Put Women', 'Name': 'Michelle CARTER'}]

In [99]:
# Explicit for-loop version of the same query: collect the event and athlete
# name of every USA gold medal from 2016.
usa_2016_gold_medals2 = []
for medal in olympics_data:
    is_usa_2016_gold = (
        medal['Medal'] == 'G'
        and medal['Year'] == '2016'
        and medal['Nationality'] == 'USA'
    )
    if is_usa_2016_gold:
        usa_2016_gold_medals2.append(
            {'Event': medal['Event'], 'Name': medal['Name']}
        )

usa_2016_gold_medals2

[{'Event': '1500M Men', 'Name': 'Matthew CENTROWITZ'},
 {'Event': '400M Hurdles Men', 'Name': 'Kerron CLEMENT'},
 {'Event': '4X400M Relay Men', 'Name': 'null'},
 {'Event': 'Decathlon Men', 'Name': 'Ashton EATON'},
 {'Event': 'Long Jump Men', 'Name': 'Jeff HENDERSON'},
 {'Event': 'Shot Put Men', 'Name': 'Ryan CROUSER'},
 {'Event': 'Triple Jump Men', 'Name': 'Christian TAYLOR'},
 {'Event': '100M Hurdles Women', 'Name': 'Brianna ROLLINS'},
 {'Event': '400M Hurdles Women', 'Name': 'Dalilah MUHAMMAD'},
 {'Event': '4X100M Relay Women', 'Name': 'null'},
 {'Event': '4X400M Relay Women', 'Name': 'null'},
 {'Event': 'Long Jump Women', 'Name': 'Tianna BARTOLETTA'},
 {'Event': 'Shot Put Women', 'Name': 'Michelle CARTER'}]

In [101]:
#And we could write that result to a file using csv.DictWriter

In [106]:
# Write the filtered rows to a new CSV file with a header line.
# newline='' stops the file object from translating line endings, which is
# the documented way to avoid unnecessary blank lines between rows.
# dialect='unix' ends each row with '\n' and quotes every field.
# encoding='utf-8' matches the encoding used to read the source data.
with open('usa_2016_gold_medals.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['Event', 'Name'], dialect='unix')
    writer.writeheader()
    # writerows() writes every dict in the list, one CSV row each
    writer.writerows(usa_2016_gold_medals)

In [111]:
# We can use the shell command cat to visually inspect the file that was created
# (the leading ! makes IPython run the rest of the line as a shell command):
!cat usa_2016_gold_medals.csv

"Event","Name"
"1500M Men","Matthew CENTROWITZ"
"400M Hurdles Men","Kerron CLEMENT"
"4X400M Relay Men","null"
"Decathlon Men","Ashton EATON"
"Long Jump Men","Jeff HENDERSON"
"Shot Put Men","Ryan CROUSER"
"Triple Jump Men","Christian TAYLOR"
"100M Hurdles Women","Brianna ROLLINS"
"400M Hurdles Women","Dalilah MUHAMMAD"
"4X100M Relay Women","null"
"4X400M Relay Women","null"
"Long Jump Women","Tianna BARTOLETTA"
"Shot Put Women","Michelle CARTER"


### What does each column represent?

To extract the basics of this information, you can just call `.keys()` on one of the row dictionaries. In this case, the columns are:

In [115]:
# The keys of any row dict are the column names taken from the CSV header
olympics_data[0].keys()

dict_keys(['Gender', 'Event', 'Location', 'Year', 'Medal', 'Name', 'Nationality', 'Result'])

### If you need to understand further details such as the capitalization style of the names or the units of the results, you can often find this information in documentation about the dataset. If not, you may need to apply your own judgment.

For example, let's look at a record from this dataset:

In [116]:
# Inspect the first record; every value is a string, including 'Year' and 'Result'
olympics_data[0]

{'Gender': 'M',
 'Event': '10000M Men',
 'Location': 'Rio',
 'Year': '2016',
 'Medal': 'G',
 'Name': 'Mohamed FARAH',
 'Nationality': 'GBR',
 'Result': '25:05.17'}

In [117]:
# Inspect a record from a different event to compare how 'Result' is formatted
# (index 85 appears to be an arbitrary sample)
olympics_data[85]

{'Gender': 'M',
 'Event': '100M Men',
 'Location': 'Montreal',
 'Year': '1976',
 'Medal': 'S',
 'Name': 'Donald QUARRIE',
 'Nationality': 'JAM',
 'Result': '10.08'}

In [118]:
# Inspect a record from a throwing event to compare how 'Result' is formatted
# (index 1100 appears to be an arbitrary sample)
olympics_data[1100]

{'Gender': 'M',
 'Event': 'Discus Throw Men',
 'Location': 'Los Angeles',
 'Year': '1984',
 'Medal': 'B',
 'Name': 'John POWELL',
 'Nationality': 'USA',
 'Result': '65.46'}