### Using base Python (csv.DictReader)

In [41]:
import csv

# Open with newline='' as the csv module requires, so that newlines embedded
# inside quoted fields are parsed correctly.
with open('olympic_medals.csv', newline='') as f:
    # Each record becomes a dict keyed by the header line; every value stays a string.
    olympics_data = list(csv.DictReader(f))

# Print the first 5 rows of data (slicing is safe even if the file has fewer than 5 rows)
for row in olympics_data[:5]:
    print(row)


{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Rio', 'Year': '2016', 'Medal': 'G', 'Name': 'Mohamed FARAH', 'Nationality': 'GBR', 'Result': '25:05.17'}
{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Rio', 'Year': '2016', 'Medal': 'S', 'Name': 'Paul Kipngetich TANUI', 'Nationality': 'KEN', 'Result': '27:05.64'}
{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Rio', 'Year': '2016', 'Medal': 'B', 'Name': 'Tamirat TOLA', 'Nationality': 'ETH', 'Result': '27:06.26'}
{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Beijing', 'Year': '2008', 'Medal': 'G', 'Name': 'Kenenisa BEKELE', 'Nationality': 'ETH', 'Result': '27:01.17'}
{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Beijing', 'Year': '2008', 'Medal': 'S', 'Name': 'Sileshi SIHINE', 'Nationality': 'ETH', 'Result': '27:02.77'}


In [42]:
# Data from row 3 (zero-based index 2)
olympics_data[2]

{'Gender': 'M',
 'Event': '10000M Men',
 'Location': 'Rio',
 'Year': '2016',
 'Medal': 'B',
 'Name': 'Tamirat TOLA',
 'Nationality': 'ETH',
 'Result': '27:06.26'}

In [43]:
# Access all data from the 3rd column (i.e. the values stored under the 'Location' key),
# limited to the first 100 rows to keep the output short
for medal_row in olympics_data[:100]:
    print(medal_row['Location'])

Rio
Rio
Rio
Beijing
Beijing
Beijing
Sydney
Sydney
Sydney
Barcelona
Barcelona
Barcelona
Los Angeles
Los Angeles
Los Angeles
Montreal
Montreal
Montreal
Mexico
Mexico
Mexico
Rome
Rome
Rome
Helsinki
Helsinki
Helsinki
Berlin
Berlin
Berlin
Amsterdam
Amsterdam
Amsterdam
Antwerp
Antwerp
Antwerp
London
London
London
Athens
Athens
Athens
Atlanta
Atlanta
Atlanta
Moscow
Moscow
Moscow
Munich
Munich
Munich
Tokyo
Tokyo
Tokyo
Melbourne / Stockholm
Melbourne / Stockholm
Melbourne / Stockholm
London
London
London
Los Angeles
Los Angeles
Los Angeles
Paris
Paris
Paris
Stockholm
Stockholm
Stockholm
Rio
Rio
Rio
Beijing
Beijing
Beijing
Sydney
Sydney
Sydney
Barcelona
Barcelona
Barcelona
Los Angeles
Los Angeles
Los Angeles
Montreal
Montreal
Montreal
Mexico
Mexico
Mexico
Rome
Rome
Rome
Helsinki
Helsinki
Helsinki
Berlin
Berlin
Berlin
Amsterdam


### With Pandas
With pandas, accessing columns is just as simple as accessing rows. For example, we can convert `olympics_data` (a list of dictionaries) into a DataFrame and then view the first five rows:

In [76]:
# Import the pandas library as pd, the standard alias, then use the DataFrame constructor
# to make a DataFrame out of our existing list of dictionaries.
# NOTE(review): the output below has an extra column labelled NaN; presumably it comes from
# rows with more fields than the header, which csv.DictReader stores under the key None - confirm.
import pandas as pd
df = pd.DataFrame(olympics_data)
df.head()

Unnamed: 0,Gender,Event,Location,Year,Medal,Name,Nationality,Result,NaN
0,M,10000M Men,Rio,2016,G,Mohamed FARAH,GBR,25:05.17,
1,M,10000M Men,Rio,2016,S,Paul Kipngetich TANUI,KEN,27:05.64,
2,M,10000M Men,Rio,2016,B,Tamirat TOLA,ETH,27:06.26,
3,M,10000M Men,Beijing,2008,G,Kenenisa BEKELE,ETH,27:01.17,
4,M,10000M Men,Beijing,2008,S,Sileshi SIHINE,ETH,27:02.77,


In [87]:
# Boolean-mask filter: keep only the rows whose 'Name' is exactly 'David PAYNE'
df[df['Name'] == 'David PAYNE']

Unnamed: 0,Gender,Event,Location,Year,Medal,Name,Nationality,Result,NaN
155,M,110M Hurdles Men,Beijing,2008,S,David PAYNE,USA,13.17,[+0.1]


In [80]:
# Distinct athlete names, returned as a NumPy array
df['Name'].unique()

array(['Mohamed FARAH', 'Paul Kipngetich TANUI', 'Tamirat TOLA', ...,
       'Hrysopiyi DEVETZI', 'Inna LASOVSKAYA', 'Sarka KASPARKOVA'],
      dtype=object)

In [45]:
#Now we can extract all of the information from the 3rd column with a simpler syntax:
df['Location']

0           Rio
1           Rio
2           Rio
3       Beijing
4       Beijing
         ...   
2389     Athens
2390     Athens
2391    Atlanta
2392    Atlanta
2393    Atlanta
Name: Location, Length: 2394, dtype: object

### It is also easy to extract information by row, just as with the list of dictionaries


In [69]:
# Row at integer position 2 (the third row), returned as a Series indexed by column name
df.iloc[2]

Gender                    M
Event            10000M Men
Location                Rio
Year                   2016
Medal                     B
Name           Tamirat TOLA
Nationality             ETH
Result             27:06.26
Name: 2, dtype: object

In [47]:
# Small toy frame for the iloc examples: three rows, two unlabelled columns.
# Note that this rebinds df, which until now held the Olympic medals data.
data = [
    [50, True],
    [40, False],
    [30, False],
]
df = pd.DataFrame(data=data)

In [48]:
# Display the toy DataFrame; rows and columns carry default integer labels
df

Unnamed: 0,0,1
0,50,True
1,40,False
2,30,False


In [49]:
# The iloc property gets, or sets, the value(s) at the specified integer positions.
df.iloc[1,0] # A single position for both row and column, [1, 0], returns the content of that cell.


40

In [50]:
# A single position selects one row (here the third), returned as a Series
df.iloc[2]

0       30
1    False
Name: 2, dtype: object

In [51]:
df[1] # Plain brackets select by column label: this returns the column labelled 1, not a row

0     True
1    False
2    False
Name: 1, dtype: bool

In [52]:
# To access more than one row, pass a list of row positions (hence the double brackets), separated by commas:
df.iloc[[0,2]]

Unnamed: 0,0,1
0,50,True
2,30,False


In [53]:
# Select columns as well by passing their positions in a second list:
df.iloc[[0, 2], [0, 1]]

Unnamed: 0,0,1
0,50,True
2,30,False


In [54]:
# You can also take a slice of the DataFrame with start and stop positions separated by a colon;
# the stop position is excluded, so 0:2 returns rows 0 and 1:
df.iloc[0:2]

Unnamed: 0,0,1
0,50,True
1,40,False


In [55]:
# Rows 0 and 2 of column 1 only; passing the column position as a list keeps the result a DataFrame
df.iloc[[0, 2], [1]]

Unnamed: 0,1
0,True
2,False


### We can also skip the csv module and the olympics_data variable altogether, and just read the data from the CSV file directly

In [3]:
# Some lines in the file have 9 fields instead of the 8 named in the header, so
# read_csv has to be told what to do with them:
#   on_bad_lines='skip'    drops the offending lines silently
#   on_bad_lines='warn'    drops them too, but reports each skipped line (used here)
#   error_bad_lines=False  is the equivalent for pandas < 1.3.0
# Check the installed version with pd.__version__ (presumably this notebook was on
# '1.1.3' before upgrading) and update if needed with `conda update pandas`,
# then close the editor and open it again.
# To inspect some of the bad records in the csv-module version: olympics_data[155:160]
df = pd.read_csv('olympic_medals.csv',on_bad_lines='warn')
df


b'Skipping line 156: expected 8 fields, saw 9\nSkipping line 157: expected 8 fields, saw 9\nSkipping line 158: expected 8 fields, saw 9\nSkipping line 317: expected 8 fields, saw 9\nSkipping line 318: expected 8 fields, saw 9\nSkipping line 319: expected 8 fields, saw 9\nSkipping line 1658: expected 8 fields, saw 9\nSkipping line 1659: expected 8 fields, saw 9\nSkipping line 1660: expected 8 fields, saw 9\nSkipping line 1784: expected 8 fields, saw 9\nSkipping line 1785: expected 8 fields, saw 9\nSkipping line 1786: expected 8 fields, saw 9\n'


Unnamed: 0,Gender,Event,Location,Year,Medal,Name,Nationality,Result
0,M,10000M Men,Rio,2016,G,Mohamed FARAH,GBR,25:05.17
1,M,10000M Men,Rio,2016,S,Paul Kipngetich TANUI,KEN,27:05.64
2,M,10000M Men,Rio,2016,B,Tamirat TOLA,ETH,27:06.26
3,M,10000M Men,Beijing,2008,G,Kenenisa BEKELE,ETH,27:01.17
4,M,10000M Men,Beijing,2008,S,Sileshi SIHINE,ETH,27:02.77
...,...,...,...,...,...,...,...,...
2377,W,Triple Jump Women,Athens,2004,S,Hrysopiyi DEVETZI,GRE,15.25
2378,W,Triple Jump Women,Athens,2004,B,Tatyana LEBEDEVA,RUS,15.14
2379,W,Triple Jump Women,Atlanta,1996,G,Inessa KRAVETS,UKR,15.33
2380,W,Triple Jump Women,Atlanta,1996,S,Inna LASOVSKAYA,RUS,14.98


In [66]:
# Check the number of rows in df.
# NOTE(review): the recorded output below (2394) does not match the expected 2382; presumably
# this cell last ran while df still held the csv.DictReader-based frame - re-run the cells in order to confirm.
len(df) # expected 2382 after read_csv skips the 12 malformed lines

2394

In [2]:
# Confirm the installed pandas version (the on_bad_lines argument needs pandas 1.3.0 or newer)
import pandas as pd
pd.__version__

'1.4.4'