# Series
## A Series is a one-dimensional labelled array that holds any type of data (integers, strings, floats, etc)

In [1]:
import pandas as pd

# Pandas Series
nums = pd.Series([10, 20, 30, 40])
nums

0    10
1    20
2    30
3    40
dtype: int64

In [2]:
temps = pd.Series(
    [18, 23, 27, 20], 
    index = ['Monday', 'Tuesday', 'Wednesday', 'Thursday'])

temps

Monday       18
Tuesday      23
Wednesday    27
Thursday     20
dtype: int64

# DataFrame
## A DataFrame is a two-dimensional table (like an Excel sheet) that consists of rows and columns. It's essentially a collection of multiple Series.

In [3]:
# Pandas DataFrame
data = {
    'Name': ['Francis', 'Githaiga', 'Olin'],
    'Age': [25, 23, 21],
    'County': ['Nairobi', 'Kiambu', 'Nyeri']
}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,County
0,Francis,25,Nairobi
1,Githaiga,23,Kiambu
2,Olin,21,Nyeri


In [5]:
#set your own IDs
# Pandas DataFrame
data = {
    'Name': ['Francis', 'Githaiga', 'Olin'],
    'Age': [25, 23, 21],
    'County': ['Nairobi', 'Kiambu', "Murang'a"]
}

ids = [101, 102, 103]
df = pd.DataFrame(data , index = ids)
df

Unnamed: 0,Name,Age,County
101,Francis,25,Nairobi
102,Githaiga,23,Kiambu
103,Olin,21,Murang'a


# CSV DictReader
### Used to convert all details to dictionary

In [10]:
import csv
with open("./results.csv") as f:
    reader = csv.DictReader(f)
    olympics_data = list(reader)

In [11]:
    
#Print the first five rows
for index in range(5):
    #range(5) == range(0, 5) == 0, 1, 2, 3, 4
    print(olympics_data[index])

{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Rio', 'Year': '2016', 'Medal': 'G', 'Name': 'Mohamed FARAH', 'Nationality': 'USA', 'Result': '25:05.17'}
{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Rio', 'Year': '2016', 'Medal': 'S', 'Name': 'Paul Kipngetich TANUI', 'Nationality': 'KEN', 'Result': '27:05.64'}
{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Rio', 'Year': '2016', 'Medal': 'B', 'Name': 'Tamirat TOLA', 'Nationality': 'ETH', 'Result': '27:06.26'}
{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Beijing', 'Year': '2008', 'Medal': 'G', 'Name': 'Kenenisa BEKELE', 'Nationality': 'ETH', 'Result': '27:01.17'}
{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Beijing', 'Year': '2008', 'Medal': 'S', 'Name': 'Sileshi SIHINE', 'Nationality': 'ETH', 'Result': '27:02.77'}


In [13]:
olympics_data

[{'Gender': 'M',
  'Event': '10000M Men',
  'Location': 'Rio',
  'Year': '2016',
  'Medal': 'G',
  'Name': 'Mohamed FARAH',
  'Nationality': 'USA',
  'Result': '25:05.17'},
 {'Gender': 'M',
  'Event': '10000M Men',
  'Location': 'Rio',
  'Year': '2016',
  'Medal': 'S',
  'Name': 'Paul Kipngetich TANUI',
  'Nationality': 'KEN',
  'Result': '27:05.64'},
 {'Gender': 'M',
  'Event': '10000M Men',
  'Location': 'Rio',
  'Year': '2016',
  'Medal': 'B',
  'Name': 'Tamirat TOLA',
  'Nationality': 'ETH',
  'Result': '27:06.26'},
 {'Gender': 'M',
  'Event': '10000M Men',
  'Location': 'Beijing',
  'Year': '2008',
  'Medal': 'G',
  'Name': 'Kenenisa BEKELE',
  'Nationality': 'ETH',
  'Result': '27:01.17'},
 {'Gender': 'M',
  'Event': '10000M Men',
  'Location': 'Beijing',
  'Year': '2008',
  'Medal': 'S',
  'Name': 'Sileshi SIHINE',
  'Nationality': 'ETH',
  'Result': '27:02.77'},
 {'Gender': 'M',
  'Event': '10000M Men',
  'Location': 'Beijing',
  'Year': '2008',
  'Medal': 'B',
  'Name': 'Micah

In [17]:
olympics_data[2]

{'Gender': 'M',
 'Event': '10000M Men',
 'Location': 'Rio',
 'Year': '2016',
 'Medal': 'B',
 'Name': 'Tamirat TOLA',
 'Nationality': 'ETH',
 'Result': '27:06.26'}

In [20]:
print([row['Location'] for row in olympics_data][:10])

['Rio', 'Rio', 'Rio', 'Beijing', 'Beijing', 'Beijing', 'Sydney', 'Sydney', 'Sydney', 'Barcelona']


In [21]:
import pandas as pd
df = pd.DataFrame(olympics_data)
df

Unnamed: 0,Gender,Event,Location,Year,Medal,Name,Nationality,Result,NaN
0,M,10000M Men,Rio,2016,G,Mohamed FARAH,USA,25:05.17,
1,M,10000M Men,Rio,2016,S,Paul Kipngetich TANUI,KEN,27:05.64,
2,M,10000M Men,Rio,2016,B,Tamirat TOLA,ETH,27:06.26,
3,M,10000M Men,Beijing,2008,G,Kenenisa BEKELE,ETH,27:01.17,
4,M,10000M Men,Beijing,2008,S,Sileshi SIHINE,ETH,27:02.77,
...,...,...,...,...,...,...,...,...,...
2389,W,Triple Jump Women,Athens,2004,S,Hrysopiyi DEVETZI,GRE,15.25,
2390,W,Triple Jump Women,Athens,2004,B,Tatyana LEBEDEVA,RUS,15.14,
2391,W,Triple Jump Women,Atlanta,1996,G,Inessa KRAVETS,UKR,15.33,
2392,W,Triple Jump Women,Atlanta,1996,S,Inna LASOVSKAYA,RUS,14.98,


In [22]:
df.head()

Unnamed: 0,Gender,Event,Location,Year,Medal,Name,Nationality,Result,NaN
0,M,10000M Men,Rio,2016,G,Mohamed FARAH,USA,25:05.17,
1,M,10000M Men,Rio,2016,S,Paul Kipngetich TANUI,KEN,27:05.64,
2,M,10000M Men,Rio,2016,B,Tamirat TOLA,ETH,27:06.26,
3,M,10000M Men,Beijing,2008,G,Kenenisa BEKELE,ETH,27:01.17,
4,M,10000M Men,Beijing,2008,S,Sileshi SIHINE,ETH,27:02.77,


In [23]:
df.tail()

Unnamed: 0,Gender,Event,Location,Year,Medal,Name,Nationality,Result,NaN
2389,W,Triple Jump Women,Athens,2004,S,Hrysopiyi DEVETZI,GRE,15.25,
2390,W,Triple Jump Women,Athens,2004,B,Tatyana LEBEDEVA,RUS,15.14,
2391,W,Triple Jump Women,Atlanta,1996,G,Inessa KRAVETS,UKR,15.33,
2392,W,Triple Jump Women,Atlanta,1996,S,Inna LASOVSKAYA,RUS,14.98,
2393,W,Triple Jump Women,Atlanta,1996,B,Sarka KASPARKOVA,CZE,14.98,


In [24]:
df2=pd.read_csv('./results.csv')

ParserError: Error tokenizing data. C error: Expected 8 fields in line 156, saw 9


In [33]:
df2 = pd.read_csv('./results.csv', error_bad_lines=False)
df2.head()

b'Skipping line 156: expected 8 fields, saw 9\nSkipping line 157: expected 8 fields, saw 9\nSkipping line 158: expected 8 fields, saw 9\nSkipping line 317: expected 8 fields, saw 9\nSkipping line 318: expected 8 fields, saw 9\nSkipping line 319: expected 8 fields, saw 9\nSkipping line 1658: expected 8 fields, saw 9\nSkipping line 1659: expected 8 fields, saw 9\nSkipping line 1660: expected 8 fields, saw 9\nSkipping line 1784: expected 8 fields, saw 9\nSkipping line 1785: expected 8 fields, saw 9\nSkipping line 1786: expected 8 fields, saw 9\n'


Unnamed: 0,Gender,Event,Location,Year,Medal,Name,Nationality,Result
0,M,10000M Men,Rio,2016,G,Mohamed FARAH,USA,25:05.17
1,M,10000M Men,Rio,2016,S,Paul Kipngetich TANUI,KEN,27:05.64
2,M,10000M Men,Rio,2016,B,Tamirat TOLA,ETH,27:06.26
3,M,10000M Men,Beijing,2008,G,Kenenisa BEKELE,ETH,27:01.17
4,M,10000M Men,Beijing,2008,S,Sileshi SIHINE,ETH,27:02.77


In [38]:
df['Location']

0           Rio
1           Rio
2           Rio
3       Beijing
4       Beijing
         ...   
2389     Athens
2390     Athens
2391    Atlanta
2392    Atlanta
2393    Atlanta
Name: Location, Length: 2394, dtype: object

In [35]:
df.index

RangeIndex(start=0, stop=2394, step=1)

In [36]:
df.shape

(2394, 9)

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2394 entries, 0 to 2393
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Gender       2394 non-null   object
 1   Event        2394 non-null   object
 2   Location     2394 non-null   object
 3   Year         2394 non-null   object
 4   Medal        2394 non-null   object
 5   Name         2394 non-null   object
 6   Nationality  2394 non-null   object
 7   Result       2394 non-null   object
 8   None         12 non-null     object
dtypes: object(9)
memory usage: 168.5+ KB


In [39]:
len(df)

2394

In [40]:
df.columns

Index([     'Gender',       'Event',    'Location',        'Year',
             'Medal',        'Name', 'Nationality',      'Result',
                None],
      dtype='object')

In [41]:
df.dtypes

Gender         object
Event          object
Location       object
Year           object
Medal          object
Name           object
Nationality    object
Result         object
NaN            object
dtype: object

# Selecting DataFrame Information
## Two main methods are used to do the above
### .iloc  and  .loc

....................................................................................................................................................................................................................................................

## .iloc
### Index-Based Selection: Allows selection of data in a dataframe based on integer positions (row & column numbers)

In [43]:
df.iloc[3]

Gender                       M
Event               10000M Men
Location               Beijing
Year                      2008
Medal                        G
Name           Kenenisa BEKELE
Nationality                ETH
Result                27:01.17
NaN                        NaN
Name: 3, dtype: object

In [46]:
#To select several rows
df.iloc[5:10]

Unnamed: 0,Gender,Event,Location,Year,Medal,Name,Nationality,Result,NaN
5,M,10000M Men,Beijing,2008,B,Micah KOGO,KEN,27:04.11,
6,M,10000M Men,Sydney,2000,G,Haile GEBRSELASSIE,ETH,27:18.20,
7,M,10000M Men,Sydney,2000,S,Paul TERGAT,KEN,27:18.29,
8,M,10000M Men,Sydney,2000,B,Assefa MEZGEBU,ETH,27:19.75,
9,M,10000M Men,Barcelona,1992,G,Khalid SKAH,MAR,27:46.70,


In [47]:
df[5:10]

Unnamed: 0,Gender,Event,Location,Year,Medal,Name,Nationality,Result,NaN
5,M,10000M Men,Beijing,2008,B,Micah KOGO,KEN,27:04.11,
6,M,10000M Men,Sydney,2000,G,Haile GEBRSELASSIE,ETH,27:18.20,
7,M,10000M Men,Sydney,2000,S,Paul TERGAT,KEN,27:18.29,
8,M,10000M Men,Sydney,2000,B,Assefa MEZGEBU,ETH,27:19.75,
9,M,10000M Men,Barcelona,1992,G,Khalid SKAH,MAR,27:46.70,


In [49]:
df.iloc[:, 3:4]

Unnamed: 0,Year
0,2016
1,2016
2,2016
3,2008
4,2008
...,...
2389,2004
2390,2004
2391,1996
2392,1996


In [50]:
df.loc[:, 'Medal']
df.loc[3:7, 'Nationality']

3    ETH
4    ETH
5    KEN
6    ETH
7    KEN
Name: Nationality, dtype: object