In [1]:
import pandas as pd
df = pd.read_csv('/Data/gapminder.tsv', sep='\t') # read_table uses tabs as its default separator

In [2]:
# Retrieve the country column to its own variable (country_df).
country_df = df['country']

In [3]:
# Show the first 5 observations.
print(country_df.head())

0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: country, dtype: object


In [4]:
# Show the last 5 observations.
print(country_df.tail())

1699    Zimbabwe
1700    Zimbabwe
1701    Zimbabwe
1702    Zimbabwe
1703    Zimbabwe
Name: country, dtype: object


In [5]:
# Specify multiple column names using a list.
subset = df[['country', 'continent', 'year']]

In [6]:
# Show the first 5 observations.
print(subset.head())

       country continent  year
0  Afghanistan      Asia  1952
1  Afghanistan      Asia  1957
2  Afghanistan      Asia  1962
3  Afghanistan      Asia  1967
4  Afghanistan      Asia  1972


In [7]:
# Show the last 5 observations.
print(subset.tail())

       country continent  year
1699  Zimbabwe    Africa  1987
1700  Zimbabwe    Africa  1992
1701  Zimbabwe    Africa  1997
1702  Zimbabwe    Africa  2002
1703  Zimbabwe    Africa  2007


In [8]:
print(df.head()) # Observe the left-side of the DataFrame object.
# Subset based on index label (row/col name), 'loc'.
# Subset based on row index (row/col number), 'iloc'.
# Subset based on index label or row index, 'ix'. No longer works.

       country continent  year  lifeExp       pop   gdpPercap
0  Afghanistan      Asia  1952   28.801   8425333  779.445314
1  Afghanistan      Asia  1957   30.332   9240934  820.853030
2  Afghanistan      Asia  1962   31.997  10267083  853.100710
3  Afghanistan      Asia  1967   34.020  11537966  836.197138
4  Afghanistan      Asia  1972   36.088  13079460  739.981106


In [9]:
# Return the first row using 'loc'.
print(df.loc[0])

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap        779.445
Name: 0, dtype: object


In [10]:
# Return the 100th row using 'loc'.
print(df.loc[99])

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap       721.186
Name: 99, dtype: object


In [11]:
# Return the last row using 'loc'.
print(df.loc[-1]) # Returns a KeyError.

KeyError: 'the label [-1] is not in the [index]'

In [12]:
# Return the number of rows.
number_of_rows = df.shape[0] # 1704
print(number_of_rows)

1704


In [13]:
# Calculate and return the last row index.
last_row_index = number_of_rows - 1
print(last_row_index)

1703


In [14]:
# Return the last row using the index calculated.
print(df.loc[last_row_index])

country      Zimbabwe
continent      Africa
year             2007
lifeExp        43.487
pop          12311143
gdpPercap     469.709
Name: 1703, dtype: object


In [15]:
# Return the last row using the tail(n=number) method. Pass 1 to the argument 'n'.
print(df.tail(n=1))

       country continent  year  lifeExp       pop   gdpPercap
1703  Zimbabwe    Africa  2007   43.487  12311143  469.709298


In [16]:
subset_loc = df.loc[0]
subset_head = df.head(n=1)

In [17]:
# Observe the type of loc of 1 row.
print(type(subset_loc))

<class 'pandas.core.series.Series'>


In [18]:
# Observe the type using the head() of 1 row.
print(type(subset_head))

<class 'pandas.core.frame.DataFrame'>


In [19]:
# Subset multiple rows of the DataFrame object: 1, 100, 1000. Remember the brackets [].
print(df.loc[[0, 99, 999]])

         country continent  year  lifeExp       pop    gdpPercap
0    Afghanistan      Asia  1952   28.801   8425333   779.445314
99    Bangladesh      Asia  1967   43.453  62821884   721.186086
999     Mongolia      Asia  1967   51.253   1149500  1226.041130


In [20]:
# iloc subsets the DataFrame using row index number.
# Retrieve the second row. 
print(df.iloc[1]) # In this case the index is an integer but does not necessarily have to be.

country      Afghanistan
continent           Asia
year                1957
lifeExp           30.332
pop              9240934
gdpPercap        820.853
Name: 1, dtype: object


In [21]:
# Return the 100th row. Zero-indexed behavior.
print(df.iloc[99])

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap       721.186
Name: 99, dtype: object


In [22]:
# Return the last row of the DataFrame using iloc[].
print(df.iloc[-1]) # Can use the -1 with iloc.

country      Zimbabwe
continent      Africa
year             2007
lifeExp        43.487
pop          12311143
gdpPercap     469.709
Name: 1703, dtype: object


In [23]:
# Pass a list of integers to return the rows: 0, 99, 999.
print(df.iloc[[0, 99, 999]])

         country continent  year  lifeExp       pop    gdpPercap
0    Afghanistan      Asia  1952   28.801   8425333   779.445314
99    Bangladesh      Asia  1967   43.453  62821884   721.186086
999     Mongolia      Asia  1967   51.253   1149500  1226.041130


In [24]:
# ix method is no longer used.
print(df.ix[0])
print(df.ix[99])
print(df.ix[[0, 99, 999]]) # ix method is 'deprecated'

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap        779.445
Name: 0, dtype: object
country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap       721.186
Name: 99, dtype: object
         country continent  year  lifeExp       pop    gdpPercap
0    Afghanistan      Asia  1952   28.801   8425333   779.445314
99    Bangladesh      Asia  1967   43.453  62821884   721.186086
999     Mongolia      Asia  1967   51.253   1149500  1226.041130


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


In [25]:
# loc attribute syntax: loc[[rows], [columns]]
# iloc attribute syntax: iloc[[rows], [columns]]
# Subsetting columns: df.loc[:, [columns]]
subset = df.loc[:, ['year', 'pop']]
print(subset.head())

   year       pop
0  1952   8425333
1  1957   9240934
2  1962  10267083
3  1967  11537966
4  1972  13079460


In [26]:
# Subsettting with the iloc attribute.
subset = df.iloc[:, [2, 4, -1]]
print(subset.head())

   year       pop   gdpPercap
0  1952   8425333  779.445314
1  1957   9240934  820.853030
2  1962  10267083  853.100710
3  1967  11537966  836.197138
4  1972  13079460  739.981106


In [27]:
# Returns a KeyError if we mix up the attributes.
subset = df.loc[:, [2, 4, -1]] # KeyError
print(subset.head())

KeyError: 'None of [[2, 4, -1]] are in the [columns]'

In [28]:
subset = df.iloc[:, ['year', 'pop']] # TypeError
print(subset.head())

TypeError: cannot perform reduce with flexible type

In [29]:
# Subset columns with range().
small_range = list(range(5)) # [0, 1, 2, 3, 4] is returned
subset = df.iloc[:, small_range]
print(subset.head())

       country continent  year  lifeExp       pop
0  Afghanistan      Asia  1952   28.801   8425333
1  Afghanistan      Asia  1957   30.332   9240934
2  Afghanistan      Asia  1962   31.997  10267083
3  Afghanistan      Asia  1967   34.020  11537966
4  Afghanistan      Asia  1972   36.088  13079460


In [30]:
# Subset columns with range().
small_range = list(range(3, 6)) # [3, 4, 5] is returned
subset = df.iloc[:, small_range]
print(subset.head())

   lifeExp       pop   gdpPercap
0   28.801   8425333  779.445314
1   30.332   9240934  820.853030
2   31.997  10267083  853.100710
3   34.020  11537966  836.197138
4   36.088  13079460  739.981106


In [31]:
# Subset columns with range().
small_range = list(range(0, 6, 2)) # [0, 2, 4] is returned
subset = df.iloc[:, small_range]
print(subset.head())

       country  year       pop
0  Afghanistan  1952   8425333
1  Afghanistan  1957   9240934
2  Afghanistan  1962  10267083
3  Afghanistan  1967  11537966
4  Afghanistan  1972  13079460


In [32]:
# Subset columns using slicing only.
small_range = list(range(3)) # [0, 1, 2]
subset = df.iloc[:, small_range]
print(subset.head(), end='\n'*2)

subset = df.iloc[:, :3] # Exclusive of the integer passed.
print(subset.head())

       country continent  year
0  Afghanistan      Asia  1952
1  Afghanistan      Asia  1957
2  Afghanistan      Asia  1962
3  Afghanistan      Asia  1967
4  Afghanistan      Asia  1972

       country continent  year
0  Afghanistan      Asia  1952
1  Afghanistan      Asia  1957
2  Afghanistan      Asia  1962
3  Afghanistan      Asia  1967
4  Afghanistan      Asia  1972


In [33]:
subset = df.iloc[:, 3:6] # Exclusive of the integer passed.
print(subset.head())

   lifeExp       pop   gdpPercap
0   28.801   8425333  779.445314
1   30.332   9240934  820.853030
2   31.997  10267083  853.100710
3   34.020  11537966  836.197138
4   36.088  13079460  739.981106


In [34]:
subset = df.iloc[:, 0:6:2] # Exclusive of the integer passed.
print(subset.head())

       country  year       pop
0  Afghanistan  1952   8425333
1  Afghanistan  1957   9240934
2  Afghanistan  1962  10267083
3  Afghanistan  1967  11537966
4  Afghanistan  1972  13079460


In [35]:
# Slicing works like Python, so you can leave values out.
subset_1 = df.iloc[:, 0:6:].head()
subset_2 = df.iloc[:, 0::2].head()
subset_3 = df.iloc[:, :6:2].head()
subset_4 = df.iloc[:, ::].head()

print(subset_1, end='\n' * 2)
print(subset_2, end='\n' * 2)
print(subset_3, end='\n' * 2)
print(subset_4)

       country continent  year  lifeExp       pop   gdpPercap
0  Afghanistan      Asia  1952   28.801   8425333  779.445314
1  Afghanistan      Asia  1957   30.332   9240934  820.853030
2  Afghanistan      Asia  1962   31.997  10267083  853.100710
3  Afghanistan      Asia  1967   34.020  11537966  836.197138
4  Afghanistan      Asia  1972   36.088  13079460  739.981106

       country  year       pop
0  Afghanistan  1952   8425333
1  Afghanistan  1957   9240934
2  Afghanistan  1962  10267083
3  Afghanistan  1967  11537966
4  Afghanistan  1972  13079460

       country  year       pop
0  Afghanistan  1952   8425333
1  Afghanistan  1957   9240934
2  Afghanistan  1962  10267083
3  Afghanistan  1967  11537966
4  Afghanistan  1972  13079460

       country continent  year  lifeExp       pop   gdpPercap
0  Afghanistan      Asia  1952   28.801   8425333  779.445314
1  Afghanistan      Asia  1957   30.332   9240934  820.853030
2  Afghanistan      Asia  1962   31.997  10267083  853.100710
3  Af

In [36]:
# Specify rows and columns using loc attribute.
print(df.loc[42, 'country'])

Angola


In [37]:
# Specify rows and columns using iloc attribute.
print(df.iloc[42, 0])

Angola


In [38]:
# Subsetting mutiple rows and columns. Rows: 0, 99, 999. Columns: country, lifeExp, gdpPercap.
print(df.iloc[[0, 99, 999], [0, 3, 5]])

         country  lifeExp    gdpPercap
0    Afghanistan   28.801   779.445314
99    Bangladesh   43.453   721.186086
999     Mongolia   51.253  1226.041130


In [39]:
# Subsetting mutiple rows and columns. Rows: 0, 99, 999. Columns: country, lifeExp, gdpPercap.
print(df.loc[[0, 99, 999], ['country', 'lifeExp', 'gdpPercap']])

         country  lifeExp    gdpPercap
0    Afghanistan   28.801   779.445314
99    Bangladesh   43.453   721.186086
999     Mongolia   51.253  1226.041130


In [40]:
# Subsetting mutiple rows and columns. Rows: 0, 99, 999. Columns: country, lifeExp, gdpPercap.
print(df.loc[10:13, ['country', 'lifeExp', 'gdpPercap']])

        country  lifeExp    gdpPercap
10  Afghanistan   42.129   726.734055
11  Afghanistan   43.828   974.580338
12      Albania   55.230  1601.056136
13      Albania   59.280  1942.284244
