## Indexing with Pandas? 
#Reasons for the index: Identification, Selection, Alignment

In [33]:
import pandas as pd

In [34]:
#Load the drinks-by-country CSV from a URL into a DataFrame
drinks_path = 'http://bit.ly/drinksbycountry'
drinks = pd.read_csv(drinks_path)

In [35]:
#Preview the first three rows; the unlabeled leftmost column of the output is the index
drinks.head(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa


In [36]:
#read #index #pandas
#The index holds the row labels; by default it is a RangeIndex of consecutive integers
drinks.index

RangeIndex(start=0, stop=193, step=1)

In [37]:
#read #cols #pandas
#The column labels are stored in an Index object as well
drinks.columns

Index(['country', 'beer_servings', 'spirit_servings', 'wine_servings',
       'total_litres_of_pure_alcohol', 'continent'],
      dtype='object')

### Why and how to use indexes

In [38]:
#Identification: each row keeps its original index label when a subset is selected
#(the filtered rows are not re-numbered from 0)
drinks[drinks.continent=='South America']

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
6,Argentina,193,25,221,8.3,South America
20,Bolivia,167,41,8,3.8,South America
23,Brazil,245,145,16,7.2,South America
35,Chile,130,124,172,7.6,South America
37,Colombia,159,76,3,4.2,South America
52,Ecuador,162,74,3,4.2,South America
72,Guyana,93,302,1,7.1,South America
132,Paraguay,213,117,74,7.3,South America
133,Peru,163,160,21,6.1,South America
163,Suriname,128,178,7,5.6,South America


In [39]:
#Use the index label to identify a row unambiguously: .loc[row label, column label]
drinks.loc[23,'country']

'Brazil'

In [40]:
#IMPORTANT: PANDAS
#set_index #index #select #read #prepare #pandas
#Use the country column as the index. set_index returns a new DataFrame, so the
#name is rebound to the result instead of mutating the frame with inplace=True.
drinks = drinks.set_index('country')
drinks.head()

Unnamed: 0_level_0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,0,0,0,0.0,Asia
Albania,89,132,54,4.9,Europe
Algeria,25,0,14,0.7,Africa
Andorra,245,138,312,12.4,Europe
Angola,217,57,45,5.9,Africa


In [41]:
#The index now holds the country names and carries the name 'country'
drinks.index

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'Tanzania', 'USA', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela',
       'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', name='country', length=193)

In [42]:
#country is no longer listed among the columns, because it has become the index
drinks.columns

Index(['beer_servings', 'spirit_servings', 'wine_servings',
       'total_litres_of_pure_alcohol', 'continent'],
      dtype='object')

In [43]:
#Indexing is more intuitive with country names than with numbers
drinks.loc['Albania','beer_servings']

89

### How to delete the index name

In [44]:
#Delete the index name, i.e. the 'country' label that head() displays above the row labels
drinks.index.name=None
drinks.head(3)

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
Afghanistan,0,0,0,0.0,Asia
Albania,89,132,54,4.9,Europe
Algeria,25,0,14,0.7,Africa


### How to reset the index

In [45]:
#IMPORTANT: PANDAS
#reset_index #index #select #read #prepare #pandas
#Reset the index and move the country labels back into a column called country.
#The index is named first (rename_axis) so that the restored column gets the right name.
#Both calls return new objects, so the name is rebound instead of using inplace=True.
drinks = drinks.rename_axis('country').reset_index()
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [46]:
#Summary statistics of the numeric columns; the result is itself a DataFrame
drinks.describe()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
count,193.0,193.0,193.0,193.0
mean,106.160622,80.994819,49.450777,4.717098
std,101.143103,88.284312,79.697598,3.773298
min,0.0,0.0,0.0,0.0
25%,20.0,4.0,1.0,1.3
50%,76.0,56.0,8.0,4.2
75%,188.0,128.0,59.0,7.2
max,376.0,438.0,370.0,14.4


### Some examples of how to work with indexes

In [47]:
#The results of methods are often new DataFrames, which have their own index and columns
drinks.describe().index

Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'], dtype='object')

In [48]:
#Select a single value of the .describe() result by its row label and column label
drinks.describe().loc['25%', 'beer_servings']

20.0

In [49]:
  #Make country the index again for the examples below. set_index returns a new
  #DataFrame, so the name is rebound instead of mutating with inplace=True.
  drinks = drinks.set_index('country')
  drinks.head(3)

Unnamed: 0_level_0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,0,0,0,0.0,Asia
Albania,89,132,54,4.9,Europe
Algeria,25,0,14,0.7,Africa


In [50]:
#Selecting a single column gives a Series that carries the same index as the DataFrame
drinks.continent.head()

country
Afghanistan      Asia
Albania        Europe
Algeria        Africa
Andorra        Europe
Angola         Africa
Name: continent, dtype: object

In [51]:
#The continent names become the index of the Series returned by .value_counts()
drinks.continent.value_counts()

Africa           53
Europe           45
Asia             44
North America    23
Oceania          16
South America    12
Name: continent, dtype: int64

In [52]:
#IMPORTANT: PANDAS
#value_counts #index #select #pandas
#Get the index label at position 0 via the .index attribute
drinks.continent.value_counts().index[0]

'Africa'

In [53]:
#IMPORTANT: PANDAS
#value_counts #index #select #pandas
#Get the count at position 0 via the .values attribute
drinks.continent.value_counts().values[0]

53

In [54]:
#IMPORTANT: PANDAS
#sort_index #value_counts #select #pandas
#Sort by index labels with sort_index; sort by values with sort_values
drinks.continent.value_counts().sort_index()

Africa           53
Asia             44
Europe           45
North America    23
Oceania          16
South America    12
Name: continent, dtype: int64

### Show how two pandas objects that share the same index work with each other

In [55]:
#Create a Series of populations whose index labels are country names
#(the same kind of labels that df drinks uses as its index)
#NOTE(review): 300000 for Albania looks like it may be missing a zero - confirm
people = pd.Series({'Albania': 300000, 'Andorra': 85000}, name='population')

In [56]:
#Display the Series: country labels on the left, population values on the right
people

Albania    300000
Andorra     85000
Name: population, dtype: int64

In [57]:
#The drinks index also consists of country names, so the two objects can be matched by label
drinks.index

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'Tanzania', 'USA', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela',
       'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', name='country', length=193)

In [58]:
#IMPORTANT: PANDAS
#Calculate total beer servings (people * mean beer servings)
#Pandas aligns the two Series by their index labels before multiplying;
#countries that have no entry in people get NaN in the result
drinks.beer_servings*people

Afghanistan           NaN
Albania        26700000.0
Algeria               NaN
Andorra        20825000.0
Angola                NaN
                  ...    
Venezuela             NaN
Vietnam               NaN
Yemen                 NaN
Zambia                NaN
Zimbabwe              NaN
Length: 193, dtype: float64

In [59]:
#Concatenating along axis=1 also aligns on the index: people is added as a new
#'population' column, with NaN for the countries it does not contain
pd.concat([drinks,people],axis=1).head()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent,population
Afghanistan,0,0,0,0.0,Asia,
Albania,89,132,54,4.9,Europe,300000.0
Algeria,25,0,14,0.7,Africa,
Andorra,245,138,312,12.4,Europe,85000.0
Angola,217,57,45,5.9,Africa,
