.columns
.index
.set_index("name")
.reset_index(drop=True)
.sort_index()

In [2]:
import pandas as np
temperatures = np.read_csv("temperatures.csv")

In [3]:
# 1. Setting and removing indexes
# Look at temperatures
print(temperatures)

# Index temperatures by city
temperatures_ind = temperatures.set_index("city")

# Look at temperatures_ind
print(temperatures_ind)

# Reset the index, keeping its contents
print(temperatures_ind.reset_index())

# Reset the index, dropping its contents
print(temperatures_ind.reset_index(drop=True))

       Unnamed: 0        date     city        country  avg_temp_c
0               0  2000-01-01  Abidjan  Côte D'Ivoire      27.293
1               1  2000-02-01  Abidjan  Côte D'Ivoire      27.685
2               2  2000-03-01  Abidjan  Côte D'Ivoire      29.061
3               3  2000-04-01  Abidjan  Côte D'Ivoire      28.162
4               4  2000-05-01  Abidjan  Côte D'Ivoire      27.547
5               5  2000-06-01  Abidjan  Côte D'Ivoire      25.812
6               6  2000-07-01  Abidjan  Côte D'Ivoire      24.870
7               7  2000-08-01  Abidjan  Côte D'Ivoire      24.884
8               8  2000-09-01  Abidjan  Côte D'Ivoire      25.405
9               9  2000-10-01  Abidjan  Côte D'Ivoire      26.074
10             10  2000-11-01  Abidjan  Côte D'Ivoire      27.315
11             11  2000-12-01  Abidjan  Côte D'Ivoire      26.929
12             12  2001-01-01  Abidjan  Côte D'Ivoire      26.920
13             13  2001-02-01  Abidjan  Côte D'Ivoire      28.234
14        

In [4]:
# 2. Subsetting with .loc[]
# Make a list of cities to subset on
cities = ["Moscow", "Saint Petersburg"]

# Subset temperatures using square brackets
print(temperatures[temperatures["city"].isin(cities)])

# Subset temperatures_ind using .loc[]
print(temperatures_ind.loc[cities])

       Unnamed: 0        date              city country  avg_temp_c
10725       10725  2000-01-01            Moscow  Russia      -7.313
10726       10726  2000-02-01            Moscow  Russia      -3.551
10727       10727  2000-03-01            Moscow  Russia      -1.661
10728       10728  2000-04-01            Moscow  Russia      10.096
10729       10729  2000-05-01            Moscow  Russia      10.357
10730       10730  2000-06-01            Moscow  Russia      15.243
10731       10731  2000-07-01            Moscow  Russia      18.676
10732       10732  2000-08-01            Moscow  Russia      16.420
10733       10733  2000-09-01            Moscow  Russia       9.775
10734       10734  2000-10-01            Moscow  Russia       6.611
10735       10735  2000-11-01            Moscow  Russia      -0.168
10736       10736  2000-12-01            Moscow  Russia      -2.954
10737       10737  2001-01-01            Moscow  Russia      -4.914
10738       10738  2001-02-01            Moscow 

In [5]:
# 3. Setting multi-level indexes
# Index temperatures by country & city
temperatures_ind = temperatures.set_index(["country", "city"])

# List of tuples: Brazil, Rio De Janeiro & Pakistan, Lahore
rows_to_keep = [("Brazil", "Rio De Janeiro"), ("Pakistan", "Lahore")]

# Subset for rows to keep
print(temperatures_ind.loc[rows_to_keep])

                         Unnamed: 0        date  avg_temp_c
country  city                                              
Brazil   Rio De Janeiro       12540  2000-01-01      25.974
         Rio De Janeiro       12541  2000-02-01      26.699
         Rio De Janeiro       12542  2000-03-01      26.270
         Rio De Janeiro       12543  2000-04-01      25.750
         Rio De Janeiro       12544  2000-05-01      24.356
         Rio De Janeiro       12545  2000-06-01      24.059
         Rio De Janeiro       12546  2000-07-01      22.049
         Rio De Janeiro       12547  2000-08-01      22.447
         Rio De Janeiro       12548  2000-09-01      22.099
         Rio De Janeiro       12549  2000-10-01      23.844
         Rio De Janeiro       12550  2000-11-01      23.601
         Rio De Janeiro       12551  2000-12-01      25.209
         Rio De Janeiro       12552  2001-01-01      26.852
         Rio De Janeiro       12553  2001-02-01      27.992
         Rio De Janeiro       12554  200

In [6]:
# 4. Sorting by index values
# Sort temperatures_ind by index values
print(temperatures_ind.sort_index())

# Sort temperatures_ind by index values at the city level
print(temperatures_ind.sort_index(level="city"))

# Sort temperatures_ind by country then descending city
print(temperatures_ind.sort_index(level=["country", "city"], ascending =[True, False]))

                    Unnamed: 0        date  avg_temp_c
country     city                                      
Afghanistan Kabul         7260  2000-01-01       3.326
            Kabul         7261  2000-02-01       3.454
            Kabul         7262  2000-03-01       9.612
            Kabul         7263  2000-04-01      17.925
            Kabul         7264  2000-05-01      24.658
            Kabul         7265  2000-06-01      25.582
            Kabul         7266  2000-07-01      26.107
            Kabul         7267  2000-08-01      25.459
            Kabul         7268  2000-09-01      22.116
            Kabul         7269  2000-10-01      16.806
            Kabul         7270  2000-11-01       9.720
            Kabul         7271  2000-12-01       5.107
            Kabul         7272  2001-01-01       2.208
            Kabul         7273  2001-02-01       5.567
            Kabul         7274  2001-03-01      10.807
            Kabul         7275  2001-04-01      16.587
          

Slicing and subsetting with .loc and .iloc


In [7]:
# 5. Slicing index values
# Sort the index of temperatures_ind
temperatures_srt = temperatures_ind.sort_index()

# Subset rows from Pakistan to Russia
print(temperatures_srt.loc["Pakistan":"Russia"])

# Try to subset rows from Lahore to Moscow
print(temperatures_srt.loc["Lahore":"Moscow"])

# Subset rows from Pakistan, Lahore to Russia, Moscow
print(temperatures_srt.loc[("Pakistan", "Lahore"): ("Russia","Moscow")])

                           Unnamed: 0        date  avg_temp_c
country  city                                                
Pakistan Faisalabad              4785  2000-01-01      12.792
         Faisalabad              4786  2000-02-01      14.339
         Faisalabad              4787  2000-03-01      20.309
         Faisalabad              4788  2000-04-01      29.072
         Faisalabad              4789  2000-05-01      34.845
         Faisalabad              4790  2000-06-01      34.299
         Faisalabad              4791  2000-07-01      32.302
         Faisalabad              4792  2000-08-01      32.255
         Faisalabad              4793  2000-09-01      30.438
         Faisalabad              4794  2000-10-01      27.395
         Faisalabad              4795  2000-11-01      20.640
         Faisalabad              4796  2000-12-01      15.195
         Faisalabad              4797  2001-01-01      11.853
         Faisalabad              4798  2001-02-01      16.701
        

In [8]:
# 6. Slicing in both directions
# Subset rows from India, Hyderabad to Iraq, Baghdad
print(temperatures_srt.loc[("India","Hyderabad"):("Iraq", "Baghdad")])

# Subset columns from date to avg_temp_c
print(temperatures_srt.loc[:, "date":"avg_temp_c"])

# Subset in both directions at once
print(temperatures_srt.loc[("India","Hyderabad"):("Iraq", "Baghdad"), "date":"avg_temp_c"])

                   Unnamed: 0        date  avg_temp_c
country city                                         
India   Hyderabad        5940  2000-01-01      23.779
        Hyderabad        5941  2000-02-01      25.826
        Hyderabad        5942  2000-03-01      28.821
        Hyderabad        5943  2000-04-01      32.698
        Hyderabad        5944  2000-05-01      32.438
        Hyderabad        5945  2000-06-01      28.422
        Hyderabad        5946  2000-07-01      27.137
        Hyderabad        5947  2000-08-01      26.576
        Hyderabad        5948  2000-09-01      27.433
        Hyderabad        5949  2000-10-01      26.927
        Hyderabad        5950  2000-11-01      24.776
        Hyderabad        5951  2000-12-01      21.949
        Hyderabad        5952  2001-01-01      23.406
        Hyderabad        5953  2001-02-01      26.677
        Hyderabad        5954  2001-03-01      29.393
        Hyderabad        5955  2001-04-01      31.289
        Hyderabad        595

In [9]:
# 7. Slicing time series
# Use Boolean conditions to subset temperatures for rows in 2010 and 2011
temperatures_bool = temperatures[(temperatures["date"] >= "2010") & (temperatures["date"] < "2012")]
print(temperatures_bool)

# Set date as an index and sort the index
temperatures_ind = temperatures.set_index("date").sort_index()

# Use .loc[] to subset temperatures_ind for rows in 2010 and 2011
print(temperatures_ind.loc["2010":"2011"])

# Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011
print(temperatures_ind.loc["2010-08":"2011-02"])

       Unnamed: 0        date         city        country  avg_temp_c
120           120  2010-01-01      Abidjan  Côte D'Ivoire      28.270
121           121  2010-02-01      Abidjan  Côte D'Ivoire      29.262
122           122  2010-03-01      Abidjan  Côte D'Ivoire      29.596
123           123  2010-04-01      Abidjan  Côte D'Ivoire      29.068
124           124  2010-05-01      Abidjan  Côte D'Ivoire      28.258
125           125  2010-06-01      Abidjan  Côte D'Ivoire      26.683
126           126  2010-07-01      Abidjan  Côte D'Ivoire      25.589
127           127  2010-08-01      Abidjan  Côte D'Ivoire      25.400
128           128  2010-09-01      Abidjan  Côte D'Ivoire      25.710
129           129  2010-10-01      Abidjan  Côte D'Ivoire      26.397
130           130  2010-11-01      Abidjan  Côte D'Ivoire      27.446
131           131  2010-12-01      Abidjan  Côte D'Ivoire      27.666
132           132  2011-01-01      Abidjan  Côte D'Ivoire      27.360
133           133  2

In [10]:
# 8. Subsetting by row/column number
# Use .iloc[] on temperatures to take subsets.

# Get the 23rd row, 2nd column (index positions 22 and 1).
# Get the first 5 rows (index positions 0 to 5).
# Get all rows, columns 3 and 4 (index positions 2 to 4).
# Get the first 5 rows, columns 3 and 4.
# Get 23rd row, 2nd column (index 22, 1)
print(temperatures.iloc[22,1])

# Use slicing to get the first 5 rows
print(temperatures.iloc[0:6])

# Use slicing to get columns 3 to 4
print(temperatures.iloc[:, 2:4])

# Use slicing in both directions at once
print(temperatures.iloc[0:5, 2:4])

2001-11-01
   Unnamed: 0        date     city        country  avg_temp_c
0           0  2000-01-01  Abidjan  Côte D'Ivoire      27.293
1           1  2000-02-01  Abidjan  Côte D'Ivoire      27.685
2           2  2000-03-01  Abidjan  Côte D'Ivoire      29.061
3           3  2000-04-01  Abidjan  Côte D'Ivoire      28.162
4           4  2000-05-01  Abidjan  Côte D'Ivoire      27.547
5           5  2000-06-01  Abidjan  Côte D'Ivoire      25.812
          city        country
0      Abidjan  Côte D'Ivoire
1      Abidjan  Côte D'Ivoire
2      Abidjan  Côte D'Ivoire
3      Abidjan  Côte D'Ivoire
4      Abidjan  Côte D'Ivoire
5      Abidjan  Côte D'Ivoire
6      Abidjan  Côte D'Ivoire
7      Abidjan  Côte D'Ivoire
8      Abidjan  Côte D'Ivoire
9      Abidjan  Côte D'Ivoire
10     Abidjan  Côte D'Ivoire
11     Abidjan  Côte D'Ivoire
12     Abidjan  Côte D'Ivoire
13     Abidjan  Côte D'Ivoire
14     Abidjan  Côte D'Ivoire
15     Abidjan  Côte D'Ivoire
16     Abidjan  Côte D'Ivoire
17     Abidjan 

In [12]:
# 9. Pivot temperature by city and year
# You can access the components of a date (year, month and day) using code of the form dataframe["column"].dt.component
# Add a year column to temperatures
temperatures["year"] = temperatures["date"].dt.year

# Pivot avg_temp_c by country and city vs year
temp_by_country_city_vs_year = temperatures.pivot_table("avg_temp_c", index=["country", "city"], columns="year")

# See the result
print(temp_by_country_city_vs_year)

AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
# 10. Subsetting pivot tables
# Subset for Egypt to India
temp_by_country_city_vs_year.loc["Egypt":"India"]

# Subset for Egypt, Cairo to India, Delhi
temp_by_country_city_vs_year.loc[("Egypt", "Cairo"): ("India", "Delhi")]

# Subset in both directions at once
temp_by_country_city_vs_year.loc[("Egypt", "Cairo"): ("India", "Delhi"), "2005":"2010"]

In [None]:
# 11. Calculating on a pivot table
# Get the worldwide mean temp by year
mean_temp_by_year = temp_by_country_city_vs_year.mean()

# Filter for the year that had the highest mean temp
print(mean_temp_by_year[mean_temp_by_year == mean_temp_by_year.max()])

# Get the mean temp by city  for each city (across columns)
mean_temp_by_city = temp_by_country_city_vs_year.mean(axis="columns")

# Filter for the city that had the lowest mean temp
print(mean_temp_by_city[mean_temp_by_city == mean_temp_by_city.min()])