# Introduction to Pandas
Pandas is a high-level data manipulation package built on top of NumPy.
The key data structures in pandas are the Series and the DataFrame.

## Series
A Series is a one-dimensional ndarray with axis labels (an index). 

In [2]:
# Importing packages
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [3]:
# Build a Series from a plain Python list
# The output shows three things: the index (left), the data (right) and the data type (bottom)
x = pd.Series(data=[10, 20, 30, 40, 50])
x

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [4]:
# We can access the different components separately:

In [5]:
# Accessing the index (no index was supplied, so it is the default integer range)
x.index

RangeIndex(start=0, stop=5, step=1)

In [6]:
# Accessing the values (returned as a NumPy array)
x.values

array([10, 20, 30, 40, 50], dtype=int64)

In [9]:
# Accessing the data type
# A Series has one dtype shared by all of its values; a mix of Python objects
# is only possible when that dtype is the generic 'object' dtype
x.dtype

dtype('int64')

In [8]:
# Building a Series with our own index labels (one label per value)
data = [450, 650, 870]
Sales = Series(data=data, index=['Don', 'Mike', 'Edwin'])
Sales

Don      450
Mike     650
Edwin    870
dtype: int64

In [22]:
# Confirming that Sales is a pandas Series
type(Sales)

pandas.core.series.Series

In [23]:
# The index now holds our string labels instead of a numeric range
Sales.index

Index(['Don', 'Mike', 'Edwin'], dtype='object')

### Accessing values 

In [24]:
# Accessing a value by its index label
Sales['Don']

450

In [25]:
# Accessing a value by its integer position
# Use .iloc for positions: plain [] on a label-indexed Series treats the key as a label,
# and falling back to a position (Sales[0]) is deprecated in recent pandas releases
Sales.iloc[0]

450

 ### Checking for conditions

In [26]:
# We can filter our data with conditions, which are evaluated to booleans
# For example, which sales are greater than 500?
# Note that the comparison itself returns a boolean Series, not the filtered data
Sales>500

Don      False
Mike      True
Edwin     True
dtype: bool

In [27]:
# Passing a list of booleans keeps only the entries whose boolean is True
Sales[[False,True,True]]

Mike     650
Edwin    870
dtype: int64

In [28]:
# Putting both steps together: the comparison result is used as a mask,
# so only the sales greater than 500 are returned
Sales[Sales>500]

Mike     650
Edwin    870
dtype: int64

In [29]:
# `in` checks whether a label is present in the index
'Don' in Sales

True

In [30]:
# False, because 'Sally' is not one of the index labels
'Sally' in Sales

False

In [31]:
# What about this? - False as well: 450 is a value, and `in` only looks at the index labels
450 in Sales

False

### Working with Dictionaries 

In [32]:
# Converting a Series to a dictionary
sales_dict = Sales.to_dict()

In [33]:
# The index labels became the keys and the Series values became the dictionary values
sales_dict

{'Don': 450, 'Mike': 650, 'Edwin': 870}

In [35]:
# Converting a dictionary to a Series
sales_ser = Series(sales_dict)

In [40]:
# The dictionary keys became the index labels
sales_ser

Don      450
Mike     650
Edwin    870
dtype: int64

### Adding entries and working with NaN/null values

In [42]:
# We can create a new Series from an already existing Series.
# Labels in the new index that were not in the original are assigned NaN values
# What is NaN? - "Not a Number"; once NaN is present the values are shown as floats
new_sales = Series(Sales, index=['Don','Mike','Sally','Edwin','Lucy'])
new_sales

Don      450.0
Mike     650.0
Sally      NaN
Edwin    870.0
Lucy       NaN
dtype: float64

In [43]:
# Checking whether a single entry is NaN - we can use numpy
np.isnan(new_sales['Sally'])

True

In [44]:
# NaN is not the same thing as Python's None
new_sales['Sally'] is None

False

In [45]:
# isnan also works element by element on the entire Series
np.isnan(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

In [46]:
# Checking for null values using pandas (same result as np.isnan here)
pd.isnull(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

### Naming components in Series

In [38]:
# Giving the index a name
Sales.index.name = "Sales person"

In [39]:
# The index name is now shown above the index labels
Sales

Sales person
Don      450
Mike     650
Edwin    870
dtype: int64

In [47]:
# Giving the Series itself a name
Sales.name = 'Total TV Sales'

In [48]:
# The Series name is shown on the last line of the output, next to the dtype
Sales

Sales person
Don      450
Mike     650
Edwin    870
Name: Total TV Sales, dtype: int64

## DataFrames
DataFrames are two-dimensional, size-mutable, potentially heterogeneous (diverse) tabular data structures. This data structure contains labeled axes (rows and columns).

### Creating DataFrames

In [169]:
# Creating a DataFrame from a list of [name, age] rows
data = [['Adrian',20],['Bethany',23],['Chloe',41]]

# As we create a DataFrame, we can specify what the column names are.
# A dtype passed to the constructor applies to every column, so dtype=int fails on the
# string 'Name' column in current pandas; we set the type of 'Age' alone with astype instead
df = pd.DataFrame(data, columns=['Name','Age']).astype({'Age': 'int64'})

df

Unnamed: 0,Name,Age
0,Adrian,20
1,Bethany,23
2,Chloe,41


In [170]:
# Creating a DataFrame from a dictionary of equal-length lists
new_dict = {
    'Name': ['Tom', 'Jane', 'Steve', 'Lucy'],
    'Sales': [250, 300, 350, 400],
}

# No column names are passed here: each dictionary key becomes a column name
df_dict = pd.DataFrame(data=new_dict)
df_dict

Unnamed: 0,Name,Sales
0,Tom,250
1,Jane,300
2,Steve,350
3,Lucy,400


In [171]:
# Adding a custom index
# So far we have created DataFrames with the default integer index
# The index parameter lets us supply our own row labels, one per row
df_dict_index = pd.DataFrame(new_dict, index=['rank1','rank2','rank3','rank4'])
df_dict_index

Unnamed: 0,Name,Sales
rank1,Tom,250
rank2,Jane,300
rank3,Steve,350
rank4,Lucy,400


In [172]:
# Creating a DataFrame from a list of dictionaries (one dictionary per row)
# This is the same data as before in a different layout
# This layout is useful when working with json
dict_list = [
    {'Name': 'Tom', 'Sales': 250},
    {'Name': 'Jane', 'Sales': 300},
    {'Name': 'Steve', 'Sales': 350},
    {'Name': 'Lucy', 'Sales': 400},
]

df_dict_list = pd.DataFrame(data=dict_list)
df_dict_list

Unnamed: 0,Name,Sales
0,Tom,250
1,Jane,300
2,Steve,350
3,Lucy,400


In [173]:
# Creating a DataFrame from a dictionary of Series
# First we build the two Series; note that east has no 'Q4' entry
east = pd.Series([1000,1200,3400],index=['Q1','Q2','Q3'])
west = pd.Series([1100,1300,2400,3500],index=['Q1','Q2','Q3','Q4'])
# Only the last expression of a cell is displayed, so just west is shown below
east
west

Q1    1100
Q2    1300
Q3    2400
Q4    3500
dtype: int64

In [174]:
# Series can be combined into a DataFrame by passing a dictionary of column name -> Series
# (a DataFrame can be built from a single Series in the same way)
# Rows are matched on the index labels: east has no Q4, so East is NaN there and is shown as float
df_region = pd.DataFrame({'East':east,'West':west})
df_region

Unnamed: 0,East,West
Q1,1000.0,1100
Q2,1200.0,1300
Q3,3400.0,2400
Q4,,3500


In [175]:
# Once we have a DataFrame, we can add new columns by assigning to a new column name
# Each list supplies one value per existing row
df_region['North'] = [2000,3000,2500,4000]
df_region['South'] = [1500,2000,1500,4000]
df_region

Unnamed: 0,East,West,North,South
Q1,1000.0,1100,2000,1500
Q2,1200.0,1300,3000,2000
Q3,3400.0,2400,2500,1500
Q4,,3500,4000,4000


### Shifting and Changing the Index

In [176]:
# If we made a mistake and need a different index, we can add a new column
# and then set that new column as the index

years = ['2016','2017','2018','2019']
df_region['years'] = years
df_region

Unnamed: 0,East,West,North,South,years
Q1,1000.0,1100,2000,1500,2016
Q2,1200.0,1300,3000,2000,2017
Q3,3400.0,2400,2500,1500,2018
Q4,,3500,4000,4000,2019


In [177]:
# We can use set_index to make one of the DataFrame's columns the index
# Note: 'years' stops being a regular column after this, so running this cell a second
# time fails because there is no 'years' column left to use
df_region = df_region.set_index('years')
df_region

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,1000.0,1100,2000,1500
2017,1200.0,1300,3000,2000
2018,3400.0,2400,2500,1500
2019,,3500,4000,4000


In [178]:
# Let's say we want a different set of index labels, we can use reindex
# reindex keeps the rows whose labels we ask for ('2016' is left out here)
# and adds rows of NaN for labels that were not present before ('2020', '2021')
new_df = df_region.reindex(['2017','2018','2019','2020','2021'])
new_df

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,1200.0,1300.0,3000.0,2000.0
2018,3400.0,2400.0,2500.0,1500.0
2019,,3500.0,4000.0,4000.0
2020,,,,
2021,,,,


In [179]:
# reindex can also be used on columns
# We can reorder columns, leave some out ('West'), or add a new one filled with NaN ('New')
re_indexed = new_df.reindex(columns=['North','East','South','New'])
re_indexed

Unnamed: 0_level_0,North,East,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,3000.0,1200.0,2000.0,
2018,2500.0,3400.0,1500.0,
2019,4000.0,,4000.0,
2020,,,,
2021,,,,


### Missing Data

In [180]:
# Filling in missing values
# We may want to change all NaN values to 0 (or some other specific number)
# This is especially useful when working with certain types of algorithms,
# as some algorithms cannot deal with NaN values

re_indexed.fillna(0) # the result is only displayed here; it is not assigned to a variable

Unnamed: 0_level_0,North,East,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,3000.0,1200.0,2000.0,0.0
2018,2500.0,3400.0,1500.0,0.0
2019,4000.0,0.0,4000.0,0.0
2020,0.0,0.0,0.0,0.0
2021,0.0,0.0,0.0,0.0


In [181]:
# If we don't want to fill with a single value, we can propagate neighbouring values instead
# ffill() carries the last valid value forward; bfill() pulls the next valid value backward
# (fillna(method=...) is deprecated in recent pandas, so we call ffill() directly)
re_indexed.ffill()

Unnamed: 0_level_0,North,East,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,3000.0,1200.0,2000.0,
2018,2500.0,3400.0,1500.0,
2019,4000.0,3400.0,4000.0,
2020,4000.0,3400.0,4000.0,
2021,4000.0,3400.0,4000.0,


In [182]:
# We can also use interpolation
# The default method is linear, this can be changed if needed
# Note: the missing values here all come after the last known value in each column,
# so the result below is the same as the forward fill shown earlier
re_indexed.interpolate()

Unnamed: 0_level_0,North,East,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,3000.0,1200.0,2000.0,
2018,2500.0,3400.0,1500.0,
2019,4000.0,3400.0,4000.0,
2020,4000.0,3400.0,4000.0,
2021,4000.0,3400.0,4000.0,


### Dropping items in DataFrames

In [183]:
# If we don't want to fill NaN values, we can drop them instead

# dropna() on its own drops every row that contains at least one NaN value
# Here that removes all rows, because the 'New' column is NaN everywhere
# We might not want to do this, so we can specify more parameters
re_indexed.dropna()

Unnamed: 0_level_0,North,East,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [184]:
# We can choose whether to drop rows or columns, and how strict to be

# axis: 1 = columns, 0 = rows
# how: 'all' = drop a row/column only if all of its values are NA,
#      'any' = drop a row/column if any NA value is present
re_indexed.dropna(axis=1, how='all')

Unnamed: 0_level_0,North,East,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017,3000.0,1200.0,2000.0
2018,2500.0,3400.0,1500.0
2019,4000.0,,4000.0
2020,,,
2021,,,


In [185]:
# We can also set a threshold: thresh=n keeps only the rows with at least n non-NA values
re_indexed.dropna(thresh=1)

Unnamed: 0_level_0,North,East,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,3000.0,1200.0,2000.0,
2018,2500.0,3400.0,1500.0,
2019,4000.0,,4000.0,


In [186]:
# 2019 has exactly 2 non-NA values, so it still passes a threshold of 2
re_indexed.dropna(thresh=2)

Unnamed: 0_level_0,North,East,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,3000.0,1200.0,2000.0,
2018,2500.0,3400.0,1500.0,
2019,4000.0,,4000.0,


In [187]:
# With a threshold of 3, 2019 is dropped as it only has 2 non-NA values
re_indexed.dropna(thresh=3)

Unnamed: 0_level_0,North,East,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,3000.0,1200.0,2000.0,
2018,2500.0,3400.0,1500.0,


In [188]:
# Dropping a row by its index label
re_indexed.drop('2019')

Unnamed: 0_level_0,North,East,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,3000.0,1200.0,2000.0,
2018,2500.0,3400.0,1500.0,
2020,,,,
2021,,,,


In [221]:
# Duplicated rows are easy to detect and remove
# To demonstrate, we first build a small DataFrame whose last row repeats the first one
df_dup = DataFrame(data=[['A', 1], ['B', 2], ['A', 1]])
df_dup

Unnamed: 0,0,1
0,A,1
1,B,2
2,A,1


In [222]:
# Finding duplicate rows
# duplicated() returns a boolean Series: the first occurrence of a row is False,
# a later repeat of it is True
df_dup.duplicated()

0    False
1    False
2     True
dtype: bool

In [227]:
# Dropping duplicate rows - drop_duplicates()
# By default inplace is False, meaning a new DataFrame is returned and df_dup does not change
# inplace=True changes df_dup itself instead
df_dup.drop_duplicates(inplace=True)

In [228]:
# The DataFrame after the duplicate row has been removed
df_dup

Unnamed: 0,0,1
0,A,1
1,B,2


### Selecting Entries

In [189]:
# Let's go back to using new_df
new_df

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,1200.0,1300.0,3000.0,2000.0
2018,3400.0,2400.0,2500.0,1500.0
2019,,3500.0,4000.0,4000.0
2020,,,,
2021,,,,


In [190]:
# We can select an entire column by using the column name; the result is a Series
new_df['North']

years
2017    3000.0
2018    2500.0
2019    4000.0
2020       NaN
2021       NaN
Name: North, dtype: float64

In [191]:
# iloc
# iloc selects by integer position - useful if we want to return a row by where it sits
# Positions start at 0, so position 2 is the third row (2019)
new_df.iloc[2]

East        NaN
West     3500.0
North    4000.0
South    4000.0
Name: 2019, dtype: float64

In [192]:
# Using iloc to find a specific value: iloc[row position, column position]
# In this example, we select the first row (2017) and the second column (West)
new_df.iloc[0,1]

1300.0

In [193]:
# We can use slicing with iloc
# The end position is not included, so 1:3 returns positions 1 and 2 (years 2018-2019)
new_df.iloc[1:3]

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2019,,3500.0,4000.0,4000.0


In [194]:
# loc
# loc lets us access a group of rows and columns based on labels or a boolean array
# This is useful when we know the index label but not the position of the row we want
new_df.loc['2019']

East        NaN
West     3500.0
North    4000.0
South    4000.0
Name: 2019, dtype: float64

In [195]:
# Passing a list of labels selects multiple rows
new_df.loc[['2018','2019']]

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2019,,3500.0,4000.0,4000.0


In [196]:
# Using boolean arrays with loc
# We give one True/False per row; only the rows marked True are returned
new_df.loc[[False,False,True,True,True]]

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019,,3500.0,4000.0,4000.0
2020,,,,
2021,,,,


In [199]:
# Using a condition to select specific rows
# In this example, we keep only the rows where the West column is greater than 2000
# Rows where West is NaN (2020, 2021) do not satisfy the condition and are left out
new_df[new_df['West'] > 2000]

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2019,,3500.0,4000.0,4000.0


### Data alignment
We can add Series, and columns in DataFrames, together. Since a Series is a single column, it is easy to add two of them together; with DataFrames there is an additional step of specifying the columns to add. In both cases the values are aligned on their index labels before being added. These examples add DataFrame columns (note that for Series this works the same way, without having to specify column names).

In [200]:
# Adding two columns in one DataFrame
# If either value in a row is NaN, the result for that row is NaN
new_df['East']+new_df['North']

years
2017    4200.0
2018    5900.0
2019       NaN
2020       NaN
2021       NaN
dtype: float64

In [201]:
# We can use .add to specify a fill value for NaN
# 2019 now has a result because the NaN in East is treated as 0
# Rows where both columns are NaN still give NaN
new_df['East'].add(new_df['North'],fill_value=0)

years
2017    4200.0
2018    5900.0
2019    4000.0
2020       NaN
2021       NaN
dtype: float64

### Sorting and Ranking

In [202]:
# Sorting by index
# ascending=True sorts from low to high (the default), ascending=False from high to low
new_df.sort_index(ascending=False)

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021,,,,
2020,,,,
2019,,3500.0,4000.0,4000.0
2018,3400.0,2400.0,2500.0,1500.0
2017,1200.0,1300.0,3000.0,2000.0


In [203]:
# Sorting by column
# Default is ascending; to change it, pass the ascending parameter as in the previous example
# Rows where the sort column is NaN are placed at the end
new_df.sort_values(by=['North'])

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,3400.0,2400.0,2500.0,1500.0
2017,1200.0,1300.0,3000.0,2000.0
2019,,3500.0,4000.0,4000.0
2020,,,,
2021,,,,


In [204]:
# Ranking columns
# First we select the column, and then we apply the rank function to it
# ascending=False gives rank 1 to the largest value; NaN entries are left unranked
new_df['North'].rank(ascending=False)

years
2017    2.0
2018    3.0
2019    1.0
2020    NaN
2021    NaN
Name: North, dtype: float64

In [205]:
# We can save the rank as a new column if needed
# Note: this adds the column to new_df itself, so it shows up in every later cell
new_df['rank_north'] = new_df['North'].rank(ascending=False)
new_df

Unnamed: 0_level_0,East,West,North,South,rank_north
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017,1200.0,1300.0,3000.0,2000.0,2.0
2018,3400.0,2400.0,2500.0,1500.0,3.0
2019,,3500.0,4000.0,4000.0,1.0
2020,,,,,
2021,,,,,


## Summary Statistics

In [206]:
# Describe
# Probably the most useful - gives us general summary statistics for every column at once
# NaN values are left out of the calculations (see the count row)
new_df.describe()

Unnamed: 0,East,West,North,South,rank_north
count,2.0,3.0,3.0,3.0,3.0
mean,2300.0,2400.0,3166.666667,2500.0,2.0
std,1555.634919,1100.0,763.762616,1322.875656,1.0
min,1200.0,1300.0,2500.0,1500.0,1.0
25%,1750.0,1850.0,2750.0,1750.0,1.5
50%,2300.0,2400.0,3000.0,2000.0,2.0
75%,2850.0,2950.0,3500.0,3000.0,2.5
max,3400.0,3500.0,4000.0,4000.0,3.0


In [207]:
# Finding the total of each column (NaN values are skipped)
new_df.sum()

East          4600.0
West          7200.0
North         9500.0
South         7500.0
rank_north       6.0
dtype: float64

In [208]:
# Cumulative sum down each column; positions that are NaN stay NaN
new_df.cumsum()

Unnamed: 0_level_0,East,West,North,South,rank_north
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017,1200.0,1300.0,3000.0,2000.0,2.0
2018,4600.0,3700.0,5500.0,3500.0,5.0
2019,,7200.0,9500.0,7500.0,6.0
2020,,,,,
2021,,,,,


In [209]:
# Minimum value in each column (NaN values are skipped)
new_df.min()

East          1200.0
West          1300.0
North         2500.0
South         1500.0
rank_north       1.0
dtype: float64

In [210]:
# Maximum value in each column (NaN values are skipped)
new_df.max()

East          3400.0
West          3500.0
North         4000.0
South         4000.0
rank_north       3.0
dtype: float64

## Index Hierarchy

In [211]:
# Creating a DataFrame with a hierarchical (multi-level) index on both rows and columns
# np.arange(16) produces the numbers 0-15, and reshape lays them out on a 4x4 grid
# Each axis is given two parallel lists: the outer level first, then the inner level
df_temp = DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=[['2018', '2018', '2019', '2019'], ['Jan', 'Feb', 'Jan', 'Feb']],
    columns=[['NY', 'NY', 'LA', 'LA'], ['Cold', 'Hot', 'Cold', 'Hot']],
)
df_temp

Unnamed: 0_level_0,Unnamed: 1_level_0,NY,NY,LA,LA
Unnamed: 0_level_1,Unnamed: 1_level_1,Cold,Hot,Cold,Hot
2018,Jan,0,1,2,3
2018,Feb,4,5,6,7
2019,Jan,8,9,10,11
2019,Feb,12,13,14,15


In [212]:
# We can name the levels of the row index and of the columns
df_temp.index.names=['Year','Month']
df_temp.columns.names=['State','Temp']
df_temp

Unnamed: 0_level_0,State,NY,NY,LA,LA
Unnamed: 0_level_1,Temp,Cold,Hot,Cold,Hot
Year,Month,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2018,Jan,0,1,2,3
2018,Feb,4,5,6,7
2019,Jan,8,9,10,11
2019,Feb,12,13,14,15


In [213]:
# Using swaplevel to swap two levels on a particular axis
# axis - 0 or 'index', 1 or 'columns'
# The result is only displayed here; df_temp itself keeps its original level order
df_temp.swaplevel('State','Temp',axis=1)

Unnamed: 0_level_0,Temp,Cold,Hot,Cold,Hot
Unnamed: 0_level_1,State,NY,NY,LA,LA
Year,Month,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2018,Jan,0,1,2,3
2018,Feb,4,5,6,7
2019,Jan,8,9,10,11
2019,Feb,12,13,14,15


In [214]:
# Sorting based on the first level of the index
df_temp.sort_index(level=0)

Unnamed: 0_level_0,State,NY,NY,LA,LA
Unnamed: 0_level_1,Temp,Cold,Hot,Cold,Hot
Year,Month,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2018,Feb,4,5,6,7
2018,Jan,0,1,2,3
2019,Feb,12,13,14,15
2019,Jan,8,9,10,11


In [215]:
# Sorting based on the second level of the index (Month)
df_temp.sort_index(level=1)

Unnamed: 0_level_0,State,NY,NY,LA,LA
Unnamed: 0_level_1,Temp,Cold,Hot,Cold,Hot
Year,Month,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2018,Feb,4,5,6,7
2019,Feb,12,13,14,15
2018,Jan,0,1,2,3
2019,Jan,8,9,10,11


In [216]:
# Accessing a single column in a multi-index
# The first [] picks the top-level group ('NY'), the second picks the column inside it ('Cold')
df_temp['NY']['Cold']

Year  Month
2018  Jan       0
      Feb       4
2019  Jan       8
      Feb      12
Name: Cold, dtype: int32

In [217]:
# Accessing a single row
# iloc still works by position with a multi-index
df_temp.iloc[0]

State  Temp
NY     Cold    0
       Hot     1
LA     Cold    2
       Hot     3
Name: (2018, Jan), dtype: int32

In [218]:
# This is how we could use loc
df_temp.loc['2018','Jan']

State  Temp
NY     Cold    0
       Hot     1
LA     Cold    2
       Hot     3
Name: (2018, Jan), dtype: int32