In [1]:
import pandas as pd
import os

## Obtain a dataset from data.gov

https://catalog.data.gov/dataset

In [2]:
from six.moves import urllib

# where the datasets will be placed
ROOT_DATA = "../../../ROOT_DATA/data_gov/"

def fetch_data_from_URL(housing_url, file_name, sub_dir="tmp", root_path=ROOT_DATA):
    """Download a file once and return the path of the local copy.

    The file is stored as ``<root_path>/<sub_dir>/<file_name>``. If a file
    already exists at that path, nothing is downloaded and the existing path
    is returned.

    Parameters
    ----------
    housing_url : str
        URL of the resource to download.
    file_name : str
        Name given to the downloaded file.
    sub_dir : str, optional
        Directory below ``root_path`` that receives the file; created
        (including parents) when it does not exist.
    root_path : str, optional
        Base directory for downloads. Defaults to ``ROOT_DATA``.

    Returns
    -------
    str
        Path of the local file.

    Raises
    ------
    Exception
        Whatever ``urllib.request.urlretrieve`` raises is propagated; in that
        case no file is left at the returned location.
    """
    placement_dir = os.path.join(root_path, sub_dir)
    if not os.path.isdir(placement_dir):
        os.makedirs(placement_dir)
    placement_path = os.path.join(placement_dir, file_name)
    # only download if not already present
    if not os.path.isfile(placement_path):
        # Download under a temporary name and rename on success, so that an
        # interrupted transfer can never be mistaken for a finished download
        # by the isfile() check above on a later run.
        partial_path = placement_path + ".part"
        try:
            urllib.request.urlretrieve(housing_url, partial_path)
            os.rename(partial_path, placement_path)
        finally:
            # After a successful rename the .part file is gone; anything still
            # present here is the residue of a failed download.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    return placement_path

In [3]:
# Reading Excel files requires an extra package: `conda install -c anaconda xlrd`
# Fetch the Federal School Code List workbook into the "FSA" sub-directory.
fsa_xlsx_path = fetch_data_from_URL(
    housing_url="https://ifap.ed.gov/fedschcodelist/attachments/1617FedSchoolCodeList.xlsx",
    file_name="1617FedSchoolCodeList.xlsx",
    sub_dir="FSA",
)

In [4]:
# Load the workbook, using the spreadsheet's "ID" column as the row index.
df = pd.read_excel(fsa_xlsx_path, index_col="ID")
# info() writes its summary directly to stdout and returns None, so it is
# called bare: wrapping it in print() appends a stray "None" to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6962 entries, 25969 to 32930
Data columns (total 9 columns):
SchoolCode    6962 non-null object
SchoolName    6962 non-null object
Address       6962 non-null object
City          6962 non-null object
StateCode     6962 non-null object
ZipCode       6962 non-null int64
Province      111 non-null object
Country       402 non-null object
PostalCode    229 non-null object
dtypes: int64(1), object(8)
memory usage: 543.9+ KB
None


In [5]:
# Peek at the first five rows of the frame
print(df.head(n=5))

      SchoolCode                             SchoolName  \
ID                                                        
25969     B04724        WIDENER UNIV SCHOOL OF LAW - DE   
25970     B06171    CENTER FOR ADVANCED STUDIES OF PUER   
25971     B06511       PENTECOSTAL THEOLOGICAL SEMINARY   
25972     B07022  THE CHICAGO SCHOOL OF PROF PSYCHOLOGY   
25973     B07624   NATIONAL COLLEGE OF NATURAL MEDICINE   

                             Address        City StateCode  ZipCode Province  \
ID                                                                             
25969  4601 CONCORD PIKE/PO BOX 7474  WILMINGTON        DE    19803      NaN   
25970                     BOX S-4467    SAN JUAN        PR      902      NaN   
25971                    PO BOX 3330   CLEVELAND        TN    37320      NaN   
25972         325 NORTH WELLS STREET     CHICAGO        IL    60610      NaN   
25973                  049 SW PORTER    PORTLAND        OR    97201      NaN   

      Country PostalCode 

In [6]:
# `df[0]` does not work here: plain brackets look up a column, not a row.
# Rows are selected by their index label (the ID) through .loc instead.
print(df.loc[25969, :])

SchoolCode                             B04724
SchoolName    WIDENER UNIV SCHOOL OF LAW - DE
Address         4601 CONCORD PIKE/PO BOX 7474
City                               WILMINGTON
StateCode                                  DE
ZipCode                                 19803
Province                                  NaN
Country                                   NaN
PostalCode                                NaN
Name: 25969, dtype: object


If several rows share the same index label, `.loc` returns every row that matches that label. Example:

In [7]:
# Re-index the schools by state: reset_index() first moves `ID` back into an
# ordinary column (so it is kept), then StateCode becomes the new index.
tmp_df = df.reset_index().set_index("StateCode")
print(tmp_df.head(n=5))

              ID SchoolCode                             SchoolName  \
StateCode                                                            
DE         25969     B04724        WIDENER UNIV SCHOOL OF LAW - DE   
PR         25970     B06171    CENTER FOR ADVANCED STUDIES OF PUER   
TN         25971     B06511       PENTECOSTAL THEOLOGICAL SEMINARY   
IL         25972     B07022  THE CHICAGO SCHOOL OF PROF PSYCHOLOGY   
OR         25973     B07624   NATIONAL COLLEGE OF NATURAL MEDICINE   

                                 Address        City  ZipCode Province  \
StateCode                                                                
DE         4601 CONCORD PIKE/PO BOX 7474  WILMINGTON    19803      NaN   
PR                            BOX S-4467    SAN JUAN      902      NaN   
TN                           PO BOX 3330   CLEVELAND    37320      NaN   
IL                325 NORTH WELLS STREET     CHICAGO    60610      NaN   
OR                         049 SW PORTER    PORTLAND    97201    

In [8]:
# 'DE' occurs many times in the index, so every matching row comes back
print(tmp_df.loc["DE", :])

              ID SchoolCode                                SchoolName  \
StateCode                                                               
DE         25969     B04724           WIDENER UNIV SCHOOL OF LAW - DE   
DE         26206     E01080                STAR TECHNICAL INSTITUTE -   
DE         27484     001428                 DELAWARE STATE UNIVERSITY   
DE         27485     001429                     GOLDEY BEACOM COLLEGE   
DE         27486     001431                    UNIVERSITY OF DELAWARE   
DE         27487     001433                            WESLEY COLLEGE   
DE         29676     007948                     WILMINGTON UNIVERSITY   
DE         30139     011727      DELAWARE TECHNICAL COMMUNITY COLLEGE   
DE         30756     015477        SCHILLING DOUGLAS SCH OF HAIR DSGN   
DE         31263     021252    SCHOOL OF NURSING BEEBE MEDICAL CENTER   
DE         31427     030258                      DAWN TRAINING CENTRE   
DE         31843     035433                 HARRIS 

In [9]:
# Number of rows per StateCode label, most frequent first
tmp_df.index.value_counts(sort=True)

CA    673
NY    452
TX    381
FL    367
FC    326
PA    321
OH    294
IL    268
MA    189
MI    187
MO    182
NC    175
TN    164
VA    147
GA    144
NJ    143
PR    126
IN    126
LA    118
OK    118
MN    116
WA    109
WI    104
KY    103
CO    100
AZ     97
SC     97
MD     92
CT     92
IA     90
     ... 
OR     83
KS     81
WV     73
CN     69
UT     65
MS     56
NE     44
NM     40
ME     39
ID     36
NV     35
NH     35
DC     30
MT     29
SD     28
ND     28
VT     24
HI     22
RI     21
DE     17
AK     10
WY     10
MX      7
GU      3
VI      2
MP      1
FM      1
MH      1
PW      1
AS      1
Name: StateCode, dtype: int64

In [10]:
# A label slice such as 'AK':'CA' only works on a sorted index, so sort by
# index first. sort_index() is used because the old DataFrame.sort() method
# was deprecated and later removed from pandas.
print(tmp_df.sort_index().loc['AK':'CA'])

              ID SchoolCode                                SchoolName  \
StateCode                                                               
AK         27243     001065            UNIVERSITY OF ALASKA SOUTHEAST   
AK         30109     011462            UNIVERSITY OF ALASKA ANCHORAGE   
AK         27241     001061                 ALASKA PACIFIC UNIVERSITY   
AK         31676     031603  AVTEC - ALASKA'S INSTITUTE OF TECHNOLOGY   
AK         27242     001063            UNIVERSITY OF ALASKA FAIRBANKS   
AK         31260     017377                           CHARTER COLLEGE   
AK         31811     034613                         ILISAGVIK COLLEGE   
AK         31180     017066                     ALASKA CAREER COLLEGE   
AK         30499     014325                      ALASKA BIBLE COLLEGE   
AK         32248     041386                  ALASKA CHRISTIAN COLLEGE   
AL         30946     016217  PRINCE INSTITUTE OF PROFESSIONAL STUDIES   
AL         27223     001030            BISHOP STATE

  This is separate from the ipykernel package so we can avoid doing imports until


### Selecting multiple index labels that are not adjacent

In [11]:
# A list of labels picks out several index values, in the order given
print(tmp_df.loc[["PW", "FM"], :])

              ID SchoolCode                   SchoolName  \
StateCode                                                  
PW         30080     011009      PALAU COMMUNITY COLLEGE   
FM         29990     010343  COLLEGE OF MICRONESIA - FSM   

                                  Address                     City  ZipCode  \
StateCode                                                                     
PW         POST OFFICE BOX 9  KOROR PALAU  WESTERN CAROLINA ISLAND    96940   
FM                            159 KOLONIA         KOLONIA, POHNPEI    96941   

          Province Country PostalCode  
StateCode                              
PW             NaN     NaN        NaN  
FM             NaN     NaN        NaN  


In [12]:
# Asking for a label that is not in the index ('ZZ'): older pandas versions
# "created" an all-NaN row for it when using .loc with a list, but current
# versions raise KeyError instead. To get the NaN placeholder row on any
# version, left-join the wanted labels onto the frame. (reindex() is not an
# option here because the StateCode index contains duplicate labels.)
wanted_states = pd.Index(['PW', 'FM', 'ZZ'], name=tmp_df.index.name)
print(pd.DataFrame(index=wanted_states).join(tmp_df, how="left"))

                ID SchoolCode                   SchoolName  \
StateCode                                                    
PW         30080.0     011009      PALAU COMMUNITY COLLEGE   
FM         29990.0     010343  COLLEGE OF MICRONESIA - FSM   
ZZ             NaN        NaN                          NaN   

                                  Address                     City  ZipCode  \
StateCode                                                                     
PW         POST OFFICE BOX 9  KOROR PALAU  WESTERN CAROLINA ISLAND  96940.0   
FM                            159 KOLONIA         KOLONIA, POHNPEI  96941.0   
ZZ                                    NaN                      NaN      NaN   

          Province Country PostalCode  
StateCode                              
PW             NaN     NaN        NaN  
FM             NaN     NaN        NaN  
ZZ             NaN     NaN        NaN  


In [13]:
# Whether a label exists in the index can be tested with `in`
has_zz = "ZZ" in tmp_df.index
print(has_zz)

False


## Second argument: selecting columns

In [14]:
# The second argument to .loc limits the result to the listed columns
wanted_cols = ["ID", "SchoolCode"]
print(tmp_df.loc["PW", wanted_cols])

ID             30080
SchoolCode    011009
Name: PW, dtype: object
