In [1]:
import pandas as pd
import os

## Obtain a dataset from the data.gov catalog

https://catalog.data.gov/dataset

In [2]:
from six.moves import urllib

# root directory under which the downloaded datasets will be placed
ROOT_DATA = "../../../ROOT_DATA/data_gov/"

def fetch_data_from_URL(housing_url, file_name, sub_dir="tmp", root_path=ROOT_DATA):
    """Download a file once and return the local path it is stored at.

    The file is stored as ``<root_path>/<sub_dir>/<file_name>``; the directory
    is created when it does not exist yet. When a file is already present at
    that path nothing is downloaded and the existing path is returned.

    The download is written to a ``.part`` sibling first and only renamed to
    the final name once it has completed, so an interrupted download never
    leaves a truncated file behind that later calls would mistake for a
    finished one.

    Parameters
    ----------
    housing_url : str
        URL of the file to download (any URL, despite the parameter name).
    file_name : str
        Name to store the downloaded file under.
    sub_dir : str, optional
        Sub-directory of ``root_path`` to place the file in.
    root_path : str, optional
        Root directory for downloads; defaults to ``ROOT_DATA``.

    Returns
    -------
    str
        Path of the local file.

    Raises
    ------
    Any exception raised by ``urllib.request.urlretrieve`` is propagated after
    the partial download has been removed.
    """
    placement_dir = os.path.join(root_path, sub_dir)
    if not os.path.isdir(placement_dir):
        os.makedirs(placement_dir)
    placement_path = os.path.join(placement_dir, file_name)
    # only download if not already present
    if not os.path.isfile(placement_path):
        partial_path = placement_path + ".part"
        try:
            urllib.request.urlretrieve(housing_url, partial_path)
            # the final name only appears once the download has completed
            os.rename(partial_path, placement_path)
        finally:
            # after a successful rename there is nothing left to remove;
            # after a failure this discards the incomplete download
            if os.path.isfile(partial_path):
                os.remove(partial_path)
    return placement_path

In [3]:
# reading excel files requires xlrd: `conda install -c anaconda xlrd`
# fetch the 2016-17 Federal School Code List workbook into the "FSA" sub-directory
fsa_xlsx_path = fetch_data_from_URL(
    housing_url="https://ifap.ed.gov/fedschcodelist/attachments/1617FedSchoolCodeList.xlsx",
    file_name="1617FedSchoolCodeList.xlsx",
    sub_dir="FSA",
)

In [4]:
# load the downloaded workbook into a DataFrame
s_df = pd.read_excel(fsa_xlsx_path)
# info() writes its summary to stdout itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output
s_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6962 entries, 0 to 6961
Data columns (total 10 columns):
ID            6962 non-null int64
SchoolCode    6962 non-null object
SchoolName    6962 non-null object
Address       6962 non-null object
City          6962 non-null object
StateCode     6962 non-null object
ZipCode       6962 non-null int64
Province      111 non-null object
Country       402 non-null object
PostalCode    229 non-null object
dtypes: int64(2), object(8)
memory usage: 544.0+ KB
None


In [5]:
# preview the first five rows
s_df.head(n=5)

Unnamed: 0,ID,SchoolCode,SchoolName,Address,City,StateCode,ZipCode,Province,Country,PostalCode
0,25969,B04724,WIDENER UNIV SCHOOL OF LAW - DE,4601 CONCORD PIKE/PO BOX 7474,WILMINGTON,DE,19803,,,
1,25970,B06171,CENTER FOR ADVANCED STUDIES OF PUER,BOX S-4467,SAN JUAN,PR,902,,,
2,25971,B06511,PENTECOSTAL THEOLOGICAL SEMINARY,PO BOX 3330,CLEVELAND,TN,37320,,,
3,25972,B07022,THE CHICAGO SCHOOL OF PROF PSYCHOLOGY,325 NORTH WELLS STREET,CHICAGO,IL,60610,,,
4,25973,B07624,NATIONAL COLLEGE OF NATURAL MEDICINE,049 SW PORTER,PORTLAND,OR,97201,,,


## Sort

Note:

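1. `sort_values()` does not sort in place by default; it returns a new, sorted DataFrame (pass `inplace=True` to modify the DataFrame itself)
2. `NaN` values are placed at the end by default, regardless of whether the sort is ascending or descending
 - this can be adjusted with the `na_position=` param (`"first"` or `"last"`)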

In [6]:
# ascending sort on a single column; returns a sorted copy, s_df itself is untouched
s_df.sort_values(by="Country", ascending=True).head(n=5)

Unnamed: 0,ID,SchoolCode,SchoolName,Address,City,StateCode,ZipCode,Province,Country,PostalCode
6885,32854,042401,AMERICAN UNIVERSITY OF ANTIGUA COLLEGE O,UNIVERSITY PARK,COOLIDGE,FC,0,ST.JOHN'S,ANTIGUA,
1228,27197,G42194,UNIVERSIDAD TORCUATO DI TELLA,AV. FIGUEROA ALCORTA 7350,BUENOS AIRES,FC,0,BUENOS AIRES,ARGENTINA,1428.0
1068,27037,G31048,GRIFFITH UNIVERSITY,GOLD COAST CAMPUS,QUEENSLAND 4211,FC,0,,AUSTRALIA,
1065,27034,G31000,UNIVERSITY OF ADELAIDE (THE),**,ADELAIDE,FC,0,,AUSTRALIA,
1064,27033,G30961,LA TROBE UNIVERSITY,BUNDORA CAMPUS LA TROBE UNIVER,BUNDOORA,FC,0,,AUSTRALIA,


In [7]:
# same column, descending order; returns a sorted copy, s_df itself is untouched
s_df.sort_values(by="Country", ascending=False).head(n=5)

Unnamed: 0,ID,SchoolCode,SchoolName,Address,City,StateCode,ZipCode,Province,Country,PostalCode
768,26737,G08452,ABERYSTWYTH UNIVERSITY,OLD COLLEGE,ABERYSTWYTH,FC,0,,WALES,SY23 2AX
1113,27082,G34783,ROYAL WELSH COLLEGE OF MUSIC AND DRAMA,"CASTLE GROUNDS, CATHAYS PARK",CARDIFF,FC,0,,WALES,CF10 3ER
1129,27098,G35473,UNIVERSITY OF SOUTH WALES,"LLANTWIT ROAD, TREFOREST",PONTYPRIDD,FC,0,SOUTH WALES,WALES,CF37 1DL
774,26743,G08586,SWANSEA UNIVERSITY,ACADEMIC REGISTRY,SWANSEA,FC,0,,WALES,SA2 8PP
841,26810,G10589,BANGOR UNIVERSITY,BANGOR,GWYNEDD,FC,0,,WALES,LL57 2DG


## Sort by multiple values

In [8]:
# columns are applied in list order: Country first, Province breaks ties within a country
s_df.sort_values(by=["Country", "Province"]).head(n=6)

Unnamed: 0,ID,SchoolCode,SchoolName,Address,City,StateCode,ZipCode,Province,Country,PostalCode
6885,32854,042401,AMERICAN UNIVERSITY OF ANTIGUA COLLEGE O,UNIVERSITY PARK,COOLIDGE,FC,0,ST.JOHN'S,ANTIGUA,
1228,27197,G42194,UNIVERSIDAD TORCUATO DI TELLA,AV. FIGUEROA ALCORTA 7350,BUENOS AIRES,FC,0,BUENOS AIRES,ARGENTINA,1428.0
846,26815,G10681,MACQUARIE UNIVERSITY,BUILDING E3A LEVEL 1,NEW SOUTH WALES,FC,0,NEW SOUTH WALES,AUSTRALIA,2109.0
777,26746,G08670,UNIVERSITY OF NEW SOUTH WALES,"GATE 9, HIGH STREET","UNSW, SYDNEY",FC,0,NSW,AUSTRALIA,2052.0
1049,27018,G30672,UNIVERSITY OF SYDNEY,"VICE-CHANCELLOR OFFICE, A14",THE UNIVERSITY OF SYDNE,FC,0,NSW,AUSTRALIA,2006.0
1073,27042,G31406,SOUTHERN CROSS UNIVERSITY,MILITARY ROAD,EAST LISMORE,FC,0,NSW,AUSTRALIA,2480.0


In [11]:
# `ascending` also accepts one flag per sort column:
# Country ascending, Province descending. inplace=True reorders s_df itself.
s_df.sort_values(
    by=["Country", "Province"],
    ascending=[True, False],
    inplace=True,
)

In [12]:
# s_df was sorted in place, so the preview now shows the new row order
s_df.head(n=5)

Unnamed: 0,ID,SchoolCode,SchoolName,Address,City,StateCode,ZipCode,Province,Country,PostalCode
6885,32854,042401,AMERICAN UNIVERSITY OF ANTIGUA COLLEGE O,UNIVERSITY PARK,COOLIDGE,FC,0,ST.JOHN'S,ANTIGUA,
1228,27197,G42194,UNIVERSIDAD TORCUATO DI TELLA,AV. FIGUEROA ALCORTA 7350,BUENOS AIRES,FC,0,BUENOS AIRES,ARGENTINA,1428.0
882,26851,G11094,UNIVERSITY OF WESTERN AUSTRALIA (THE),35 STIRLING HIGHWAY,NEDLANDS,FC,0,WESTERN AUSTRALIA,AUSTRALIA,6009.0
847,26816,G10692,UNIVERSITY OF QUEENSLAND (THE),INTERNATIONAL EDUCATION DIRECTORATE,BRISBANE,FC,0,QUEENSLAND,AUSTRALIA,4072.0
907,26876,G12206,JAMES COOK UNIVERSITY,DOUGLAS CAMPUS,TOWNSVILLE,FC,0,QUEENSLAND,AUSTRALIA,4811.0


In [None]:
## Sort by index (`sort_index()`)

In [13]:
# put the rows back into index order (in place), then preview the result
s_df.sort_index(axis=0, inplace=True)
s_df.head(n=5)

Unnamed: 0,ID,SchoolCode,SchoolName,Address,City,StateCode,ZipCode,Province,Country,PostalCode
0,25969,B04724,WIDENER UNIV SCHOOL OF LAW - DE,4601 CONCORD PIKE/PO BOX 7474,WILMINGTON,DE,19803,,,
1,25970,B06171,CENTER FOR ADVANCED STUDIES OF PUER,BOX S-4467,SAN JUAN,PR,902,,,
2,25971,B06511,PENTECOSTAL THEOLOGICAL SEMINARY,PO BOX 3330,CLEVELAND,TN,37320,,,
3,25972,B07022,THE CHICAGO SCHOOL OF PROF PSYCHOLOGY,325 NORTH WELLS STREET,CHICAGO,IL,60610,,,
4,25973,B07624,NATIONAL COLLEGE OF NATURAL MEDICINE,049 SW PORTER,PORTLAND,OR,97201,,,
