In [1]:
import pandas as pd
import os

## Obtain datasets from data.gov

https://catalog.data.gov/dataset

In [2]:
from six.moves import urllib

# where the datasets will be placed
ROOT_DATA = "../../../ROOT_DATA/data_gov/"

def fetch_data_from_URL(housing_url, file_name, sub_dir="tmp", root_path=ROOT_DATA):
    """Download a file once and return the local path it is stored at.

    The file is stored at ``<root_path>/<sub_dir>/<file_name>``; the
    directory is created when it does not exist yet. When a file is already
    present at that path nothing is downloaded and the path is returned as is.

    Parameters
    ----------
    housing_url : str
        URL of the file to download. Despite the parameter name, the URL is
        not restricted to housing data.
    file_name : str
        Name to give the local copy.
    sub_dir : str, optional
        Sub-directory of ``root_path`` in which the file is placed.
    root_path : str, optional
        Base directory for downloaded datasets (defaults to ``ROOT_DATA``).

    Returns
    -------
    str
        Path of the local copy.

    Raises
    ------
    Exception
        Any error raised by ``urllib.request.urlretrieve`` or by the file
        system calls is propagated; no partially written file is left behind.
    """
    placement_dir = os.path.join(root_path, sub_dir)
    if not os.path.isdir(placement_dir):
        os.makedirs(placement_dir)
    placement_path = os.path.join(placement_dir, file_name)
    # only download if not already present
    if not os.path.isfile(placement_path):
        # Download under a temporary name and rename on success, so that an
        # interrupted transfer never leaves a truncated file that later calls
        # would mistake for a complete cached copy.
        partial_path = placement_path + ".part"
        try:
            urllib.request.urlretrieve(housing_url, partial_path)
            os.rename(partial_path, placement_path)
        finally:
            # Only still present when the download or the rename failed.
            if os.path.isfile(partial_path):
                os.remove(partial_path)
    return placement_path

In [3]:
# Excel support: `conda install -c anaconda xlrd`
# Fetch the Federal School Code List workbook into the "FSA" sub-directory
# (skipped when the file is already present locally).
fsa_xlsx_path = fetch_data_from_URL(
    housing_url="https://ifap.ed.gov/fedschcodelist/attachments/1617FedSchoolCodeList.xlsx",
    file_name="1617FedSchoolCodeList.xlsx",
    sub_dir="FSA",
)

In [4]:
# Load the workbook with the "ID" column as index and drop the PostalCode,
# Country and Province columns.
# NOTE(review): ZipCode is parsed as int64, so leading zeros are lost
# (e.g. 957 and 6111 in the sampled rows further down) — confirm whether it
# should be read as text instead.
df = (
    pd.read_excel(fsa_xlsx_path, index_col="ID")
    .drop(["PostalCode", "Country", "Province"], axis="columns")
)
# info() prints its report itself and returns None; wrapping it in print()
# only appended a stray "None" to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6962 entries, 25969 to 32930
Data columns (total 6 columns):
SchoolCode    6962 non-null object
SchoolName    6962 non-null object
Address       6962 non-null object
City          6962 non-null object
StateCode     6962 non-null object
ZipCode       6962 non-null int64
dtypes: int64(1), object(5)
memory usage: 380.7+ KB
None


## Sample method

In [5]:
# draw a single random row (n=1 is what sample() uses when no size is given)
df.sample(n=1)

Unnamed: 0_level_0,SchoolCode,SchoolName,Address,City,StateCode,ZipCode
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
26734,G08444,SIMON FRASER UNIVERSITY,8888 UNIVERSITY BOULEVARD,BURNABY,CN,0


In [6]:
# draw five random rows
df.sample(n=5)

Unnamed: 0_level_0,SchoolCode,SchoolName,Address,City,StateCode,ZipCode
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
29291,5028,INTER AMERICAN UNIVERSITY OF PUERTO RICO,CARRETERA 830 #500,BAYAMON,PR,957
32037,39394,CENTURA INSTITUTE,1300 NORTH SEMORAN BOULEVARD,ORLANDO,FL,32807
30630,14952,HUMACAO COMMUNITY COLLEGE,ESMERALDA #7 PO BOX 9139,HUMACAO,PR,792
29194,4549,LEEWARD COMMUNITY COLLEGE,96-045 ALA IKE,PEARL CITY,HI,96782
31691,31993,CHRISTIAN LIFE COLLEGE,400 E GREGORY ST,MOUNT PROSPECT,IL,60056


In [7]:
# `frac` draws a fraction of the rows instead of a fixed count
# (e.g. 0.25 = 25%); here 0.001 = 0.1%, which for the 6962 rows gives 7 rows
df.sample(frac=0.001)

Unnamed: 0_level_0,SchoolCode,SchoolName,Address,City,StateCode,ZipCode
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
26434,E01887,UNITED EDUCATION INSTITUTE - EL MONTE,3401 RIO HONDO AVENUE,EL MONTE,CA,91731
31283,023166,CONNECTICUT CTR FOR MASSAGE THERAPY,75 KITTS LN,NEWINGTON,CT,6111
26812,G10599,INST D'ETUDES POLITIQUES DE PARIS,27 RUE SAINT-GUILLAUME,PARIS,FC,0
27145,G39743,"ST. GEORGE'S UNIVERSITY, SCHOOL OF VETER",UNIVERSITY CENTRE,ST.GEORGE'S,FC,0
28861,003571,HARDIN-SIMMONS UNIVERSITY,BOX 16050,ABILENE,TX,79698
29417,006257,SAINT ELIZABETH SCHOOL OF NURSING,1508 TIPPECANOE STREET,LAFAYETTE,IN,47904
29346,005424,SOUTH CENTRAL CAREER CENTER,610 E OLDEN,WEST PLAINS,MO,65775


In [8]:
# draw three random rows; no random_state is set, so every run picks different rows
df.sample(n=3)

Unnamed: 0_level_0,SchoolCode,SchoolName,Address,City,StateCode,ZipCode
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
32923,E40433,FLORIDA CAREER COLLEGE - KENDALL,11731 MILLS DRIVE,MIAMI,FL,33183
31334,026001,NEW YORK ACADEMY OF ART (THE),111 FRANKLIN STREET,NEW YORK,NY,10013
31464,030515,US GRANT JOINT VOCATIONAL SCHOOL,718 WEST PLANE STREET,BETHEL,OH,45106


In [9]:
# axis="columns" samples columns instead of rows:
# keep three randomly chosen columns, with every row retained
df.sample(n=3, axis="columns")

Unnamed: 0_level_0,StateCode,City,SchoolCode
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25969,DE,WILMINGTON,B04724
25970,PR,SAN JUAN,B06171
25971,TN,CLEVELAND,B06511
25972,IL,CHICAGO,B07022
25973,OR,PORTLAND,B07624
25974,OR,PORTLAND,B07625
25975,MN,HOPKINS,B08041
25976,DC,WASHINGTON,B08083
25977,TX,CONROE,B42154
25978,MA,CAMBRIDGE,E00014
