In [1]:
import pandas as pd
import os

## Obtain a dataset from data.gov

https://catalog.data.gov/dataset

In [2]:
from six.moves import urllib

# where the datasets will be placed
ROOT_DATA = "../../../ROOT_DATA/data_gov/"

def fetch_data_from_URL(housing_url, file_name, sub_dir="tmp", root_path=ROOT_DATA):
    """Download a file once and return the path of the local copy.

    The file is stored as ``<root_path>/<sub_dir>/<file_name>``. If that file
    already exists nothing is downloaded, so re-running the notebook is cheap.

    Parameters
    ----------
    housing_url : str
        URL to retrieve.
    file_name : str
        Name to give the downloaded file.
    sub_dir : str, optional
        Directory (relative to ``root_path``) in which to place the file.
    root_path : str, optional
        Base directory for all downloads; defaults to ``ROOT_DATA``.

    Returns
    -------
    str
        Path of the local file.

    Raises
    ------
    OSError
        If the target directory cannot be created.
    Exception
        Any error raised by ``urlretrieve`` is propagated unchanged; in that
        case no file is left at the returned location.
    """
    placement_dir = os.path.join(root_path, sub_dir)
    try:
        os.makedirs(placement_dir)
    except OSError:
        # An existing directory is fine (it may also have been created by
        # someone else between a check and the call); anything else is a
        # real error.
        if not os.path.isdir(placement_dir):
            raise

    placement_path = os.path.join(placement_dir, file_name)
    # only download if not already present
    if not os.path.isfile(placement_path):
        # Download to a scratch name first: if the transfer is interrupted,
        # a truncated file at `placement_path` would otherwise be mistaken
        # for a complete download by the check above on every later run.
        partial_path = placement_path + ".part"
        try:
            urllib.request.urlretrieve(housing_url, partial_path)
            os.rename(partial_path, placement_path)
        finally:
            # After a successful rename this is a no-op; after a failure it
            # removes the incomplete download.
            if os.path.isfile(partial_path):
                os.remove(partial_path)
    return placement_path

In [3]:
# working with excel: `conda install -c anaconda xlrd`
# Fetch the Federal School Code List workbook; the helper skips the download
# when the file is already present in the "FSA" sub-directory.
fsa_xlsx_path = fetch_data_from_URL(
    housing_url="https://ifap.ed.gov/fedschcodelist/attachments/1617FedSchoolCodeList.xlsx",
    file_name="1617FedSchoolCodeList.xlsx",
    sub_dir="FSA",
)

In [4]:
# Load the workbook, using its "ID" column as the row index.
# NOTE(review): ZipCode is parsed as int64, so leading zeros are lost
# (e.g. the San Juan, PR row shows 902) -- confirm whether it should be
# kept as text instead.
df = pd.read_excel(fsa_xlsx_path, index_col="ID")
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6962 entries, 25969 to 32930
Data columns (total 9 columns):
SchoolCode    6962 non-null object
SchoolName    6962 non-null object
Address       6962 non-null object
City          6962 non-null object
StateCode     6962 non-null object
ZipCode       6962 non-null int64
Province      111 non-null object
Country       402 non-null object
PostalCode    229 non-null object
dtypes: int64(1), object(8)
memory usage: 543.9+ KB
None


In [5]:
# preview the first five rows
df.head()

Unnamed: 0_level_0,SchoolCode,SchoolName,Address,City,StateCode,ZipCode,Province,Country,PostalCode
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
25969,B04724,WIDENER UNIV SCHOOL OF LAW - DE,4601 CONCORD PIKE/PO BOX 7474,WILMINGTON,DE,19803,,,
25970,B06171,CENTER FOR ADVANCED STUDIES OF PUER,BOX S-4467,SAN JUAN,PR,902,,,
25971,B06511,PENTECOSTAL THEOLOGICAL SEMINARY,PO BOX 3330,CLEVELAND,TN,37320,,,
25972,B07022,THE CHICAGO SCHOOL OF PROF PSYCHOLOGY,325 NORTH WELLS STREET,CHICAGO,IL,60610,,,
25973,B07624,NATIONAL COLLEGE OF NATURAL MEDICINE,049 SW PORTER,PORTLAND,OR,97201,,,


## Drop method
`drop` returns a new DataFrame and leaves the original untouched; pass `inplace=True` (or reassign the result) to change the existing DataFrame.

In [6]:
# drop a single row by its index label (ID 25969); df itself is left unchanged
df.drop(25969).head()

Unnamed: 0_level_0,SchoolCode,SchoolName,Address,City,StateCode,ZipCode,Province,Country,PostalCode
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
25970,B06171,CENTER FOR ADVANCED STUDIES OF PUER,BOX S-4467,SAN JUAN,PR,902,,,
25971,B06511,PENTECOSTAL THEOLOGICAL SEMINARY,PO BOX 3330,CLEVELAND,TN,37320,,,
25972,B07022,THE CHICAGO SCHOOL OF PROF PSYCHOLOGY,325 NORTH WELLS STREET,CHICAGO,IL,60610,,,
25973,B07624,NATIONAL COLLEGE OF NATURAL MEDICINE,049 SW PORTER,PORTLAND,OR,97201,,,
25974,B07625,OREGON COL OF ORIENTAL MEDICINE,10525 SE CHERRY BLOSSOM DR,PORTLAND,OR,97216,,,


In [7]:
# drop several rows at once by passing a list of index labels
df.drop([25969, 25971]).head()

Unnamed: 0_level_0,SchoolCode,SchoolName,Address,City,StateCode,ZipCode,Province,Country,PostalCode
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
25970,B06171,CENTER FOR ADVANCED STUDIES OF PUER,BOX S-4467,SAN JUAN,PR,902,,,
25972,B07022,THE CHICAGO SCHOOL OF PROF PSYCHOLOGY,325 NORTH WELLS STREET,CHICAGO,IL,60610,,,
25973,B07624,NATIONAL COLLEGE OF NATURAL MEDICINE,049 SW PORTER,PORTLAND,OR,97201,,,
25974,B07625,OREGON COL OF ORIENTAL MEDICINE,10525 SE CHERRY BLOSSOM DR,PORTLAND,OR,97216,,,
25975,B08041,ALFRED ADLER GRADUATE SCHOOL,1001 WEST HIGHWAY 7 SUITE 344,HOPKINS,MN,55305,,,


**Note**: if the index contained duplicate labels, every row with the specified label would be dropped.

In [8]:
## Drop a column
# axis="columns" is an alias for axis=1, so these two calls are equivalent:
#   df.drop("Province", axis="columns")
#   df.drop("Province", axis=1)
df.drop("Province", axis="columns").head()

Unnamed: 0_level_0,SchoolCode,SchoolName,Address,City,StateCode,ZipCode,Country,PostalCode
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
25969,B04724,WIDENER UNIV SCHOOL OF LAW - DE,4601 CONCORD PIKE/PO BOX 7474,WILMINGTON,DE,19803,,
25970,B06171,CENTER FOR ADVANCED STUDIES OF PUER,BOX S-4467,SAN JUAN,PR,902,,
25971,B06511,PENTECOSTAL THEOLOGICAL SEMINARY,PO BOX 3330,CLEVELAND,TN,37320,,
25972,B07022,THE CHICAGO SCHOOL OF PROF PSYCHOLOGY,325 NORTH WELLS STREET,CHICAGO,IL,60610,,
25973,B07624,NATIONAL COLLEGE OF NATURAL MEDICINE,049 SW PORTER,PORTLAND,OR,97201,,


In [9]:
# dropping multiple columns at once: pass a list of names
# (the names don't need to be in the same order as the DataFrame's columns)
df.drop(["PostalCode","Country","Province"], axis="columns").head()

Unnamed: 0_level_0,SchoolCode,SchoolName,Address,City,StateCode,ZipCode
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
25969,B04724,WIDENER UNIV SCHOOL OF LAW - DE,4601 CONCORD PIKE/PO BOX 7474,WILMINGTON,DE,19803
25970,B06171,CENTER FOR ADVANCED STUDIES OF PUER,BOX S-4467,SAN JUAN,PR,902
25971,B06511,PENTECOSTAL THEOLOGICAL SEMINARY,PO BOX 3330,CLEVELAND,TN,37320
25972,B07022,THE CHICAGO SCHOOL OF PROF PSYCHOLOGY,325 NORTH WELLS STREET,CHICAGO,IL,60610
25973,B07624,NATIONAL COLLEGE OF NATURAL MEDICINE,049 SW PORTER,PORTLAND,OR,97201


## Others

- `pop` method - removes a column from the DataFrame in place and returns it as a Series, which can be assigned to a variable
> `s = df.pop("PostalCode")`
