In [1]:
import pandas as pd
import os

## Obtain a dataset from data.gov

https://catalog.data.gov/dataset

In [2]:
from six.moves import urllib

# where the datasets will be placed
ROOT_DATA = "../../../ROOT_DATA/data_gov/"

def fetch_data_from_URL(housing_url, file_name, sub_dir="tmp", root_path=ROOT_DATA):
    """Download ``housing_url`` once and return the path of the local copy.

    The file is stored at ``root_path/sub_dir/file_name``; the directory is
    created when it does not exist. If the file is already there, nothing is
    downloaded and the existing path is returned.

    Parameters
    ----------
    housing_url : str
        URL to retrieve.
    file_name : str
        Name to give the downloaded file.
    sub_dir : str, optional
        Directory below ``root_path`` that receives the file.
    root_path : str, optional
        Base directory for downloads (defaults to ``ROOT_DATA``).

    Returns
    -------
    str
        Path of the local file.

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` raises when the download fails;
    in that case no file is left at the returned location.
    """
    placement_dir = os.path.join(root_path, sub_dir)
    if not os.path.isdir(placement_dir):
        os.makedirs(placement_dir)
    placement_path = os.path.join(placement_dir, file_name)
    # only download if not already present
    if not os.path.isfile(placement_path):
        # Download under a temporary name and rename once complete. Writing
        # straight to placement_path would leave a truncated file behind if
        # the transfer is interrupted, and the isfile() check above would then
        # treat that broken file as a valid cached copy on every later run.
        partial_path = placement_path + ".part"
        try:
            urllib.request.urlretrieve(housing_url, partial_path)
            os.rename(partial_path, placement_path)
        finally:
            # After a successful rename the partial file no longer exists;
            # this only fires when the download (or rename) failed.
            if os.path.isfile(partial_path):
                os.remove(partial_path)
    return placement_path

In [3]:
# .CSV data: traffic violations export from data.montgomerycountymd.gov
traffic_csv_url = "https://data.montgomerycountymd.gov/api/views/4mse-ku6q/rows.csv?accessType=DOWNLOAD"
traffic_csv_path = fetch_data_from_URL(
    traffic_csv_url,
    "traffic_violations.csv",
    sub_dir="traffic",
)

In [4]:
# read entire file into a dataframe
t_df = pd.read_csv(traffic_csv_path)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
t_df.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1251972 entries, 0 to 1251971
Data columns (total 35 columns):
Date Of Stop               1251972 non-null object
Time Of Stop               1251972 non-null object
Agency                     1251972 non-null object
SubAgency                  1251962 non-null object
Description                1251963 non-null object
Location                   1251970 non-null object
Latitude                   1158178 non-null float64
Longitude                  1158178 non-null float64
Accident                   1251972 non-null object
Belts                      1251972 non-null object
Personal Injury            1251972 non-null object
Property Damage            1251972 non-null object
Fatal                      1251972 non-null object
Commercial License         1251972 non-null object
HAZMAT                     1251972 non-null object
Commercial Vehicle         1251972 non-null object
Alcohol                    1251972 non-null object
Work Zone         

In [5]:
# pull the 'Charge' column out of the data frame as a series
charge_ds = t_df['Charge']
charge_preview = charge_ds.head()
print(charge_preview)

0     13-401(h)
1    21-201(a1)
2     21-403(b)
3     21-402(b)
4    21-405(e1)
Name: Charge, dtype: object


In [6]:
# ascending sort (the default), then show the first rows
print(
    charge_ds
    .sort_values()
    .head()
)

1140408    10-303(a)
756371     10-303(a)
580317     10-303(a)
1208885    10-303(a)
303278     10-303(a)
Name: Charge, dtype: object


### Sort series

In [7]:
# descending sort: largest values first
print(
    charge_ds
    .sort_values(ascending=False)
    .head()
)

837970           90
1091898          90
1093648          90
1251349          90
1031348    9-337(a)
Name: Charge, dtype: object


In [8]:
# Methods can be chained: the tail of a descending sort holds the same rows as
# the head of an ascending sort_values(), here listed in the opposite order.
print(
    charge_ds
    .sort_values(ascending=False)
    .tail()
)

303278     10-303(a)
1208885    10-303(a)
580317     10-303(a)
756371     10-303(a)
1140408    10-303(a)
Name: Charge, dtype: object


### Note about NaN

In [9]:
d_ds = t_df['Description']

# Ascending sort (the default): smallest values first
print(
    d_ds
    .sort_values()
    .head()
)

# Note the NaN: one might expect the tail of a descending sort to show the
# same rows as d_ds.sort_values().head(), but it doesn't, because
# sort_values() places missing values last whatever the sort direction
# (na_position='last' is the default).
print(
    d_ds
    .sort_values(ascending=False)
    .tail()
)

873135                                       #3 RT FLAT TIRE
1202280                                                    )
1036398    , ATTEMPTING TO DRIVE MOTOR VEHICLE ON HIGHWAY...
746547                             , IMPROPER EXHAUST SYSTEM
918268                            , MARKER LIGHT INOPERATIVE
Name: Description, dtype: object
1049222    NaN
1051440    NaN
1164359    NaN
1205989    NaN
1225527    NaN
Name: Description, dtype: object


## What if I want to overwrite the values?

We can either call a method and assign its output back to our original data frame or series, or we can use the `inplace` parameter (provided it is available).

In [10]:
# sort_values() hands back a new, sorted series ...
print(
    charge_ds
    .sort_values()
    .head()
)
# ... while charge_ds itself keeps its original order
print(charge_ds.head())

1140408    10-303(a)
756371     10-303(a)
580317     10-303(a)
1208885    10-303(a)
303278     10-303(a)
Name: Charge, dtype: object
0     13-401(h)
1    21-201(a1)
2     21-403(b)
3     21-402(b)
4    21-405(e1)
Name: Charge, dtype: object


### Using `inplace`
```python
charge_ds.sort_values(inplace=True)
```
But... this won't work here since charge_ds is "a view of some other array". This is because the series was obtained from the data frame and is not a standalone series.

```
ValueError: This Series is a view of some other array, to sort in-place you must create a copy
```

We can get around this by making a copy of the series, or by reading in the series by itself.

In [11]:
# Read only the "Charge" column, then collapse the one-column frame to a Series.
# The `squeeze=True` keyword of read_csv was deprecated in pandas 1.4 and
# removed in 2.0; DataFrame.squeeze("columns") is the documented replacement
# and also works on older pandas versions.
t_ds = pd.read_csv(traffic_csv_path, usecols=["Charge"]).squeeze("columns")
print(t_ds.head())

0     13-401(h)
1    21-201(a1)
2     21-403(b)
3     21-402(b)
4    21-405(e1)
Name: Charge, dtype: object


In [12]:
# sorting without inplace returns a new series ...
print(
    t_ds
    .sort_values()
    .head()
)
# ... so t_ds is untouched and this prints the same rows as the cell above
print(t_ds.head())

1140408    10-303(a)
756371     10-303(a)
580317     10-303(a)
1208885    10-303(a)
303278     10-303(a)
Name: Charge, dtype: object
0     13-401(h)
1    21-201(a1)
2     21-403(b)
3     21-402(b)
4    21-405(e1)
Name: Charge, dtype: object


In [13]:
# with inplace=True the method returns None and reorders t_ds itself
value_sort_result = t_ds.sort_values(inplace=True)
print(value_sort_result)
# now the change takes place "inplace"
print(t_ds.head())

None
1140408    10-303(a)
756371     10-303(a)
580317     10-303(a)
1208885    10-303(a)
303278     10-303(a)
Name: Charge, dtype: object


## What if we want to get it back to where it is sorted by the index?

In [15]:
# sort_index() returns a new series ordered by index label ...
print(
    t_ds
    .sort_index()
    .head()
)
# ... while t_ds itself keeps the value-sorted order from before
print(t_ds.head())

0     13-401(h)
1    21-201(a1)
2     21-403(b)
3     21-402(b)
4    21-405(e1)
Name: Charge, dtype: object
1140408    10-303(a)
756371     10-303(a)
580317     10-303(a)
1208885    10-303(a)
303278     10-303(a)
Name: Charge, dtype: object


### inplace also works here

In [21]:
# inplace=True: the call returns None and t_ds is reordered by its index
index_sort_result = t_ds.sort_index(inplace=True)
print(index_sort_result)
print(t_ds.head())

None
0     13-401(h)
1    21-201(a1)
2     21-403(b)
3     21-402(b)
4    21-405(e1)
Name: Charge, dtype: object
