# Locations - Additional Data Cleaning

- Read locations from `CCMF_Locations_Edited.csv`
- Do some additional cleaning to standardize city names, revise the provinces for cities which have been assigned to the wrong province, and revise some individual entries where the location is unclear or needs additional review
- Create a reference table of all the unique cities in this data
- For the few incidents with multiple locations, flag for review and assign a single location (the first one listed), recording the full list of locations in the notes
- Save the table of cleaned incident locations and the locations reference table to CSV files

In [1]:
import pandas as pd
import numpy as np
# Show up to 100 rows when a DataFrame or Series is displayed
pd.set_option('display.max_rows', 100)

# Switch to False to run the notebook without writing the output CSV files
save_csv = True

In [2]:
# Raw city/region value -> standardized name.
# A value of NaN blanks out entries that are really a province or territory
# (or a misspelling of one) rather than a city.
cities_update = {
    'Bridge River Lillooet': 'Lillooet',
    'Canarama': 'Saskatoon',
    'Charolettetown': 'Charlottetown',
    'Enerby': 'Enderby',
    'Gaspé region': 'Gaspé',
    'Newfoundland Labrador': np.nan,
    "Nova Scotia's Pictou County": 'Pictou County',
    'Nunavut': np.nan,
    'Peel District': 'Peel Region',
    'Peel': 'Peel Region',
    'QC City': 'Quebec City',
    'Saskachewan': np.nan,
    'Statford': 'Stratford',
    'Village of Perdue': 'Perdue',
    "Wet’suwet’en": "Wet'suwet'en",
}

# Correct province -> cities that were assigned to the wrong province
prov_updates = {
    'British Columbia': [
        'Nanaimo',
        'Port Alberni',
        'West Vancouver',
        'Whistler',
        'Williams Lake',
    ],
    'Nova Scotia': ['Amherst', 'Berwick'],
    'Ontario': ['Timmins'],
}

# Detailed (sub-city level) location -> the city or region it is filed under
detailed_locs = {
    'Anjou': 'Montreal',
    'Mount Polley': 'Cariboo Region',
    'Steveston': 'Richmond',
    'York': 'Toronto',
    'East Vancouver': 'Vancouver',
}

In [3]:
# Read locations data and do some additional cleaning:
# - rename the raw `location` column to `city_or_region`
# - treat 'Unknown' as missing
# - standardize city names, the `national` flag and one province name
loc_file = 'CCMF_Locations_Edited.csv'
locations = (pd.read_csv(loc_file)
             .rename({'location' : 'city_or_region'}, axis=1)
             .replace({'Unknown' : np.nan})
             .replace({'national' : {True : 'Yes'},
                       'city_or_region' : cities_update,
                       'province' : {'Newfoundland Labrador' : 'Newfoundland and Labrador'}
                      })
            )

# Update the provinces for cities which have incorrect provinces listed
for prov, city_list in prov_updates.items():
    locations.loc[locations['city_or_region'].isin(city_list), 'province'] = prov

# A few incidents have more detailed (sub city level) locations: keep the
# detailed name in its own column, then replace it with the parent city/region
idx_detail = locations['city_or_region'].isin(detailed_locs)
locations.loc[idx_detail, 'detailed_location'] = locations.loc[idx_detail, 'city_or_region']
locations['city_or_region'] = locations['city_or_region'].replace(detailed_locs)

# Consolidated raw data - extract article URLs and merge with locations dataframe
rr = pd.read_csv('../race_relations_raw_consolidated.csv')
locations = locations.merge(rr[['incident_id', 'article_url']], on='incident_id', how='outer')

# Manually revise and/or flag entries for specific articles.
# Each entry is (article URL, {column: new value}); entries are applied in order,
# so the `needs_review` and `notes` columns are created by the first entry.
manual_revisions = [
    ('https://www.theglobeandmail.com/politics/article-disqualified-conservative-leadership-candidate-sues-party-to-re-enter/',
     {'province' : 'Ontario',
      'needs_review' : 'Check city',
      'notes' : 'No mention of the city of Cambridge in the article.',
     }),
    ('https://www.cbc.ca/news/canada/british-columbia/agassiz-rcmp-fishermen-harassment-1.4803129',
     {'city_or_region' : 'Hope',
      'needs_review' : 'Check city',
      'notes' : ("Location originally listed as Fraser River, which isn't a city and might"
                 ' be difficult to map. Article describes location as between the cities of'
                 ' Hope and Yale.'
                ),
     }),
    ('https://www.coastmountainnews.com/news/b-c-feds-accused-of-environmental-racism-over-site-c-mount-polley/',
     {'province' : 'British Columbia'}),
    ('https://www.cbc.ca/news/canada/british-columbia/former-b-c-jail-guard-wins-human-rights-case-for-racism-1.5202461',
     {'city_or_region' : 'Port Coquitlam'}),
    ('https://ricochet.media/en/2961/bomb-threat-made-against-wetsuweten-and-tyendinaga-mohawks-as-far-right-escalates-talk-of-violence',
     {'province' : 'British Columbia',
      'needs_review' : 'Multiple locations',
      'notes' : ('Article also mentions a threat to Tyendinaga Mohawks in Ontario.'
                 ' Should an additional entry be added for the Ontario location?'
                ),
     }),
    ('https://www.cheknews.ca/new-video-protesters-push-plywood-at-truck-going-through-wetsuweten-highway-19-blockade-645649/',
     {'city_or_region' : 'Courtenay',
      'province' : 'British Columbia',
      'needs_review' : 'Check city',
      'notes' : ("Location originally listed as Wet'suwet'en but article describes the "
                 'location as being near Courtenay (on the Vancouver Island).'
                ),
     }),
]

for url, revisions in manual_revisions:
    idx = locations['article_url'] == url
    if not idx.any():
        # Surface revisions that no longer match any row (e.g. the URL changed
        # in the raw data) instead of skipping them silently
        print(f'WARNING: no incident matches article URL, revision not applied: {url}')
    # The assignment still runs when nothing matches, so the target columns
    # are always created
    for column, value in revisions.items():
        locations.loc[idx, column] = value

print(locations.shape)
locations.head()

(1011, 8)


Unnamed: 0,incident_id,city_or_region,province,national,detailed_location,article_url,needs_review,notes
0,1,Burnaby,British Columbia,,,https://globalnews.ca/news/3949365/b-c-woman-c...,,
1,2,Windsor,Ontario,,,https://www.cbc.ca/news/canada/windsor/graffit...,,
2,3,Ottawa,Ontario,,,https://nationalpost.com/news/politics/sen-lyn...,,
3,4,Calgary,Alberta,,,https://www.cbc.ca/news/canada/calgary/judge-e...,,
4,5,Tignish,Prince Edward Island,,,http://nationalpost.com/news/canada/p-e-i-legi...,,


## Locations reference table

In [4]:
# Create reference table of locations for dropdown menus in data entry form:
# one row per unique (province, city_or_region) pair, sorted by province then city

loc_ref = (locations[['province', 'city_or_region']].drop_duplicates()
          .dropna(subset=['city_or_region'])
          .sort_values(['province', 'city_or_region'])
          .reset_index(drop=True)
         )

# Add an entry for Yukon, so that we have at least one city listed for it.
# pd.concat is used because DataFrame.append was removed in pandas 2.0; the
# drop_duplicates call keeps the table unique if Whitehorse is already present.
yukon_entry = pd.DataFrame({'province' : ['Yukon'], 'city_or_region' : ['Whitehorse']})
loc_ref = (pd.concat([loc_ref, yukon_entry])
           .drop_duplicates()
           .reset_index(drop=True)
          )
print(loc_ref.shape)
loc_ref.head()

(158, 2)


Unnamed: 0,province,city_or_region
0,Alberta,Athabasca
1,Alberta,Brooks
2,Alberta,Calgary
3,Alberta,Cardston
4,Alberta,Edmonton


In [5]:
# Number of reference cities per province; dropna=False also counts cities with no province assigned
loc_ref['province'].value_counts(dropna=False)

Ontario                      46
British Columbia             34
Alberta                      21
Quebec                       17
Nova Scotia                  16
Manitoba                      8
Saskatchewan                  4
Newfoundland and Labrador     4
New Brunswick                 3
Prince Edward Island          2
Northwest Territories         1
Nunavut                       1
Yukon                         1
Name: province, dtype: int64

## Incidents with multiple locations

In [6]:
# Count how many location rows each incident has
locs_per_incident = locations.groupby('incident_id').size()

# Incident IDs that appear on more than one row
has_several = locs_per_incident.gt(1)
multiple_locs = locs_per_incident.loc[has_several].index

# Row mask selecting every row belonging to one of those incidents
idx_multi = locations['incident_id'].isin(multiple_locs)

# Mark those rows for review, then display them
locations.loc[idx_multi, 'needs_review'] = 'Multiple locations'

locations.loc[idx_multi]

Unnamed: 0,incident_id,city_or_region,province,national,detailed_location,article_url,needs_review,notes
188,189,Edmonton,Alberta,,,https://www.cbc.ca/news/canada/edmonton/female...,Multiple locations,
189,189,Ottawa,Ontario,,,https://www.cbc.ca/news/canada/edmonton/female...,Multiple locations,
258,258,Edmonton,Alberta,,,https://www.straight.com/news/1178116/canadas-...,Multiple locations,
259,258,Calgary,Alberta,,,https://www.straight.com/news/1178116/canadas-...,Multiple locations,
260,258,Vancouver,British Columbia,,,https://www.straight.com/news/1178116/canadas-...,Multiple locations,
263,261,Edmonton,Alberta,,,https://globalnews.ca/news/4789167/alberta-yel...,Multiple locations,
264,261,Calgary,Alberta,,,https://globalnews.ca/news/4789167/alberta-yel...,Multiple locations,
265,261,Edson,Alberta,,,https://globalnews.ca/news/4789167/alberta-yel...,Multiple locations,
266,261,Brooks,Alberta,,,https://globalnews.ca/news/4789167/alberta-yel...,Multiple locations,
267,261,Medicine Hat,Alberta,,,https://globalnews.ca/news/4789167/alberta-yel...,Multiple locations,


### Collapse dataframe to 1 row per incident

For multi-location incidents, keep the first listed location and include a list of all locations in the notes column (a note that already exists for the incident is kept rather than replaced).

In [7]:
# Lookup of province name -> 2-letter code
prov_codes = pd.read_csv('province_codes.csv').set_index('province')['code'].to_dict()

# Build "city, XX" labels (XX = 2-letter province code) for every row of a
# multi-location incident, keyed by incident_id
multi_rows = locations.loc[idx_multi].set_index('incident_id')
province_abbrev = multi_rows['province'].replace(prov_codes)
loc_names = (multi_rows['city_or_region']
             .str.cat(province_abbrev, sep=', ')
             .rename('multi_locations')
             .reset_index()
            )

# One ;-separated string of labels per incident
multi_loc_names = (loc_names.groupby('incident_id')['multi_locations']
                   .agg(lambda names: 'Multiple locations: ' + '; '.join(names))
                  )

# Attach the combined string to every row of its incident, use it as the note
# wherever no note exists yet, then keep only the first row of each incident
locations = (locations
             .drop(columns='multi_locations', errors='ignore')
             .merge(multi_loc_names.reset_index(), how='outer', on='incident_id')
             .assign(notes=lambda df: df['notes'].fillna(df['multi_locations']))
             .drop(columns='multi_locations')
             .drop_duplicates(subset='incident_id', keep='first')
             .sort_values('incident_id')
             .reset_index(drop=True)
            )

# Show all incident locations flagged for review
locations.loc[locations['needs_review'].notna()]

Unnamed: 0,incident_id,city_or_region,province,national,detailed_location,article_url,needs_review,notes
168,169,Hope,British Columbia,,,https://www.cbc.ca/news/canada/british-columbi...,Check city,"Location originally listed as Fraser River, wh..."
188,189,Edmonton,Alberta,,,https://www.cbc.ca/news/canada/edmonton/female...,Multiple locations,"Multiple locations: Edmonton, AB; Ottawa, ON"
257,258,Edmonton,Alberta,,,https://www.straight.com/news/1178116/canadas-...,Multiple locations,"Multiple locations: Edmonton, AB; Calgary, AB;..."
260,261,Edmonton,Alberta,,,https://globalnews.ca/news/4789167/alberta-yel...,Multiple locations,"Multiple locations: Edmonton, AB; Calgary, AB;..."
310,311,Vancouver,British Columbia,,,https://www.straight.com/news/1215686/new-zeal...,Multiple locations,"Multiple locations: Vancouver, BC; Edmonton, A..."
589,590,Happy Valley-Goose Bay,Newfoundland and Labrador,,,https://www.cbc.ca/news/canada/newfoundland-la...,Multiple locations,"Multiple locations: Happy Valley-Goose Bay, NL..."
804,805,Wet'suwet'en,British Columbia,,,https://ricochet.media/en/2961/bomb-threat-mad...,Multiple locations,Article also mentions a threat to Tyendinaga M...
806,807,Calgary,Alberta,,,https://globalnews.ca/news/6600386/calgary-col...,Multiple locations,"Multiple locations: Calgary, AB; Red Deer, AB"
830,831,Cambridge,Ontario,,,https://www.theglobeandmail.com/politics/artic...,Check city,No mention of the city of Cambridge in the art...
879,880,Courtenay,British Columbia,,,https://www.cheknews.ca/new-video-protesters-p...,Check city,Location originally listed as Wet'suwet'en but...


In [8]:
# (rows, columns) of the locations table after collapsing to one row per incident
locations.shape

(1000, 8)

In [9]:
# Number of distinct incident IDs; equals the row count of `locations` when every incident has exactly one row
locations['incident_id'].nunique()

1000

### Save CSV files

In [10]:
def save_data(data, savefile, index=False):
    """Write `data` to a CSV file, printing the destination first.

    Parameters
    ----------
    data : object with a ``to_csv`` method (DataFrames in this notebook)
        The table to write.
    savefile : str or path-like
        Destination passed to ``data.to_csv``.
    index : bool, default False
        Forwarded to ``to_csv`` as its ``index`` argument.
    """
    print('Saving to {}'.format(savefile))
    data.to_csv(savefile, index=index)

In [11]:
# Write both output tables, but only when saving is switched on
if save_csv:
    for frame, csv_path in ((locations, 'incident_locations.csv'),
                            (loc_ref, 'locations_reference.csv')):
        save_data(frame, csv_path)

Saving to incident_locations.csv
Saving to locations_reference.csv
