# Raw Data Pre-Processing

- Load raw data from CCMF's spreadsheet
- Consolidate years 2018-2020 into one table and assign an ID number to each incident
- Extract article URLs for web scraping
- Extract locations for further data cleaning
- Do some preliminary cleaning of the various other tags (target communities, incident categories)
- Save consolidated raw data, locations, and tags to CSV files for further processing

In [1]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse
# Show up to 100 rows when a long Series / DataFrame is displayed
pd.options.display.max_rows = 100

# Set to False to run the notebook without writing any CSV files
save_csv = True

### Load data and concatenate

In [2]:
datafile = '../Raw_Data_from_CCMF/20200703 - 2018-2020 Quantifying Race Relations.xlsx'
sheet_names = ['2018 collection', '2019 collection', '2020 collection']
# Number of extra rows above the column headers in each sheet
extra_header_rows = [2, 0, 0]

# Spreadsheet column headers -> short column names used in this notebook
short_names = {
    'Date (MM/DD/YY)' : 'date',
    'Location (City)' : 'location',
    'Criminal (C) vs. Non-Criminal (NC)' : 'category',
    'Sub-Categorization of Criminal v. Non-Criminal' : 'sub_category',
    'Target Community' : 'target_community',
    'Notes' : 'description',
    'Link to Article ' : 'article_url'
}

# Read race relations data for each year; the per-year frames are collected in a list
# and concatenated once at the end (DataFrame.append was removed in pandas 2.0)
yearly_frames = []
for sheet_name, skiprows in zip(sheet_names, extra_header_rows):
    df = (pd.read_excel(datafile, sheet_name=sheet_name, skiprows=skiprows)
          .rename(short_names, axis=1)
          .drop('Number', axis=1, errors='ignore')
          .dropna(how='all')
         )

    # Some missing / unconfirmed dates have been flagged with ?, set these to NaN and convert
    # column to datetime. The "?" is matched literally (as a regex, '/?' matches every string).
    if not pd.api.types.is_datetime64_any_dtype(df['date']):
        flagged = df['date'].astype(str).str.contains('?', regex=False)
        df.loc[flagged, 'date'] = np.nan
        # Any other unparseable entry also becomes NaT instead of aborting the load
        df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # Parse dates and add some info to facilitate comparison of cleaned data with the original spreadsheet
    df['year'] = df['date'].dt.year
    df['orig_sheet_name'] = sheet_name
    df['orig_row_num'] = df.index + 1 + skiprows

    yearly_frames.append(df)

# Consolidate all years into the main dataframe
rr = pd.concat(yearly_frames)

# Assign a unique integer ID to each incident and set as the dataframe's index
rr['incident_id'] = np.arange(1, len(rr) + 1)
rr = rr.set_index('incident_id', drop=True)

# Strip outer white space from text columns
for col in rr.select_dtypes(include=object).columns:
    rr[col] = rr[col].str.strip()

print(rr.shape)
rr.tail()

(1000, 10)


Unnamed: 0_level_0,date,location,category,sub_category,target_community,description,article_url,year,orig_sheet_name,orig_row_num
incident_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
996,2020-06-23,Toronto,Police Investigation,hate crime,anti-black,Two nooses were placed in areas at which two B...,https://globalnews.ca/news/7098601/anti-black-...,2020.0,2020 collection,259
997,2020-06-22,Abbotsford,HR investigation,hate speech,anti-black,UFV investigating two students accused of racism,https://www.citynews1130.com/2020/06/22/ufv-in...,2020.0,2020 collection,260
998,2020-06-26,Montreal,NC,profiled,anti-black,West Island grad labelled 'most likely to beco...,https://www.cbc.ca/news/canada/montreal/wanted...,2020.0,2020 collection,261
999,2020-06-26,British Columbia,HR investigation,discrimination at work,anti-immigrant,Foreign-trained doctors file B.C. human rights...,https://globalnews.ca/news/7114162/b-c-foreign...,2020.0,2020 collection,262
1000,2020-06-26,Vancouver,NC,political,white supremacist,Vancouver Police Chief says the suggestion of ...,https://vancouversun.com/news/local-news/dan-f...,2020.0,2020 collection,263


### Data overview

In [3]:
# Column dtypes and non-null counts of the consolidated table
rr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 1000
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              997 non-null    datetime64[ns]
 1   location          1000 non-null   object        
 2   category          999 non-null    object        
 3   sub_category      999 non-null    object        
 4   target_community  997 non-null    object        
 5   description       999 non-null    object        
 6   article_url       1000 non-null   object        
 7   year              997 non-null    float64       
 8   orig_sheet_name   1000 non-null   object        
 9   orig_row_num      1000 non-null   int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(7)
memory usage: 85.9+ KB


In [4]:
# Summarize the non-numeric (object dtype) columns: count, number of unique values,
# most frequent value and its frequency
rr.describe(include=object)

Unnamed: 0,location,category,sub_category,target_community,description,article_url,orig_sheet_name
count,1000,999,999,997,999,1000,1000
unique,219,52,117,163,996,984,3
top,Ottawa,NC,race relations,anti-black,Suspect wanted in connection with 4 acts of ha...,https://newsinteractives.cbc.ca/longform/covid...,2019 collection
freq,92,638,212,177,2,4,478


In [5]:
# Number of incidents each year; dropna=False also counts incidents with no year
rr['year'].value_counts(dropna=False).sort_index()

2018.0    258
2019.0    480
2020.0    259
NaN         3
Name: year, dtype: int64

### Article URLs

URLs and domains for each article, for web scraping.

In [6]:
def extract_domain(url):
    """Return the last two labels of a URL's host name, e.g. 'cbc.ca' for 'https://www.cbc.ca/news'.

    The host name is taken from ``urlparse(url).hostname``, so it is lower-cased and
    carries no port or user-info. An empty string is returned when `url` is not a
    string (e.g. NaN for a missing value) or has no network location (e.g. no scheme).

    Note: only the last two labels are kept, so multi-part public suffixes such as
    'example.co.uk' are reduced to 'co.uk'.
    """
    if not isinstance(url, str):
        return ''
    host = urlparse(url.strip()).hostname or ''
    return '.'.join(host.split('.')[-2:])

In [7]:
# Article URL of every incident together with the domain it is hosted on
urls = rr[['article_url']].assign(domain=rr['article_url'].apply(extract_domain))
urls.head()

Unnamed: 0_level_0,article_url,domain
incident_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,https://globalnews.ca/news/3949365/b-c-woman-c...,globalnews.ca
2,https://www.cbc.ca/news/canada/windsor/graffit...,cbc.ca
3,https://nationalpost.com/news/politics/sen-lyn...,nationalpost.com
4,https://www.cbc.ca/news/canada/calgary/judge-e...,cbc.ca
5,http://nationalpost.com/news/canada/p-e-i-legi...,nationalpost.com


In [8]:
# Number of incidents per article domain (value_counts sorts most frequent first)
counts = urls['domain'].value_counts()
counts

cbc.ca                      339
globalnews.ca                90
ctvnews.ca                   89
thestar.com                  27
huffingtonpost.ca            26
                           ... 
americanbazaaronline.com      1
blogto.com                    1
thinkpol.ca                   1
jta.org                       1
worldbulletin.net             1
Name: domain, Length: 161, dtype: int64

In [9]:
# Domains that appear at least 10 times
counts[counts >= 10]

cbc.ca                   339
globalnews.ca             90
ctvnews.ca                89
thestar.com               27
huffingtonpost.ca         26
theglobeandmail.com       26
nationalpost.com          25
canada.com                14
citynews1130.com          14
vice.com                  12
citynews.ca               11
thechronicleherald.ca     11
cjnews.com                11
straight.com              11
calgaryherald.com         10
Name: domain, dtype: int64

### Data cleaning functions

In [10]:
def split_explode(series_in, split_comma=True, na_fill='n/a', lowercase=True):
    """Split strings containing multiple tags into their separate values and explode into separate rows.

    Parameters
    ----------
    series_in : pandas.Series
        Strings holding one or more tags separated by "/", a newline, ";" or
        (optionally) ",".
    split_comma : bool, default True
        Also treat "," as a tag delimiter. Pass False when commas are part of a
        tag (e.g. "city, province").
    na_fill : str or None, default 'n/a'
        Value substituted for missing entries; None leaves them missing.
    lowercase : bool, default True
        Convert the tags to lower case.

    Returns
    -------
    pandas.Series
        One row per tag, stripped of outer white space. The index label of the
        source row is repeated for each of its tags. Empty tags (e.g. from a
        trailing delimiter) are dropped.
    """
    delimiters = ['/', '\n']
    if split_comma:
        delimiters.append(',')

    # Remove outer white space and replace the various delimiters with one consistent
    # delimiter ";" (matched literally, independent of the pandas regex default)
    series_out = series_in.str.strip()
    for delimiter in delimiters:
        series_out = series_out.str.replace(delimiter, ';', regex=False)

    if lowercase:
        series_out = series_out.str.lower()

    # Fill after the delimiter replacement so that the "/" in the default 'n/a' is not split
    if na_fill is not None:
        series_out = series_out.fillna(na_fill)

    # Separate tags and strip white space
    series_out = (series_out.str.split(';')
                  .explode()
                  .str.strip()
                 )

    # Remove any rows containing only an empty string
    empty = series_out == ''

    return series_out[~empty]

### Locations

In [11]:
# Tags identifying the location of each incident
# Literal (old, new) substitutions, applied in order before the tags are split. They are
# matched with regex=False so that "(ON)" and "N.S." mean exactly those characters and
# the result does not depend on the pandas version's default for `regex`.
location_fixes = [
    ('Ontario', 'ON'),
    ('(ON)', ', ON'),
    ('Alberta', 'AB'),
    ('Quebec', 'QC'),
    ('N.S.', 'NS'),
    ('Edmonton, Calgary, Vancouver', 'Edmonton; Calgary; Vancouver'),
    ('Edmonton, Ottawa', 'Edmonton; Ottawa'),
    ('Vancouver , Edmonton , Lethbridge', 'Vancouver; Edmonton; Lethbridge'),
]

locations = rr['location']
for old, new in location_fixes:
    locations = locations.str.replace(old, new, regex=False)

# Commas are kept (split_comma=False) because they separate city from province
locations = split_explode(locations, split_comma=False, lowercase=False)

print(len(locations))
locations.head()

1011


incident_id
1    Burnaby
2    Windsor
3     Ottawa
4    Calgary
5    Tignish
Name: location, dtype: object

In [12]:
# Summarize unique location tags and their incident counts; the printed number is
# the count of distinct tags
location_counts = locations.value_counts()
print(len(location_counts))
location_counts

213


Ottawa              93
Toronto             90
Montreal            73
Vancouver           61
National            53
                    ..
Yarmouth, NS         1
Mount Polley         1
Georgina             1
Nunavut              1
Fraser River, BC     1
Name: location, Length: 213, dtype: int64

In [13]:
# The 100 most frequent location tags
location_counts.head(100)

Ottawa                                     93
Toronto                                    90
Montreal                                   73
Vancouver                                  61
National                                   53
Edmonton                                   45
Calgary                                    43
Winnipeg                                   36
Halifax                                    31
QC                                         24
Hamilton                                   18
ON                                         17
Mississauga                                16
Nova Scotia                                14
Regina                                     12
Peterborough                               11
London                                     10
Thunder Bay                                10
Burnaby                                     9
Surrey                                      9
Victoria                                    8
British Columbia                  

In [14]:
# Check the province values in tags that are in the form "city, province"
# (takes the piece after the first comma; it is not stripped, so values keep a leading space)
locations.str.split(',').str[1].value_counts()

 ON    21
 BC    17
 NS    12
 QC     7
 AB     5
 MB     4
 NL     2
 SK     1
Name: location, dtype: int64

In [15]:
# Number of locations per incident
# The columns are named via rename_axis / reset_index(name=...) rather than by renaming
# 'index' and 0, whose presence depends on the pandas version's value_counts() output.
(locations.groupby(level='incident_id')
 .size()
 .value_counts()
 .rename_axis('Locations per Incident')
 .reset_index(name='Number of Incidents')
)

Unnamed: 0,Locations per Incident,Number of Incidents
0,1,994
1,2,3
2,3,2
3,5,1


In [16]:
# Look for spelling variants among location tags starting with "Fred"
locations[locations.str.startswith('Fred')].value_counts()

Fredericton     2
Frederickton    1
Fredricton      1
Name: location, dtype: int64

In [17]:
# Look for spelling variants among location tags starting with "Winn"
locations[locations.str.startswith('Winn')].value_counts()

Winnipeg    36
Winnepeg     1
Name: location, dtype: int64

In [18]:
# Incidents whose location is recorded as "Unknown"
rr[rr['location'] == 'Unknown']

Unnamed: 0_level_0,date,location,category,sub_category,target_community,description,article_url,year,orig_sheet_name,orig_row_num
incident_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
837,2020-03-28,Unknown,NC,hate speech,anti-immigrant ; sexist,Online press conference highlighting Laval imm...,https://www.halifaxexaminer.ca/featured/toxify...,2020.0,2020 collection,98
838,2020-03-29,Unknown,NC,hate speech,anti-black ; homophobia,Webinar on COVID-19 and the future of public t...,https://www.halifaxexaminer.ca/featured/toxify...,2020.0,2020 collection,99
840,2020-04-03,Unknown,NC,hate speech,anti-black,Virtual town hall held by YWCA Canada is zoomb...,https://www.cbc.ca/news/technology/zoombombing...,2020.0,2020 collection,101


### Target communities

In [19]:
# Tags identifying the different target communities for each incident

# Substring corrections, applied in this order to every tag
target_substring_fixes = [
    ('ant-', 'anti-'),
    ('ani-', 'anti-'),
    ('anti ', 'anti-'),
    ('immigrant', 'immigration'),
    ('muslims', 'muslim'),
    ('muslilm', 'muslim'),
    ('semetic', 'semitic'),
]

# Whole-tag replacements (the tag must match the key exactly)
target_tag_fixes = {
    'anti-semitism': 'anti-semitic',
    'white supremacist': 'white supremacists',
    'white supremasists': 'white supremacists',
    'white-supremacist': 'white supremacists',
    'white supremacy': 'white supremacists',
    'gender': 'gendered',
    'homophobic': 'homophobia',
    'indian': 'anti-indian',
    'not specified': 'n/a',
    'not identified': 'n/a',
    'lgbtq2s+': 'anti-lgbtq2s+',
    'lgbtq2+': 'anti-lgbtq2s+',
    'anti-lbtq2s+': 'anti-lgbtq2s+',
    'other': 'n/a',
}

# " - " is treated as one more tag delimiter before splitting
targets = split_explode(rr['target_community'].str.replace(' - ', ';'))
for old, new in target_substring_fixes:
    targets = targets.str.replace(old, new)
targets = targets.replace(target_tag_fixes)

print(len(targets))
targets.head()

1207


incident_id
1         anti-black
2        anti-muslim
3    anti-indigenous
4         anti-black
5          anti-sikh
Name: target_community, dtype: object

In [20]:
# Summarize the unique target-community tags and their incident counts; the printed
# number is the count of distinct tags
target_counts = targets.value_counts()
print(len(target_counts))
target_counts

40


anti-black              225
anti-indigenous         193
anti-muslim             131
anti-racism             129
anti-semitic             96
white supremacists       89
anti-immigration         76
anti-asian               56
n/a                      55
anti-chinese             28
anti-sikh                21
anti-indian              19
gendered                 14
anti-arab                10
anti-poc                  9
homophobia                8
anti-lgbtq2s+             8
anti-south asian          5
sexist                    5
anti-filipino             4
anti-palestinian          3
anti-afghan               2
race relations            2
anti-francophone          2
anti-iranian              2
anti-african              1
anti-taiwanese            1
anti-israel               1
anti-white                1
anti-mexican              1
anti-syrian               1
anti-middle eastern       1
anti-discrimination       1
anti-christian            1
environmental racism      1
anti-islam          

### Incident categories

In [21]:
# Tags identifying the category of each incident

# Whole-tag replacements (the tag must match the key exactly)
category_tag_fixes = {
    'unknown': 'n/a',
    'other': 'n/a',
    'not specified': 'n/a',
    'bc': 'n/a',
    'class action': 'class action lawsuit',
}

categories = split_explode(rr['category']).replace(category_tag_fixes)
print(len(categories))
categories.head()

1002


incident_id
1    nc
2    nc
3    nc
4    nc
5    nc
Name: category, dtype: object

In [22]:
# Summarize the unique category tags and their incident counts; the printed number
# is the count of distinct tags
category_counts = categories.value_counts()
print(len(category_counts))
category_counts

43


nc                                                                     638
police investigation                                                    93
c                                                                       89
hr investigation                                                        50
investigation                                                           38
human rights                                                            14
military investigation                                                  10
n/a                                                                     10
lawsuit                                                                  8
human rights complaint                                                   4
human rights tribunal                                                    4
political campaign                                                       4
political                                                                3
pending litigation       

The value "BC" in the "category" column of one incident looks like a data-entry error: either the province was mistakenly entered there, or it is a typo for "NC". It is mapped to "n/a" above.

### Sub-categories

In [23]:
# Tags identifying the sub-categories of each incident

# Whole-tag spelling corrections (the tag must match the key exactly)
sub_cat_tag_fixes = {
    'poltical': 'political',
    'politcal': 'political',
    'harrassment': 'harassment',
    'assualt': 'assault',
    'race realtions': 'race relations',
    'hate seech': 'hate speech',
}

# Substring corrections, applied in this order after the whole-tag fixes
sub_cat_substring_fixes = [
    ('racits', 'racist'),
    ('alligation', 'allegation'),
    ('discrimintation', 'discrimination'),
    ('law suit', 'lawsuit'),
]

sub_cats = split_explode(rr['sub_category']).replace(sub_cat_tag_fixes)
for old, new in sub_cat_substring_fixes:
    sub_cats = sub_cats.str.replace(old, new)
sub_cats = sub_cats.replace('other', 'n/a')

print(len(sub_cats))
sub_cats

1164


incident_id
1                  hate speech
2                    vandalism
3                  hate speech
4                  hate speech
5                   harassment
                 ...          
996                 hate crime
997                hate speech
998                   profiled
999     discrimination at work
1000                 political
Name: sub_category, Length: 1164, dtype: object

In [24]:
# Summarize unique sub-category tags and their incident counts; the printed number
# is the count of distinct tags
sub_cats_counts = sub_cats.value_counts()
print(len(sub_cats_counts))
sub_cats_counts

44


race relations                                                                                242
hate speech                                                                                   236
political                                                                                     175
vandalism                                                                                      90
profiled                                                                                       65
hate crime                                                                                     53
human rights                                                                                   49
discrimination at work                                                                         47
harassment                                                                                     38
n/a                                                                                            37
protest             

### Consolidate tags to apply taxonomy

In [25]:
# Category tags other than the bare "c" / "nc" codes, with a leading "c - " or
# "c (...)" wrapper removed. All patterns are matched literally (regex=False) so the
# result does not depend on the pandas version's default for `regex`.
# NOTE(review): "c - " is removed wherever it occurs in a tag, not only at the start — confirm.
category_tags = (categories[~categories.isin(['c', 'nc'])]
                 .str.replace('c - ', '', regex=False))
idx = category_tags.str.startswith('c (', na=False)
category_tags.loc[idx] = (category_tags.loc[idx]
                          .str.replace('c (', '', regex=False)
                          .str.replace(')', '', regex=False))

# Combine category, sub-category and target-community tags into one long table with
# one row per (incident_id, variable, tag); 'n/a' tags are discarded
tags = (category_tags.to_frame()
        .join(sub_cats.to_frame(), how='outer')
        .join(targets.to_frame(), how='outer')
        .reset_index(drop=False)
        .melt(id_vars='incident_id', value_name='tag')
        .replace({'tag' : {'n/a' : np.nan}})
        .dropna(subset=['tag'])
        .sort_values(['incident_id', 'variable'], ascending=[True, False])
        .drop_duplicates()
       )

# Drop some long descriptions that should have been in the incident descriptions instead of tags
idx_drop = (tags['tag'].str.startswith('ndp employee says')
            | tags['tag'].str.startswith('violates international convention'))
tags = tags[~idx_drop].reset_index(drop=True)
tags

Unnamed: 0,incident_id,variable,tag
0,1,target_community,anti-black
1,1,sub_category,hate speech
2,2,target_community,anti-muslim
3,2,sub_category,vandalism
4,3,target_community,anti-indigenous
...,...,...,...
2537,999,target_community,anti-immigration
2538,999,sub_category,discrimination at work
2539,999,category,hr investigation
2540,1000,target_community,white supremacists


In [26]:
# Number of occurrences of each tag, ordered alphabetically by tag; show the first 100
tag_counts = tags['tag'].value_counts().sort_index()
tag_counts.head(100)

allegations                                               23
allegations against police                                 1
anti-afghan                                                2
anti-african                                               1
anti-arab                                                 10
anti-asian                                                56
anti-black                                               225
anti-chinese                                              28
anti-christian                                             1
anti-discrimination                                        1
anti-filipino                                              4
anti-francophone                                           2
anti-immigration                                          76
anti-indian                                               19
anti-indigenous                                          193
anti-iranian                                               2
anti-islam              

In [27]:
# The remaining tags after the first 100
tag_counts.iloc[100:]

rcmp investigation                  2
robbery                             1
settlement                          1
sexist                              5
terror                              1
threats                             1
transphobia                         1
vandalism                          90
violates international law (un)     1
white supremacists                 89
Name: tag, dtype: int64

In [28]:
# One-column table of the distinct tags, in the (alphabetical) order of tag_counts
tags_unique = pd.DataFrame({'tag': tag_counts.index})
tags_unique

Unnamed: 0,tag
0,allegations
1,allegations against police
2,anti-afghan
3,anti-african
4,anti-arab
...,...
105,threats
106,transphobia
107,vandalism
108,violates international law (un)


### Save CSV files

In [29]:
def save_data(data, savefile, index=True):
    """Announce the destination, then write `data` to `savefile` as CSV.

    `data` is any object with a ``to_csv`` method; `index` is passed through to
    ``to_csv`` to control whether the index is written.
    """
    message = f'Saving to {savefile}'
    print(message)
    data.to_csv(savefile, index=index)

In [30]:
if save_csv:
    # (data, destination file, write the index?) for every output of this notebook
    outputs = [
        (rr, '../race_relations_raw_consolidated.csv', True),
        (urls, 'article_urls.csv', True),
        (locations, 'locations.csv', True),
        (tags, 'tags_all.csv', False),
        (tags_unique, 'tags_unique.csv', False),
    ]
    for data, savefile, keep_index in outputs:
        save_data(data, savefile, index=keep_index)

Saving to ../race_relations_raw_consolidated.csv
Saving to article_urls.csv
Saving to locations.csv
Saving to tags_all.csv
Saving to tags_unique.csv
