# Create Mock Data

Create a mock dataset, based on the preliminary cleaned data, for dashboard prototyping.

In [1]:
import pandas as pd
import numpy as np

# Toggle: when True, the final cell writes the mock data to mock_data.csv
save_csv = True

In [2]:
# Load the preliminary cleaned incident data and preview the first two rows
prelim_csv = 'race_relations_clean_prelim_2020-08-18.csv'
rr = pd.read_csv(prelim_csv)
rr.head(2)

Unnamed: 0,Incident ID,Article URL,Incident Description,Publication Date,Incident Date,Pub Date Minus Incident Date,Province,City or Region,Detailed Location,Incident Category,...,South Asian,Southeast Asian,Identity-Based,Context,Type of Incident,Previous Tags,Needs Review,D4G Notes,CCMF Notes,Follow-Up Action
0,169,https://www.cbc.ca/news/canada/british-columbi...,RCMP are looking for a group of fishermen accu...,2018-08-29,2018-08-29,0.0,British Columbia,Hope,,Criminal: Unknown,...,,,,,,C; race relations; anti-indigenous; Fraser Riv...,Check city,"Location originally listed as Fraser River, wh...",,
1,831,https://www.theglobeandmail.com/politics/artic...,"Conservative leadership candidate, who was dis...",2020-03-24,2020-03-24,0.0,Ontario,Cambridge,,Non-Criminal,...,,,Religious discrimination: Islamophobia,Politics,,NC; political; anti-muslim; Cambridge,Check city,No mention of the city of Cambridge in the art...,,


In [3]:
# List of all possible categories for each dimension.
# Each column of categories.csv is one dimension; melt turns the table into
# long (Dimension, Value) pairs. Only empty cells are read as missing
# (na_values='' with keep_default_na=False), so literal strings such as 'N/A'
# survive as real category values.
categories = (pd.read_csv('categories.csv', na_values='', keep_default_na=False)
              .melt(var_name='Dimension', value_name='Value')
              .dropna(subset=['Value'])
             )

# Append some additional multi-select values
extra_rows = [
    ['Gender of Victim(s)', 'Female, Male'],
    ['Ethnic Community', 'Black/African/Caribbean, Indigenous'],
    ['Ethnic Community', 'South Asian, Southeast Asian'],
    ['Ethnic Community', 'Central Asian, Middle Eastern'],
    ['Identity-Based', 'Religious discrimination: Islamophobia, Xenophobia'],
    ['Identity-Based', 'Religious discrimination: Anti-Semitism, Religious discrimination: Islamophobia']
]
extra = pd.DataFrame(extra_rows, columns=['Dimension', 'Value'])

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat gives the same result and works on all versions.
categories = pd.concat([categories, extra]).set_index('Dimension', drop=True)

print(categories.shape)
categories.sample(10, random_state=0)

(216, 1)


Unnamed: 0_level_0,Value
Dimension,Unnamed: 1_level_1
Context,Transit
Black/African/Caribbean,Comoros
Black/African/Caribbean,Togo
Southeast Asian,Singapore
Middle Eastern,Turkey
Black/African/Caribbean,Saint Pierre and Miquelon
Black/African/Caribbean,Zimbabwe
Middle Eastern,Cyprus
Gender of Victim(s),Male
Black/African/Caribbean,South Africa


In [4]:
def random_choice(choices, n, blank_ratio=2, seed=0):
    """Return a Series of `n` values drawn at random from `choices`.

    Blank entries are mixed into the pool before sampling and come back as
    NaN, so roughly ``blank_ratio / (blank_ratio + 1)`` of the result is
    missing.

    Parameters
    ----------
    choices : sequence
        Candidate values (list, array, Series, ...). Not modified.
    n : int
        Number of values to draw, with replacement.
    blank_ratio : int or float, default 2
        Number of blank entries added to the pool per entry in `choices`.
        Fractional ratios are honoured (rounded to a whole number of blanks);
        zero or negative adds no blanks.
    seed : int, default 0
        Seed for the random generator; the same seed gives the same draw.

    Returns
    -------
    pandas.Series
        Series of length `n` with blank draws replaced by NaN.
    """
    # Copy into a list so any sequence type works and the caller's object
    # is never touched.
    pool = list(choices)

    if blank_ratio > 0:
        n_blanks = int(round(blank_ratio * len(pool)))
        pool += n_blanks * ['']

    # A private RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.choice, without resetting the global RNG.
    rng = np.random.RandomState(seed)
    values = pd.Series(rng.choice(pool, n)).replace({'': np.nan})

    return values

In [5]:
# Create dataframe with mock data
cols_drop = ['Publication Date', 'Pub Date Minus Incident Date', 'Previous Tags',
             'Needs Review', 'D4G Notes', 'CCMF Notes', 'Follow-Up Action']
# sort_values already returns a new frame, so rr itself is left untouched
rr_mock = (rr
           .sort_values('Incident ID')
           .reset_index(drop=True)
           .drop(columns=cols_drop)
          )

# For the few entries with multiple URLs listed, extract the first URL
rr_mock['Article URL'] = rr_mock['Article URL'].str.split(expand=True)[0]

# Empty placeholder column
rr_mock['Notes'] = np.nan

# Extra copies appended to a sub-community's choice list so that some
# values are drawn more often than others
extra_weight = {
    'Black/African/Caribbean': 10 * ['Ethiopia', 'Jamaica', 'Nigeria', 'Somalia'],
    'Indigenous': 5 * ['First Nations'],
    'East Asian': 10 * ['China'] + 5 * ['Japan', 'Taiwan', 'South Korea'],
    'South Asian': 10 * ['India', 'Pakistan'],
    'Southeast Asian': 10 * ['Indonesia', 'Philippines', 'Vietnam'],
    'Central Asian': 10 * ['Afghanistan', 'Iran'],
    'Middle Eastern': 10 * ['Israel', 'Palestine'] + 5 * ['Egypt', 'Iraq', 'Syria'],
    'Latin American': 10 * ['Mexico'] + 5 * ['Brazil', 'Ecuador', 'Venezuela'],
}

# Populate missing values with random selections from applicable categories
n = rr_mock.shape[0]
communities = categories.loc['Ethnic Community', 'Value'].values

# 'Ethnic Community' must be filled before any sub-community column, because
# the sub-community masking below reads it. The stable sort moves it to the
# front and keeps every other dimension in its original order.
dimensions = sorted(categories.index.unique(), key=lambda dim: dim != 'Ethnic Community')

for name in dimensions:
    choices = categories.loc[name, 'Value'].to_list() + extra_weight.get(name, [])

    if name in ('Incident Category', 'Ethnic Community'):
        # No blanks
        blank_ratio = 0
    elif name in communities:
        blank_ratio = 1
    else:
        # Set roughly 2/3 of the generated values to blanks
        blank_ratio = 2
    values = random_choice(choices, n, blank_ratio=blank_ratio)
    # Only gaps are filled; values already present in the cleaned data win
    rr_mock[name] = rr_mock[name].fillna(values)

    # If the column is a sub-community, set values to NaN if the corresponding community is
    # not selected
    if name in communities:
        # regex=False: match the community name literally rather than as a pattern
        idx = ~rr_mock['Ethnic Community'].fillna('').str.contains(name, regex=False)
        rr_mock.loc[idx, name] = np.nan

rr_mock.head(2)

Unnamed: 0,Incident ID,Article URL,Incident Description,Incident Date,Province,City or Region,Detailed Location,Incident Category,Gender of Victim(s),Name of Victim(s),...,East Asian,Indigenous,Latin American,Middle Eastern,South Asian,Southeast Asian,Identity-Based,Context,Type of Incident,Notes
0,1,https://globalnews.ca/news/3949365/b-c-woman-c...,B.C. woman caught on video delivering racist r...,2018-01-05,British Columbia,Burnaby,,Non-Criminal,,,...,,,,,,,,,Hate speech: Other,
1,2,https://www.cbc.ca/news/canada/windsor/graffit...,Windsor Muslims pen letter asking police why I...,2018-01-03,Ontario,Windsor,,Non-Criminal,Female,,...,,,,,,,Religious discrimination: Islamophobia,,Vandalism,


In [6]:
# Review dtypes and non-null counts of the populated mock dataframe
rr_mock.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Incident ID              1000 non-null   int64  
 1   Article URL              1000 non-null   object 
 2   Incident Description     999 non-null    object 
 3   Incident Date            997 non-null    object 
 4   Province                 941 non-null    object 
 5   City or Region           852 non-null    object 
 6   Detailed Location        26 non-null     object 
 7   Incident Category        1000 non-null   object 
 8   Gender of Victim(s)      349 non-null    object 
 9   Name of Victim(s)        0 non-null      float64
 10  Ethnic Community         1000 non-null   object 
 11  Black/African/Caribbean  165 non-null    object 
 12  Central Asian            50 non-null     object 
 13  East Asian               48 non-null     object 
 14  Indigenous               

In [7]:
# Print the value distribution (NaN included) of each categorical column:
# the two leading categorical fields plus columns 10-21 of rr_mock
cols = ['Incident Category', 'Gender of Victim(s)', *rr_mock.columns[10:22]]
for col in cols:
    print(f'---- {col} ------')
    print(rr_mock[col].value_counts(dropna=False))
    print()

---- Incident Category ------
Non-Criminal             705
Criminal: Unknown        161
Criminal: Charged         69
Criminal: Not charged     65
Name: Incident Category, dtype: int64

---- Gender of Victim(s) ------
NaN              651
Female, Male     104
Female            87
Other/Unknown     81
Male              77
Name: Gender of Victim(s), dtype: int64

---- Ethnic Community ------
Black/African/Caribbean                                245
Indigenous                                             220
South Asian                                             81
East Asian                                              78
Central Asian                                           54
Middle Eastern                                          51
Black/African/Caribbean, Indigenous                     49
Central Asian, Middle Eastern                           46
N/A                                                     43
Latin American                                          42
Southeast Asian   

## Save to CSV

In [8]:
def save_data(data, savefile, index=False):
    """Write `data` to `savefile` as CSV, announcing the destination first.

    The row index is written only when `index` is True.
    """
    message = 'Saving to {}'.format(savefile)
    print(message)
    data.to_csv(savefile, index=index)

In [9]:
# Write the mock dataset to disk only when the save_csv flag is enabled
if save_csv:
    save_data(rr_mock, 'mock_data.csv')

Saving to mock_data.csv
