In [1]:
# come up with the population divide by races for each neighborhood
# use neighborhood-separated articles
# for each neighborhood, turn all articles into a spaCy-processible list of documents
# for each list, extract out all people and run their last names with ethnicolr to predict races
# for each set of predictions, get percentage of races
# have journalism team go through U.S. Census data to see if the proportions of races match Census data

# QUESTION: how to verify that two names talked about in an article belong to different people/the same people?

# potential solution: for each article, only store the unique names; but is this possible? 
# each doc is an article, so we can extract out all "PERSON" entities and then keep only those which are unique
# we could then feed the last names of those unique people (the last names may not necessarily be unique) to ethnicolr

In [2]:
import spacy
import pandas as pd
import en_core_web_md

In [3]:
# load medium English model in case we need to work with vectors
nlp = en_core_web_md.load()

In [4]:
# data cleaning
# remove special characters and extra whitespace
def remove_specChar(df):

    spec_chars = ["!",'"',"#","%","&","'","(",")", "*","+",",",
                  "-",".","/",":",";","<", "=",">","?","@","[",
                  "\\","]","^","_", "`","{","|","}","~","–", 
                  "\xc2", "\xa0", "\x80", "\x9c", "\x99", "\x94", 
                  "\xad", "\xe2", "\x9d", "\n"]

    #for char in spec_chars:
    #    df['text'] = df['text'].str.strip()
    #    df['text'] = df['text'].str.replace(char, ' ')
        
    
    # access each column separately
    for i in range(len(df.index)):
        for col in df.columns:
            for char in spec_chars:
                try:
                    df.loc[i, col] = df.loc[i, col].str.strip()
                    df.loc[i, col] = df.loc[i, col].str.replace(char, ' ')
                except:
                    pass
    return df

In [5]:
# read in CSV of articles
df = pd.read_csv('Neighborhood_Separated_Articles/2014.csv')

df = remove_specChar(df)
print("shape: ", df.shape)

shape:  (500, 23)


In [6]:
articles = {'hyde_park': [], 'beacon_hill': [], 'south_boston': [], 'jamaica_plain': [], 'east_boston': [],
                'south_end': [], 'back_bay': [], 'north_end': [], 'west_roxbury': [], 'mission_hill': [],
                'harbor_islands': [], 'west_end': [], 'longwood_medical_area': [],
                'dorchester': [], 'roxbury': [], 'downtown': [], 'fenway': [], 'mattapan': [], 'brighton': [],
                'charlestown': [], 'roslindale': [], 'allston': []}
for sub_neighborhood in articles.keys():
    for i in range(df.shape[0]):
        if type(df.loc[i, sub_neighborhood]) == str:
            articles[sub_neighborhood].append(nlp(df.loc[i, sub_neighborhood]))
    print(sub_neighborhood + ' DONE')

hyde_park DONE
beacon_hill DONE
south_boston DONE
jamaica_plain DONE
east_boston DONE
south_end DONE
back_bay DONE
north_end DONE
west_roxbury DONE
mission_hill DONE
harbor_islands DONE
west_end DONE
longwood_medical_area DONE
dorchester DONE
roxbury DONE
downtown DONE
fenway DONE
mattapan DONE
brighton DONE
charlestown DONE
roslindale DONE
allston DONE


In [16]:
from ethnicolr import census_ln, pred_census_ln

In [10]:
for key in articles.keys():
    print(key + ' ' + str(len(articles[key])))

hyde_park 108
beacon_hill 43
south_boston 90
jamaica_plain 61
east_boston 23
south_end 51
back_bay 53
north_end 11
west_roxbury 54
mission_hill 32
harbor_islands 1
west_end 8
longwood_medical_area 0
dorchester 500
roxbury 218
downtown 55
fenway 16
mattapan 52
brighton 13
charlestown 6
roslindale 30
allston 12


In [57]:
people = {'hyde_park': [], 'beacon_hill': [], 'south_boston': [], 'jamaica_plain': [], 'east_boston': [],
                'south_end': [], 'back_bay': [], 'north_end': [], 'west_roxbury': [], 'mission_hill': [],
                'harbor_islands': [], 'west_end': [], 'longwood_medical_area': [],
                'dorchester': [], 'roxbury': [], 'downtown': [], 'fenway': [], 'mattapan': [], 'brighton': [],
                'charlestown': [], 'roslindale': [], 'allston': []}

for sub_neighborhood in articles.keys():
    for doc in articles[sub_neighborhood]:
        for ent in doc.ents:
            if ent.label_ == 'PERSON':
                name = ent[0:2]
                people[sub_neighborhood].append(name)

In [58]:
for sub_neighborhood in people.keys():
    list1 = people[sub_neighborhood]
    # insert the list to the set
    list_set = set(list1)
    # convert the set to the list
    unique_list = (list(list_set))
    people[sub_neighborhood] = unique_list

In [59]:
representation_proportions = {'hyde_park': [], 'beacon_hill': [], 'south_boston': [], 'jamaica_plain': [], 'east_boston': [],
                'south_end': [], 'back_bay': [], 'north_end': [], 'west_roxbury': [], 'mission_hill': [],
                'harbor_islands': [], 'west_end': [], 'longwood_medical_area': [],
                'dorchester': [], 'roxbury': [], 'downtown': [], 'fenway': [], 'mattapan': [], 'brighton': [],
                'charlestown': [], 'roslindale': [], 'allston': []}
for sub_neighborhood in people.keys():
    for i in range(len(people[sub_neighborhood])):
        if people[sub_neighborhood][i].text.strip() != '':
            temp = people[sub_neighborhood][i].text.split()
            print()
            if len(temp) > 1:
                people[sub_neighborhood][i] = temp[-1]
            else:
                people[sub_neighborhood][i] = temp[0]
    names_df = pd.DataFrame(people[sub_neighborhood])
    names_df.columns = ['name']
    representation_proportions[sub_neighborhood] = pred_census_ln(names_df, 'name', 2010)

ValueError: Length mismatch: Expected axis has 12 elements, new values have 1 elements

In [41]:
representation_proportions

'     '

In [60]:
names_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,,,,,,,,,,,,
1,C,a,r,t,e,r,,,,,,
2,B,u,r,k,e,,,,,,,
3,J,e,n,k,i,n,s,,,,,
4,S,h,a,w,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1170,V,i,ñ,o,l,y,,,,,,
1171,M,o,r,r,i,s,,,,,,
1172,H,e,r,n,a,n,d,e,z,,,
1173,S,h,a,w,,,,,,,,


In [55]:
people[sub_neighborhood][i].strip().split()

['Rosie']

In [63]:
# re-organize the data so that we have a way to retrieve original text
# like adding ID to the dataset to identify each article
# we should be able to find out the article a name comes from
# we should also be able to find out which neighborhood an article talks about

In [62]:
# If name has 'word', 'word', then take the first name
# keep sentence where name occurred, okay if multiple sentences
# look at sentence where the name was mentioned 
# and the words which were used
# end up with a dataset which has 'name' + 'sentence' + 'race'
# try to put ID of article in the dataset as well, next to the sentence
# for now, try to keep the row from which the name comes, or at least some form of ID

# if extra time, group sentences by associated race
# find most frequently used words for each race, maybe a word cloud or something