In [1]:
import spacy
import pandas as pd
import en_core_web_md
import ast

In [2]:
# load medium English model in case we need to work with vectors
nlp = en_core_web_md.load()

In [3]:
df = pd.read_csv('Neighborhood_Separated_Articles/2014.csv')

In [4]:
black_neighborhoods = ['dorchester', 'roxbury', 'mattapan', 'hyde_park']
white_neighborhoods = ['fenway', 'beacon_hill', 'downtown', 'south_boston', 'east_boston', 'back_bay', 'jamaica_plain',
                      'south_end', 'charlestown', 'brighton', 'allston', 'west_end', 'roslindale', 'north_end',
                      'mission_hill', 'harbor_islands', 'longwood_medical_area', 'west_roxbury']
df = df.fillna("('no article', 'no_id')")
df['dorchester'] = df['dorchester'].apply(ast.literal_eval)
df['roxbury'] = df['roxbury'].apply(ast.literal_eval)
df['mattapan'] = df['mattapan'].apply(ast.literal_eval)
df['hyde_park'] = df['hyde_park'].apply(ast.literal_eval)
df['fenway'] = df['fenway'].apply(ast.literal_eval)
df['beacon_hill'] = df['beacon_hill'].apply(ast.literal_eval)
df['downtown'] = df['downtown'].apply(ast.literal_eval)
df['south_boston'] = df['south_boston'].apply(ast.literal_eval)
df['east_boston'] = df['east_boston'].apply(ast.literal_eval)
df['back_bay'] = df['back_bay'].apply(ast.literal_eval)
df['jamaica_plain'] = df['jamaica_plain'].apply(ast.literal_eval)
df['south_end'] = df['south_end'].apply(ast.literal_eval)
df['charlestown'] = df['charlestown'].apply(ast.literal_eval)
df['brighton'] = df['brighton'].apply(ast.literal_eval)
df['allston'] = df['allston'].apply(ast.literal_eval)
df['west_end'] = df['west_end'].apply(ast.literal_eval)
df['roslindale'] = df['roslindale'].apply(ast.literal_eval)
df['north_end'] = df['north_end'].apply(ast.literal_eval)
df['mission_hill'] = df['mission_hill'].apply(ast.literal_eval)
df['harbor_islands'] = df['harbor_islands'].apply(ast.literal_eval)
df['longwood_medical_area'] = df['longwood_medical_area'].apply(ast.literal_eval)
df['west_roxbury'] = df['west_roxbury'].apply(ast.literal_eval)

In [5]:
spec_chars = ["!",'"',"#","%","&","'","(",")", "*","+",",",
                  "-",".","/",":",";","<", "=",">","?","@","[",
                  "\\","]","^","_", "`","{","|","}","~","–", 
                  "\xc2", "\xa0", "\x80", "\x9c", "\x99", "\x94", 
                  "\xad", "\xe2", "\x9d", "\n"]

df = df.drop(['Unnamed: 0'], axis=1)

#for char in spec_chars:
#    df['text'] = df['text'].str.strip()
#    df['text'] = df['text'].str.replace(char, ' ')
       
# access each column separately
for i in range(len(df.index)):
    for col in df.columns:
        for char in spec_chars:
            try:
                df.loc[i, col][0] = df.loc[i, col][0].str.strip()
                df.loc[i, col][0] = df.loc[i, col][0].str.replace(char, ' ')
            except:
                pass

In [6]:
df.shape

(500, 22)

In [7]:
articles = {'hyde_park': [], 'beacon_hill': [], 'south_boston': [], 'jamaica_plain': [], 'east_boston': [],
                'south_end': [], 'back_bay': [], 'north_end': [], 'west_roxbury': [], 'mission_hill': [],
                'harbor_islands': [], 'west_end': [], 'longwood_medical_area': [],
                'dorchester': [], 'roxbury': [], 'downtown': [], 'fenway': [], 'mattapan': [], 'brighton': [],
                'charlestown': [], 'roslindale': [], 'allston': []}
for sub_neighborhood in articles.keys():
    for i in range(df.shape[0]):
        if type(df.loc[i, sub_neighborhood]) == tuple:
            articles[sub_neighborhood].append((nlp(df.loc[i, sub_neighborhood][0]), df.loc[i, sub_neighborhood][1]))
    print(sub_neighborhood + ' DONE')

hyde_park DONE
beacon_hill DONE
south_boston DONE
jamaica_plain DONE
east_boston DONE
south_end DONE
back_bay DONE
north_end DONE
west_roxbury DONE
mission_hill DONE
harbor_islands DONE
west_end DONE
longwood_medical_area DONE
dorchester DONE
roxbury DONE
downtown DONE
fenway DONE
mattapan DONE
brighton DONE
charlestown DONE
roslindale DONE
allston DONE


In [8]:
articles['dorchester'][0][1]

'2014_5'

In [9]:
people = {'hyde_park': [], 'beacon_hill': [], 'south_boston': [], 'jamaica_plain': [], 'east_boston': [],
                'south_end': [], 'back_bay': [], 'north_end': [], 'west_roxbury': [], 'mission_hill': [],
                'harbor_islands': [], 'west_end': [], 'longwood_medical_area': [],
                'dorchester': [], 'roxbury': [], 'downtown': [], 'fenway': [], 'mattapan': [], 'brighton': [],
                'charlestown': [], 'roslindale': [], 'allston': []}

for sub_neighborhood in articles.keys():
    for (doc, article_id) in articles[sub_neighborhood]:
        for ent in doc.ents:
            if ent.label_ == 'PERSON':
                name = ent[0:2]
                # sentence = ent.sent
                people[sub_neighborhood].append((name, article_id))

In [10]:
for sub_neighborhood in people.keys():
    list1 = people[sub_neighborhood]
    # convert the list to the set
    list_set = set(list1)
    # convert the set to the list
    unique_list = (list(list_set))
    people[sub_neighborhood] = unique_list

In [11]:
representation_proportions = {'hyde_park': [], 'beacon_hill': [], 'south_boston': [], 'jamaica_plain': [], 'east_boston': [],
                'south_end': [], 'back_bay': [], 'north_end': [], 'west_roxbury': [], 'mission_hill': [],
                'harbor_islands': [], 'west_end': [], 'longwood_medical_area': [],
                'dorchester': [], 'roxbury': [], 'downtown': [], 'fenway': [], 'mattapan': [], 'brighton': [],
                'charlestown': [], 'roslindale': [], 'allston': []}
for sub_neighborhood in people.keys():
    for i in range(len(people[sub_neighborhood])):
        if people[sub_neighborhood][i][0].text.strip() != '':
            temp = people[sub_neighborhood][i][0].text.split()
            if len(temp) > 1:
                people[sub_neighborhood][i] = (temp[0], people[sub_neighborhood][i][1])
            else:
                people[sub_neighborhood][i] = (temp[0], people[sub_neighborhood][i][1])

In [12]:
people['dorchester'][0]

('Dan', '2014_216')

In [22]:
from ethnicolr import pred_wiki_ln

In [25]:
# temp = pd.DataFrame(people['fenway'], columns=['last_name', 'article_id'])
# temp = temp.drop(['article_id'], axis=1)
# temp['subneighborhood'] = 'fenway'
# temp.head()

In [26]:
final_df = pd.DataFrame(columns=['last_name', 'subneighborhood'])
subs = white_neighborhoods + black_neighborhoods
subs.remove('longwood_medical_area')
for col in subs:
    temp = pd.DataFrame(people[col], columns=['last_name', 'article_id'])
    temp = temp.drop(['article_id'], axis=1)
    temp['subneighborhood'] = col
    preds = pred_wiki_ln(temp, 'last_name')
    final_df = pd.concat([final_df, preds], axis=0)
    print(col + ' DONE')

fenway DONE
beacon_hill DONE
downtown DONE
south_boston DONE
east_boston DONE
back_bay DONE
jamaica_plain DONE
south_end DONE
charlestown DONE
brighton DONE
allston DONE
west_end DONE
roslindale DONE
north_end DONE
mission_hill DONE
harbor_islands DONE
west_roxbury DONE
dorchester DONE
roxbury DONE
mattapan DONE
hyde_park DONE


In [27]:
hen = final_df['race'].value_counts()
pd.DataFrame(hen)

Unnamed: 0,race
"GreaterEuropean,British",20493
"GreaterEuropean,WestEuropean,French",2068
"GreaterEuropean,WestEuropean,Italian",2054
"GreaterEuropean,WestEuropean,Hispanic",1487
"GreaterEuropean,Jewish",1300
"Asian,IndianSubContinent",1134
"Asian,GreaterEastAsian,EastAsian",947
"GreaterAfrican,Muslim",663
"GreaterEuropean,WestEuropean,Nordic",368
"GreaterEuropean,EastEuropean",357


In [28]:
groups = final_df.groupby(final_df.subneighborhood)
race_df = pd.DataFrame(columns=subs)
races = final_df.race.unique()
for col in subs:
    group = groups.get_group(col)
    race_df[col] = group.race.value_counts()

In [29]:
race_df = race_df.fillna(0.0)
race_df

Unnamed: 0,fenway,beacon_hill,downtown,south_boston,east_boston,back_bay,jamaica_plain,south_end,charlestown,brighton,...,west_end,roslindale,north_end,mission_hill,harbor_islands,west_roxbury,dorchester,roxbury,mattapan,hyde_park
"GreaterEuropean,British",232,620,655,1782,365,973,672,636,165.0,217.0,...,66.0,365,93.0,357.0,4.0,706,7599,2705,562,1513
"GreaterEuropean,WestEuropean,Italian",96,26,178,116,63,69,57,53,5.0,13.0,...,8.0,71,26.0,13.0,0.0,59,568,185,52,381
"GreaterEuropean,WestEuropean,Hispanic",74,15,82,81,225,18,66,28,12.0,18.0,...,6.0,49,4.0,7.0,7.0,20,542,116,26,77
"Asian,IndianSubContinent",59,65,30,65,6,66,52,20,5.0,7.0,...,4.0,20,1.0,30.0,0.0,40,328,169,29,115
"Asian,GreaterEastAsian,EastAsian",54,20,52,101,16,15,21,18,3.0,9.0,...,4.0,6,13.0,2.0,0.0,19,320,102,23,144
"GreaterEuropean,Jewish",38,65,63,138,19,27,37,28,3.0,7.0,...,2.0,33,17.0,12.0,0.0,43,370,220,43,118
"GreaterEuropean,WestEuropean,French",20,34,67,150,34,74,77,129,8.0,5.0,...,9.0,29,10.0,16.0,0.0,49,925,204,62,149
"GreaterEuropean,EastEuropean",14,3,25,43,6,7,12,7,4.0,2.0,...,0.0,8,1.0,5.0,0.0,15,147,41,3,11
"GreaterAfrican,Muslim",9,16,27,39,19,44,24,19,11.0,2.0,...,3.0,11,5.0,52.0,0.0,17,139,187,4,28
"GreaterEuropean,WestEuropean,Nordic",5,5,13,21,4,68,9,15,0.0,0.0,...,3.0,2,6.0,1.0,0.0,10,147,34,11,14


In [30]:
for i in range(len(subs)):
    tot = race_df[subs[i]].sum()
    for j in race_df.index:
        race_df.loc[j, [subs[i]]] = (race_df.loc[j, [subs[i]]] / tot) * 100

In [31]:
race_df

Unnamed: 0,fenway,beacon_hill,downtown,south_boston,east_boston,back_bay,jamaica_plain,south_end,charlestown,brighton,...,west_end,roslindale,north_end,mission_hill,harbor_islands,west_roxbury,dorchester,roxbury,mattapan,hyde_park
"GreaterEuropean,British",38.347107,70.056497,53.036437,68.538462,47.402597,70.609579,63.757116,65.56701,74.660633,75.347222,...,56.896552,60.231023,51.666667,69.590643,36.363636,71.385238,67.128975,66.494592,68.038741,58.169935
"GreaterEuropean,WestEuropean,Italian",15.867769,2.937853,14.412955,4.461538,8.181818,5.007257,5.40797,5.463918,2.262443,4.513889,...,6.896552,11.716172,14.444444,2.534113,0.0,5.965622,5.017668,4.547689,6.2954,14.648212
"GreaterEuropean,WestEuropean,Hispanic",12.231405,1.694915,6.639676,3.115385,29.220779,1.306241,6.26186,2.886598,5.429864,6.25,...,5.172414,8.085809,2.222222,1.364522,63.636364,2.022245,4.787986,2.851524,3.1477,2.9604
"Asian,IndianSubContinent",9.752066,7.344633,2.42915,2.5,0.779221,4.78955,4.933586,2.061856,2.262443,2.430556,...,3.448276,3.30033,0.555556,5.847953,0.0,4.044489,2.897527,4.154376,3.510896,4.421376
"Asian,GreaterEastAsian,EastAsian",8.92562,2.259887,4.210526,3.884615,2.077922,1.088534,1.99241,1.85567,1.357466,3.125,...,3.448276,0.990099,7.222222,0.389864,0.0,1.921132,2.826855,2.507375,2.784504,5.536332
"GreaterEuropean,Jewish",6.280992,7.344633,5.101215,5.307692,2.467532,1.959361,3.510436,2.886598,1.357466,2.430556,...,1.724138,5.445545,9.444444,2.339181,0.0,4.347826,3.268551,5.408063,5.205811,4.536717
"GreaterEuropean,WestEuropean,French",3.305785,3.841808,5.425101,5.769231,4.415584,5.370102,7.305503,13.298969,3.61991,1.736111,...,7.758621,4.785479,5.555556,3.118908,0.0,4.954499,8.171378,5.014749,7.506053,5.728566
"GreaterEuropean,EastEuropean",2.31405,0.338983,2.024291,1.653846,0.779221,0.507983,1.13852,0.721649,1.809955,0.694444,...,0.0,1.320132,0.555556,0.974659,0.0,1.516684,1.298587,1.007866,0.363196,0.422914
"GreaterAfrican,Muslim",1.487603,1.80791,2.186235,1.5,2.467532,3.193033,2.27704,1.958763,4.977376,0.694444,...,2.586207,1.815182,2.777778,10.136452,0.0,1.718908,1.227915,4.596853,0.484262,1.076509
"GreaterEuropean,WestEuropean,Nordic",0.826446,0.564972,1.052632,0.807692,0.519481,4.934688,0.85389,1.546392,0.0,0.0,...,2.586207,0.330033,3.333333,0.194932,0.0,1.011122,1.298587,0.835792,1.331719,0.538255


In [32]:
agg_df = pd.DataFrame(columns=subs, index=['white', 'black', 'api', 'hispanic'])

In [33]:
agg_df = agg_df.fillna(0.0)

In [34]:
pred_races = race_df.index
for sub in subs:
    for each in pred_races:
        temp = race_df.loc[each][sub]
        if each == 'GreaterEuropean,WestEuropean,Hispanic':
            agg_df.loc['hispanic'][sub] += temp
        elif each == 'Asian,GreaterEastAsian,EastAsian' or each == 'Asian,IndianSubContinent' or each == 'Asian,GreaterEastAsian,Japanese':
            agg_df.loc['api'][sub] += temp
        elif each == 'GreaterAfrican,Muslim' or each == 'GreaterAfrican,Africans':
            agg_df.loc['black'][sub] += temp
        else:
            agg_df.loc['white'][sub] += temp

In [35]:
agg_df.to_csv('Wiki_Last_Name_Race_Representation_by_Neighborhood/2014_representation.csv')

In [36]:
agg_df

Unnamed: 0,fenway,beacon_hill,downtown,south_boston,east_boston,back_bay,jamaica_plain,south_end,charlestown,brighton,...,west_end,roslindale,north_end,mission_hill,harbor_islands,west_roxbury,dorchester,roxbury,mattapan,hyde_park
white,67.107438,85.310734,82.186235,88.076923,64.155844,88.896952,82.827324,90.309278,83.710407,86.805556,...,76.724138,84.158416,85.555556,78.947368,36.363636,89.383215,86.696113,83.652901,88.861985,84.467512
black,1.818182,2.711864,3.238866,2.115385,3.116883,3.628447,3.13093,2.783505,6.78733,1.388889,...,10.344828,3.135314,4.444444,10.136452,0.0,2.325581,2.040636,5.703048,0.847458,1.499423
api,18.842975,10.282486,7.935223,6.692308,3.506494,6.16836,7.779886,4.020619,4.072398,5.555556,...,7.758621,4.620462,7.777778,9.551657,0.0,6.268959,6.475265,7.792527,7.142857,11.072664
hispanic,12.231405,1.694915,6.639676,3.115385,29.220779,1.306241,6.26186,2.886598,5.429864,6.25,...,5.172414,8.085809,2.222222,1.364522,63.636364,2.022245,4.787986,2.851524,3.1477,2.9604
