In [1]:
import spacy
import pandas as pd
import en_core_web_md
import ast

In [2]:
# load medium English model in case we need to work with vectors
nlp = en_core_web_md.load()

In [3]:
df = pd.read_csv('Neighborhood_Separated_Articles/2018.csv')

In [4]:
black_neighborhoods = ['dorchester', 'roxbury', 'mattapan', 'hyde_park']
white_neighborhoods = ['fenway', 'beacon_hill', 'downtown', 'south_boston', 'east_boston', 'back_bay', 'jamaica_plain',
                      'south_end', 'charlestown', 'brighton', 'allston', 'west_end', 'roslindale', 'north_end',
                      'mission_hill', 'harbor_islands', 'longwood_medical_area', 'west_roxbury']
df = df.fillna("('no article', 'no_id')")
df['dorchester'] = df['dorchester'].apply(ast.literal_eval)
df['roxbury'] = df['roxbury'].apply(ast.literal_eval)
df['mattapan'] = df['mattapan'].apply(ast.literal_eval)
df['hyde_park'] = df['hyde_park'].apply(ast.literal_eval)
df['fenway'] = df['fenway'].apply(ast.literal_eval)
df['beacon_hill'] = df['beacon_hill'].apply(ast.literal_eval)
df['downtown'] = df['downtown'].apply(ast.literal_eval)
df['south_boston'] = df['south_boston'].apply(ast.literal_eval)
df['east_boston'] = df['east_boston'].apply(ast.literal_eval)
df['back_bay'] = df['back_bay'].apply(ast.literal_eval)
df['jamaica_plain'] = df['jamaica_plain'].apply(ast.literal_eval)
df['south_end'] = df['south_end'].apply(ast.literal_eval)
df['charlestown'] = df['charlestown'].apply(ast.literal_eval)
df['brighton'] = df['brighton'].apply(ast.literal_eval)
df['allston'] = df['allston'].apply(ast.literal_eval)
df['west_end'] = df['west_end'].apply(ast.literal_eval)
df['roslindale'] = df['roslindale'].apply(ast.literal_eval)
df['north_end'] = df['north_end'].apply(ast.literal_eval)
df['mission_hill'] = df['mission_hill'].apply(ast.literal_eval)
df['harbor_islands'] = df['harbor_islands'].apply(ast.literal_eval)
df['longwood_medical_area'] = df['longwood_medical_area'].apply(ast.literal_eval)
df['west_roxbury'] = df['west_roxbury'].apply(ast.literal_eval)

In [5]:
spec_chars = ["!",'"',"#","%","&","'","(",")", "*","+",",",
                  "-",".","/",":",";","<", "=",">","?","@","[",
                  "\\","]","^","_", "`","{","|","}","~","–", 
                  "\xc2", "\xa0", "\x80", "\x9c", "\x99", "\x94", 
                  "\xad", "\xe2", "\x9d", "\n"]

df = df.drop(['Unnamed: 0'], axis=1)

#for char in spec_chars:
#    df['text'] = df['text'].str.strip()
#    df['text'] = df['text'].str.replace(char, ' ')
       
# access each column separately
for i in range(len(df.index)):
    for col in df.columns:
        for char in spec_chars:
            try:
                df.loc[i, col][0] = df.loc[i, col][0].str.strip()
                df.loc[i, col][0] = df.loc[i, col][0].str.replace(char, ' ')
            except:
                pass

In [6]:
df.shape

(1783, 22)

In [7]:
articles = {'hyde_park': [], 'beacon_hill': [], 'south_boston': [], 'jamaica_plain': [], 'east_boston': [],
                'south_end': [], 'back_bay': [], 'north_end': [], 'west_roxbury': [], 'mission_hill': [],
                'harbor_islands': [], 'west_end': [], 'longwood_medical_area': [],
                'dorchester': [], 'roxbury': [], 'downtown': [], 'fenway': [], 'mattapan': [], 'brighton': [],
                'charlestown': [], 'roslindale': [], 'allston': []}
for sub_neighborhood in articles.keys():
    for i in range(df.shape[0]):
        if type(df.loc[i, sub_neighborhood]) == tuple:
            articles[sub_neighborhood].append((nlp(df.loc[i, sub_neighborhood][0]), df.loc[i, sub_neighborhood][1]))
    print(sub_neighborhood + ' DONE')

hyde_park DONE
beacon_hill DONE
south_boston DONE
jamaica_plain DONE
east_boston DONE
south_end DONE
back_bay DONE
north_end DONE
west_roxbury DONE
mission_hill DONE
harbor_islands DONE
west_end DONE
longwood_medical_area DONE
dorchester DONE
roxbury DONE
downtown DONE
fenway DONE
mattapan DONE
brighton DONE
charlestown DONE
roslindale DONE
allston DONE


In [8]:
articles['dorchester'][0][1]

'2018_3'

In [9]:
people = {'hyde_park': [], 'beacon_hill': [], 'south_boston': [], 'jamaica_plain': [], 'east_boston': [],
                'south_end': [], 'back_bay': [], 'north_end': [], 'west_roxbury': [], 'mission_hill': [],
                'harbor_islands': [], 'west_end': [], 'longwood_medical_area': [],
                'dorchester': [], 'roxbury': [], 'downtown': [], 'fenway': [], 'mattapan': [], 'brighton': [],
                'charlestown': [], 'roslindale': [], 'allston': []}

for sub_neighborhood in articles.keys():
    for (doc, article_id) in articles[sub_neighborhood]:
        for ent in doc.ents:
            if ent.label_ == 'PERSON':
                name = ent[0:2]
                # sentence = ent.sent
                people[sub_neighborhood].append((name, article_id))

In [10]:
for sub_neighborhood in people.keys():
    list1 = people[sub_neighborhood]
    # convert the list to the set
    list_set = set(list1)
    # convert the set to the list
    unique_list = (list(list_set))
    people[sub_neighborhood] = unique_list

In [11]:
representation_proportions = {'hyde_park': [], 'beacon_hill': [], 'south_boston': [], 'jamaica_plain': [], 'east_boston': [],
                'south_end': [], 'back_bay': [], 'north_end': [], 'west_roxbury': [], 'mission_hill': [],
                'harbor_islands': [], 'west_end': [], 'longwood_medical_area': [],
                'dorchester': [], 'roxbury': [], 'downtown': [], 'fenway': [], 'mattapan': [], 'brighton': [],
                'charlestown': [], 'roslindale': [], 'allston': []}
for sub_neighborhood in people.keys():
    for i in range(len(people[sub_neighborhood])):
        if people[sub_neighborhood][i][0].text.strip() != '':
            temp = people[sub_neighborhood][i][0].text.split()
            if len(temp) > 1:
                people[sub_neighborhood][i] = (temp[0], people[sub_neighborhood][i][1])
            else:
                people[sub_neighborhood][i] = (temp[0], people[sub_neighborhood][i][1])

In [12]:
people['dorchester'][0]

('Johnson', '2018_422')

In [13]:
from ethnicolr import pred_wiki_ln

AttributeError: module 'keras.utils.generic_utils' has no attribute 'populate_dict_with_module_objects'

In [None]:
# temp = pd.DataFrame(people['fenway'], columns=['last_name', 'article_id'])
# temp = temp.drop(['article_id'], axis=1)
# temp['subneighborhood'] = 'fenway'
# temp.head()

In [None]:
final_df = pd.DataFrame(columns=['last_name', 'subneighborhood'])
subs = white_neighborhoods + black_neighborhoods
#subs.remove('longwood_medical_area')
for col in subs:
    temp = pd.DataFrame(people[col], columns=['last_name', 'article_id'])
    temp = temp.drop(['article_id'], axis=1)
    temp['subneighborhood'] = col
    preds = pred_census_ln(temp, 'last_name', 2010)
    final_df = pd.concat([final_df, preds], axis=0)
    print(col + ' DONE')

In [None]:
final_df = final_df.drop(['api', 'black', 'hispanic', 'white'], axis=1)

In [None]:
hen = final_df['race'].value_counts()
pd.DataFrame(hen)

In [None]:
groups = final_df.groupby(final_df.subneighborhood)
race_df = pd.DataFrame(columns=subs)
races = ['api', 'black', 'hispanic', 'white']
for col in subs:
    group = groups.get_group(col)
    race_df[col] = group.race.value_counts()

In [None]:
race_df = race_df.fillna(0.0)
race_df

In [None]:
for i in range(len(subs)):
    tot = race_df[subs[i]].sum()
    for j in race_df.index:
        race_df.loc[j, [subs[i]]] = (race_df.loc[j, [subs[i]]] / tot) * 100

In [None]:
race_df

In [None]:
race_df.to_csv('Wiki_Last_Name_Race_Representation_by_Neighborhood/2018_representation.csv')