# Race Patterns Analysis

Done by: Merna Alghannam, Team 1

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
# import statsmodels.api as sm

In [None]:
# Absolute path of the local project checkout. The CSV paths in the cells below are
# built by plain string concatenation onto this value, so the trailing '/' matters.
# Machine-specific: adjust before running on another computer.
ROOT_FOLDER = '/Users/merna/OneDrive/Desktop/ds-state-ma-housing-comitt/'

In [None]:
# geo_df holds every row of the eviction CSV; the file's first column becomes the row index.
# NOTE(review): ROOT_FOLDER already ends with '/', so this path contains '//' — harmless
# on common filesystems, but worth tidying together with ROOT_FOLDER.
eviction_csv_path = ROOT_FOLDER + "/Eviction_CaseS/eviction_with_geographic.csv"
geo_df = pd.read_csv(eviction_csv_path, index_col=0)

In [None]:
# Add one column per racial/ethnic group holding that group's share of the municipality's
# total population, expressed as a percentage (group count / total population * 100).
total_population = geo_df['Estimate!!Total:']

# "other": some-other-race-alone plus two-or-more-races, minus the
# "two races including some other race" sub-count.
other_count = (
    geo_df['Estimate!!Total:!!Not Hispanic or Latino:!!Some other race alone']
    + geo_df['Estimate!!Total:!!Not Hispanic or Latino:!!Two or more races:']
    - geo_df['Estimate!!Total:!!Not Hispanic or Latino:!!Two or more races:!!Two races including Some other race']
)
geo_df['other'] = other_count / total_population * 100

# New percentage column -> census count column it is derived from.
# Insertion order is kept so the new columns are appended in the same order as before.
share_sources = {
    'White': 'Estimate!!Total:!!Not Hispanic or Latino:!!White alone',
    'Black or African American': 'Estimate!!Total:!!Not Hispanic or Latino:!!Black or African American alone',
    'Asian': 'Estimate!!Total:!!Not Hispanic or Latino:!!Asian alone',
    'Native Hawaiian and Other Pacific Islander': 'Estimate!!Total:!!Not Hispanic or Latino:!!Native Hawaiian and Other Pacific Islander alone',
    'American Indian and Alaska Native': 'Estimate!!Total:!!Not Hispanic or Latino:!!American Indian and Alaska Native alone',
    'Hispanic or Latino': 'Estimate!!Total:!!Hispanic or Latino:',
}
for share_column, count_column in share_sources.items():
    geo_df[share_column] = geo_df[count_column] / total_population * 100

# Combined "minority" percentage: sum of four of the group percentages above.
# NOTE(review): 'Asian' and 'other' are not part of this sum — confirm that matches
# the intended definition of minority / POC.
geo_df['Minority'] = (
    geo_df['Hispanic or Latino']
    + geo_df['Black or African American']
    + geo_df['Native Hawaiian and Other Pacific Islander']
    + geo_df['American Indian and Alaska Native']
)

In [None]:
# Columns kept in the per-bracket summary tables: the municipality and its eviction
# figures first, followed by the racial/ethnic percentage columns.
identifying_columns = ['Muni', "Eviction Numbers", 'Evictions per Rented Households']
race_share_columns = [
    'White',
    'Black or African American',
    'Asian',
    'Native Hawaiian and Other Pacific Islander',
    'American Indian and Alaska Native',
    'other',
    'Hispanic or Latino',
]
race_profile_columns = identifying_columns + race_share_columns

In [None]:
# One CSV per population bracket, each listing the municipalities in that bracket.
# skiprows=1 discards the first line of every file, so the following line is used as the header.
# NOTE(review): later cells address the single column by a municipality name
# ('Middleton', 'Everett', 'Fall River', 'Worcester') and add that name back by hand,
# which suggests the header row is really a data row — confirm the file layout.
bracket_csv_names = (
    'population_brackets/1kTO10K.csv',
    'population_brackets/10kTO50K.csv',
    'population_brackets/50kTO90K.csv',
    'population_brackets/90kTO200K.csv',
)
one_to_ten_k, ten_to_fifty_k, fifty_to_90_k, ninty_to_200_k = [
    pd.read_csv(ROOT_FOLDER+csv_name, skiprows=1) for csv_name in bracket_csv_names
]

In [None]:
# Bracket 1 (population 1k-10k): rows of geo_df whose municipality is listed in the bracket file.
# The bracket column is addressed by the name 'Middleton', and that name is added back to the
# membership list so Middleton itself is not dropped (presumably the CSV's first municipality
# was consumed as the header — confirm against the file).
# A plain list replaces Series.append, which was removed in pandas 2.0.
bracket_1_munis = one_to_ten_k['Middleton'].tolist() + ['Middleton']
edu_one_to_ten_k = geo_df[geo_df["Muni"].isin(bracket_1_munis)]

# Two municipalities with the HIGHEST eviction rate per rented household (sorted descending).
# NOTE(review): the variable name says "low"/"three", but it holds the two highest-rate rows.
# NOTE(review): this filter requires >= 20 evictions while the companion filter below and the
# other brackets use >= 10 — confirm the asymmetry is intended.
low_three_1 = (
    edu_one_to_ten_k[edu_one_to_ten_k['Eviction Numbers'] >= 20]
    .sort_values('Evictions per Rented Households', ascending=False)
    .head(2)
    .filter(race_profile_columns, axis=1)
)

# Two municipalities with the LOWEST eviction rate per rented household (sorted ascending),
# restricted to municipalities with at least 10 evictions.
top_three_1 = (
    edu_one_to_ten_k[edu_one_to_ten_k['Eviction Numbers'] >= 10]
    .sort_values('Evictions per Rented Households')
    .head(2)
    .filter(race_profile_columns, axis=1)
)

# Stack both pairs into one table; the lowest-rate pair is reversed so the combined
# table runs from the highest rate to the lowest.
race_1 = pd.concat([low_three_1, top_three_1.reindex(index=top_three_1.index[::-1])])

In [None]:
# Display the bracket-1 summary table (a bare expression is rendered as the cell output).
race_1

In [None]:
# Bracket 2 (population 10k-50k): rows of geo_df whose municipality is listed in the bracket file.
# The bracket column is addressed by the name 'Everett', and that name is added back to the
# membership list so Everett itself is not dropped (presumably the CSV's first municipality
# was consumed as the header — confirm against the file).
# A plain list replaces Series.append, which was removed in pandas 2.0.
bracket_2_munis = ten_to_fifty_k['Everett'].tolist() + ['Everett']
edu_ten_to_fifty_k = geo_df[geo_df["Muni"].isin(bracket_2_munis)]

# Two municipalities with the HIGHEST eviction rate per rented household (sorted descending),
# restricted to municipalities with at least 10 evictions.
# NOTE(review): the variable name says "low"/"three", but it holds the two highest-rate rows.
low_three_2 = (
    edu_ten_to_fifty_k[edu_ten_to_fifty_k['Eviction Numbers'] >= 10]
    .sort_values('Evictions per Rented Households', ascending=False)
    .head(2)
    .filter(race_profile_columns, axis=1)
)

# Two municipalities with the LOWEST eviction rate per rented household (sorted ascending).
top_three_2 = (
    edu_ten_to_fifty_k[edu_ten_to_fifty_k['Eviction Numbers'] >= 10]
    .sort_values('Evictions per Rented Households')
    .head(2)
    .filter(race_profile_columns, axis=1)
)

# Stack both pairs into one table; the lowest-rate pair is reversed so the combined
# table runs from the highest rate to the lowest.
race_2 = pd.concat([low_three_2, top_three_2.reindex(index=top_three_2.index[::-1])])

In [None]:
# Display the bracket-2 summary table (a bare expression is rendered as the cell output).
race_2

In [None]:
# Bracket 3 (population 50k-90k): rows of geo_df whose municipality is listed in the bracket file.
# The bracket column is addressed by the name 'Fall River', and that name is added back to the
# membership list so Fall River itself is not dropped (presumably the CSV's first municipality
# was consumed as the header — confirm against the file).
# A plain list replaces Series.append, which was removed in pandas 2.0.
bracket_3_munis = fifty_to_90_k['Fall River'].tolist() + ['Fall River']
edu_fifty_to_90_k = geo_df[geo_df["Muni"].isin(bracket_3_munis)]

# Two municipalities with the HIGHEST eviction rate per rented household (sorted descending),
# restricted to municipalities with at least 10 evictions.
# NOTE(review): the variable name says "low"/"three", but it holds the two highest-rate rows.
low_three_3 = (
    edu_fifty_to_90_k[edu_fifty_to_90_k['Eviction Numbers'] >= 10]
    .sort_values('Evictions per Rented Households', ascending=False)
    .head(2)
    .filter(race_profile_columns, axis=1)
)

# Two municipalities with the LOWEST eviction rate per rented household (sorted ascending).
top_three_3 = (
    edu_fifty_to_90_k[edu_fifty_to_90_k['Eviction Numbers'] >= 10]
    .sort_values('Evictions per Rented Households')
    .head(2)
    .filter(race_profile_columns, axis=1)
)

# Stack both pairs into one table; the lowest-rate pair is reversed so the combined
# table runs from the highest rate to the lowest.
race_3 = pd.concat([low_three_3, top_three_3.reindex(index=top_three_3.index[::-1])])

In [None]:
# Display the bracket-3 summary table (a bare expression is rendered as the cell output).
race_3

In [None]:
# Bracket 4 (population 90k-200k): rows of geo_df whose municipality is listed in the bracket file.
# The bracket column is addressed by the name 'Worcester', and that name is added back to the
# membership list so Worcester itself is not dropped (presumably the CSV's first municipality
# was consumed as the header — confirm against the file).
# A plain list replaces Series.append, which was removed in pandas 2.0.
bracket_4_munis = ninty_to_200_k['Worcester'].tolist() + ['Worcester']
edu_ninty_to_200_k = geo_df[geo_df["Muni"].isin(bracket_4_munis)]

# Every municipality of the bracket is kept here (no top/bottom selection),
# ordered from the highest to the lowest eviction rate per rented household.
race_4 = (
    edu_ninty_to_200_k
    .sort_values('Evictions per Rented Households', ascending=False)
    .filter(race_profile_columns, axis=1)
)

In [None]:
# Display the bracket-4 table (a bare expression is rendered as the cell output).
race_4

# Plot minority distribution in relation to eviction rate

In [None]:
# One scatter plot of minority share (y) against eviction rate per rented household (x),
# with one point series per population bracket and text labels on selected municipalities:
# the highest/lowest-rate municipalities of brackets 1-3 (default text style) and every
# gateway city found in brackets 2-4 (purple text).
plt.figure(figsize=(20,10))
gateway_cities = ["Attleboro", "Brockton", "Chelsea", "Chicopee", "Everett", "Fall River", "Fitchburg", "Haverhill", "Holyoke",
                  "Lawrence", "Leominster", "Lowell", "Lynn", "Malden",
                  "Methuen", "New Bedford", "Peabody", "Pittsfield", "Quincy", "Revere", "Salem", "Springfield", "Taunton", "Westfield", "Worcester"]

rate_column = 'Evictions per Rented Households'


def _annotate_cities(bracket_df, cities, **text_kwargs):
    """Write municipality names next to their points on the current axes.

    Only names from *cities* that occur in ``bracket_df["Muni"]`` are labelled.
    Coordinates are handed to matplotlib as scalars rather than one-element
    Series, so no implicit Series-to-float conversion is needed.
    Extra keyword arguments are forwarded to ``plt.annotate``.
    """
    labelled = bracket_df[bracket_df["Muni"].isin(cities)]
    for muni, rate, minority_share in zip(labelled["Muni"], labelled[rate_column], labelled["Minority"]):
        plt.annotate(muni, (rate, minority_share), **text_kwargs)


# Bracket 1: all municipalities, then labels for its highest/lowest-rate municipalities
plt.plot(edu_one_to_ten_k[rate_column], edu_one_to_ten_k["Minority"], 'o',
         label="Population: 1k - 10k", markersize=3)
_annotate_cities(edu_one_to_ten_k, race_1["Muni"].tolist())

# Bracket 2: all municipalities, its highest/lowest-rate municipalities, then its gateway cities
# (legend label previously read "Population: Population: 10k - 50k")
plt.plot(edu_ten_to_fifty_k[rate_column], edu_ten_to_fifty_k["Minority"], 'o', label="Population: 10k - 50k")
_annotate_cities(edu_ten_to_fifty_k, race_2["Muni"].tolist())
_annotate_cities(edu_ten_to_fifty_k, gateway_cities, color='Purple', size=10)

# Bracket 3: all municipalities, its highest/lowest-rate municipalities, then its gateway cities
plt.plot(edu_fifty_to_90_k[rate_column], edu_fifty_to_90_k["Minority"], 'o', label="Population: 50k - 90k", markersize=10)
_annotate_cities(edu_fifty_to_90_k, race_3["Muni"].tolist())
_annotate_cities(edu_fifty_to_90_k, gateway_cities, color='Purple', size=10)

# Bracket 4: all municipalities, then its gateway cities
plt.plot(edu_ninty_to_200_k[rate_column], edu_ninty_to_200_k["Minority"], 'o', label="Population: 90k - 200k", markersize=15)
_annotate_cities(edu_ninty_to_200_k, gateway_cities, color='Purple', size=10)

plt.title('% of Minorities in Cities from Lowest to Highest Evictions per Rented Households', color='black')
plt.xlabel("Rate of Evictions per Rented Households")
plt.ylabel("% of Minorities")
# Empty series whose only purpose is to add an explanatory line to the legend
plt.plot([], [], ' ', label="Purple Text: Gateway City")
plt.legend(numpoints=1)
plt.show()

# Statewide Race vs. Eviction Rate patterns

In [None]:

# For each of four racial/ethnic groups, draw one figure: a scatter of the group's
# percentage in every municipality against the eviction rate per rented household,
# overlaid with a least-squares best-fit line (eviction rate as the single regressor).
eviction_rate = geo_df['Evictions per Rented Households'].values
# scikit-learn expects a 2-D feature matrix, hence the single-column reshape
eviction_rate_2d = eviction_rate.reshape(-1, 1)

# (geo_df column, plot colour, figure title, y-axis label) — drawn in this order
statewide_panels = (
    ("Black or African American", 'blue', '% Black/AA in all Municipality', "% Black/AA"),
    ("Hispanic or Latino", 'orange', '% Hispanic/LatinX in all Municipality', "% of Hispanic/LatinX"),
    ("Asian", 'purple', '% Asian in all Municipality', "% Asian"),
    ("White", 'red', '% White in all Municipality', "% White"),
)

for race_column, shade, heading, y_caption in statewide_panels:
    race_share = geo_df[race_column].values

    # one dot per municipality
    plt.plot(eviction_rate, race_share, 'o', markersize=3, color=shade)

    # fit the group's percentage against the eviction rate and evaluate the fit at every observed rate
    rate_race = linear_model.LinearRegression().fit(eviction_rate_2d, race_share)
    regression_line = rate_race.predict(eviction_rate_2d)
    plt.plot(eviction_rate, regression_line, label='Best Fit Line', color=shade, linewidth=1)

    plt.title(heading, color='black')
    plt.xlabel("Rate of Evictions per Rented Households")
    plt.ylabel(y_caption)
    plt.legend(numpoints=1)

    plt.show()

# Race distribution in gateway cities vs. eviction rate 

In [None]:
# For each of four racial/ethnic groups, draw one scatter figure restricted to the
# gateway cities: the group's percentage against the eviction rate per rented household.
gateway_cities = ["Attleboro", "Brockton", "Chelsea", "Chicopee", "Everett", "Fall River", "Fitchburg", "Haverhill", "Holyoke",
                  "Lawrence", "Leominster", "Lowell", "Lynn", "Malden",
                  "Methuen", "New Bedford", "Peabody", "Pittsfield", "Quincy", "Revere", "Salem", "Springfield", "Taunton", "Westfield", "Worcester"]

# Rows of geo_df that belong to a gateway city; shared by all four figures
gateway_rows = geo_df[geo_df['Muni'].isin(gateway_cities)]
gateway_rate = gateway_rows['Evictions per Rented Households']

# (geo_df column, plot colour, figure title, y-axis label) — drawn in this order
gateway_panels = (
    ("Black or African American", 'blue', '% Black/AA in Gateway Cities', "% Black/AA"),
    ("Hispanic or Latino", 'orange', '% Hispanic/LatinX in Gateway Cities', "% of Hispanic/LatinX"),
    ("Asian", 'purple', '% Asian in Gateway Cities', "% Asian"),
    ("White", 'red', '% White in Gateway Cities', "% White"),
)

for race_column, shade, heading, y_caption in gateway_panels:
    # one dot per gateway city
    plt.plot(gateway_rate, gateway_rows[race_column], 'o', markersize=3, color=shade)

    plt.title(heading, color='black')
    plt.xlabel("Rate of Evictions per Rented Households")
    plt.ylabel(y_caption)

    plt.show()

In [None]:
# Attempted to fit an OLS model (statsmodels) on population bracket 1; it had very low
# statistical significance, so the code is left commented out.
# X2 = sm.add_constant(edu_one_to_ten_k[["Hispanic or Latino", "Black or African American", "White", "Asian"]])

# est= sm.OLS(edu_one_to_ten_k['Evictions per Rented Households'].values, X2)
# reg2 = est.fit()

# reg2.summary()