In [189]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load the hate-crime table; every feature/target below is sliced from this frame.
df = pd.read_csv("hatecrime.csv")

# Single-feature design matrices. Selecting with a one-element column list keeps
# the result two-dimensional; the reshape pins it to one column per row.
X_Income = df[["gini_index"]].to_numpy().reshape(-1, 1)
X_white_pov = df[["share_white_poverty"]].to_numpy().reshape(-1, 1)
X_non_white = df[["share_non_white"]].to_numpy().reshape(-1, 1)

# State name next to each source's hate-crime rate (kept as DataFrames).
X_states_hatecrime_splc = df[["state", "hate_crimes_per_100k_splc"]]
X_states_hatecrime_fbi = df[["state", "avg_hatecrimes_per_100k_fbi"]]

# Two-feature design matrix: non-white share and white-poverty share.
X_non_white_white_pov = df[["share_non_white", "share_white_poverty"]].to_numpy()

# Regression target: the SPLC hate-crime rate column, as a pandas Series.
y_HateCrimes = df["hate_crimes_per_100k_splc"]


In [190]:

def preprocess(y):
    """Replace NaN entries of ``y`` with the median of its non-NaN values.

    Parameters
    ----------
    y : array-like of numbers
        A 1-D sequence / Series, or a 2-D array such as an ``(n, 1)`` feature
        column. The median is computed over all entries at once
        (``np.nanmedian`` without an axis), so a multi-column input is filled
        with a single global median, not per-column medians.

    Returns
    -------
    list
        One item per row of ``y``: scalars for 1-D input, 1-D row arrays for
        2-D input. Every row keeps its original shape, so the list converts
        cleanly to an array even when some rows contained NaN.

    Notes
    -----
    If every entry is NaN the median itself is NaN and the values are
    returned unchanged (still NaN).
    """
    values = np.asarray(y, dtype=float)

    med = np.nanmedian(values)

    # Element-wise fill: unlike appending a bare scalar for a missing row, this
    # keeps each filled row the same shape as its neighbours.
    filled = np.where(np.isnan(values), med, values)

    # Callers received a list before; iterating the array's first axis keeps that.
    return list(filled)


# Median-impute the three single-feature inputs and the target. Each call only
# reads its own argument, so they are evaluated left to right and rebound together.
# NOTE(review): the two-column matrix X_non_white_white_pov is not passed through
# preprocess (that call was disabled); preprocess fills with one median taken
# over the whole input, which would mix both columns.
X_Income, X_white_pov, X_non_white, y_HateCrimes = (
    preprocess(values)
    for values in (X_Income, X_white_pov, X_non_white, y_HateCrimes)
)

In [191]:
# All four splits use the same seed with shuffling enabled.
_split_opts = {"random_state": 42, "shuffle": True}

# Gini index -> hate-crime rate
X_train_gini, X_test_gini, y_train_gini, y_test_gini = train_test_split(
    X_Income, y_HateCrimes, **_split_opts
)

# White-poverty share -> hate-crime rate
X_train_white_pov, X_test_white_pov, y_train_white_pov, y_test_white_pov = train_test_split(
    X_white_pov, y_HateCrimes, **_split_opts
)

# Non-white share -> hate-crime rate
X_train_non_white, X_test_non_white, y_train_non_white, y_test_non_white = train_test_split(
    X_non_white, y_HateCrimes, **_split_opts
)

# Non-white share + white-poverty share -> hate-crime rate
(
    X_train_non_white_white_pov,
    X_test_non_white_white_pov,
    y_train_non_white_white_pov,
    y_test_non_white_white_pov,
) = train_test_split(X_non_white_white_pov, y_HateCrimes, **_split_opts)

# One estimator object, refit in turn by each of the following cells.
model = LinearRegression()


In [192]:
# Refit the shared estimator on the gini-index feature and predict the held-out rows.
model.fit(X_train_gini, y_train_gini)
y_pred_gini = model.predict(X_test_gini)

# "score" here is the fitted slope (first coefficient), not an accuracy metric;
# the R^2 on the test split is printed separately below.
score, intercept = model.coef_[0], model.intercept_

print(f"Income Inequality Index:\nScore: {score}\nIntercept: {intercept}")
print("Test R^2:", model.score(X_test_gini, y_test_gini))


Income Inequality Index:
Score: -0.8657637493232659
Intercept: 0.6826259799985643
Test R^2: -0.1043728024760231


In [193]:
# Refit the shared estimator on the white-poverty feature and predict the held-out rows.
model.fit(X_train_white_pov, y_train_white_pov)
y_pred_white_pov = model.predict(X_test_white_pov)

# "score" here is the fitted slope (first coefficient), not an accuracy metric;
# the R^2 on the test split is printed separately below.
score, intercept = model.coef_[0], model.intercept_

print(f"White Poverty Proportion: \nScore: {score}\nIntercept: {intercept}")
print("Test R^2:", model.score(X_test_white_pov, y_test_white_pov))


White Poverty Proportion: 
Score: -0.4473235413303558
Intercept: 0.334001492914819
Test R^2: 0.03475586647535611


In [194]:
# Refit the shared estimator on the non-white-share feature and predict the held-out rows.
model.fit(X_train_non_white, y_train_non_white)
y_pred_non_white = model.predict(X_test_non_white)

# "score" here is the fitted slope (first coefficient), not an accuracy metric;
# the R^2 on the test split is printed separately below.
score, intercept = model.coef_[0], model.intercept_

print(f"Non-white proportion:\nScore: {score}\nIntercept: {intercept}")
print("Test R^2:", model.score(X_test_non_white, y_test_non_white))


Non-white proportion:
Score: -0.2796864909307416
Intercept: 0.375248951800577
Test R^2: -0.13797825475907954


In [195]:
# Refit the shared estimator on both features (non-white share, white-poverty share).
model.fit(X_train_non_white_white_pov, y_train_non_white_white_pov)

# Store the two-feature predictions under their own name. They were previously
# assigned to y_pred_non_white, which silently replaced the single-feature
# predictions produced by the non-white-only fit.
y_pred_non_white_white_pov = model.predict(X_test_non_white_white_pov)

# With two input columns, coef_ holds one slope per feature, so the whole
# array is reported rather than a single element.
score = model.coef_

intercept = model.intercept_

print(f"Non-white and White_poverty proportions:\nScore: {score}\nIntercept: {intercept}")
print("Test R^2:", model.score(X_test_non_white_white_pov, y_test_non_white_white_pov))


Non-white and White_poverty proportions:
Score: [-0.30144004 -0.88436111]
Intercept: 0.46142277994862835
Test R^2: -0.07118001259481477


In [196]:
# Population variance (ddof=0, numpy's default) of the target values as they
# stand after the preprocessing cell.
hate_crime_var = np.var(y_HateCrimes, ddof=0)

print(f"Hate crime variance across states: {hate_crime_var:.3f}")

Hate crime variance across states: 0.058


In [197]:
from collections import defaultdict

find_similar = defaultdict(list)

# Per state, collect [SPLC rate, FBI rate]. The two frames are walked in
# lockstep so values are appended in the same order as a row-by-row loop.
paired_rows = zip(
    X_states_hatecrime_splc.itertuples(index=False, name=None),
    X_states_hatecrime_fbi.itertuples(index=False, name=None),
)
for (splc_state, splc_rate), (fbi_state, fbi_rate) in paired_rows:
    find_similar[str(splc_state)].append(float(splc_rate))
    find_similar[str(fbi_state)].append(float(fbi_rate))

# Largest |SPLC - FBI| gap (after rounding to 2 decimals) still counted as "similar".
similar_diff = 0.3

# Absolute gap between the two sources for each state. States where either
# rate is missing yield NaN and are dropped here: a NaN gap can never win a
# comparison, and if it were seen first it would block every later update of
# the smallest/greatest trackers.
state_diffs = {}
for k, v in find_similar.items():
    diff = abs(v[0] - v[1])
    if np.isnan(diff):
        continue
    state_diffs[k] = diff

# Defaults are reported unchanged when no state has both rates available.
smallest_state = ""
greatest_state = ""
smallest_diff = -1
greatest_diff = -1

if state_diffs:
    # min/max return the first key encountered on ties, i.e. the earliest state.
    smallest_state = min(state_diffs, key=state_diffs.get)
    greatest_state = max(state_diffs, key=state_diffs.get)
    smallest_diff = state_diffs[smallest_state]
    greatest_diff = state_diffs[greatest_state]

similar_states = [
    state for state, gap in state_diffs.items() if round(gap, 2) <= similar_diff
]


print(f"State w/ smallest diff: {smallest_state}: {smallest_diff:.3f}\nState w/ greatest diff: {greatest_state}: "
      f"{greatest_diff:.3f}")

print(f"States with similar differences: {similar_states}")

State w/ smallest diff: Iowa: 0.107
State w/ greatest diff: District of Columbia: 9.431
States with similar differences: ['Georgia', 'Iowa', 'Pennsylvania']
