In [None]:
# # if you do not have the folder to begin with:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd content/drive/MyDrive
#!git clone https://github.com/MadBeignet/MadBeignet.github.io

In [None]:
#%cd../../../

In [None]:
# # first, mount your google drive, change to the course folder, pull latest changes, and change to the lab folder.
# from google.colab import drive
# drive.mount('/content/drive',force_remount=True)
# %cd content/MadBeignet.github.io
# !git pull
# %cd Data

In [None]:
%cd './Data'

In [None]:
# imports
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import numpy as np
from matplotlib.pyplot import figure
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

Team: Merrilee Montgomery and Maddie Wisinski


Website Link: https://madbeignet.github.io/

<h1>Project Goals</h1>
<p>The team will be looking at the relationship between political participation and political resistance in the United States from 2000-2021 by state.</p> <p>To measure political participation, the team will use voter turnout statistics by state collected by the Election Project. The Election Project website derives all its data from individual state websites.</p><p>This project will distinguish between violent and nonviolent political resistance. To measure nonviolent political resistance, this group will use protest frequency and size from Count Love, a group from MIT that began tracking protests amidst the 2017 Women's March. To study violent political resistance, this project will use Profiles of Individual Radicalization in the US (PIRUS) from the University of Maryland National Consortium for the Study of Terrorism and Responses to Terrorism (START). The PIRUS dataset contains information about individuals whose radicalization became apparent through their plotting to engage in violent activity.</p>

Election Project: https://www.electproject.org/home

Count Love: https://countlove.org/faq.html

PIRUS: https://www.start.umd.edu/data-tools/profiles-individual-radicalization-united-states-pirus


<h1>Voter Turnout: 2000-2022</h1>

<h2>Cleaning the Data</h2><p>The Election Project collects voter turnout data for the general elections that occur every two years; the data comes in separate CSVs by year. Here we want to read all by-year files into a single DataFrame. To do so, we must account for the following:</p> 

1.   Years 2000-2010 are in a uniform format, but missing state abbreviation.
2.   Years 2012-2020 have an extra column of state abbreviations that can be used to create an index value consisting of Year and State Abbreviation.
3.   Years 2016-2020 have notes at the end of each csv that must be deleted.

<h4>Step 1: Concatenate years 2000-2010</h4>

In [None]:
# Column names for the 2000-2010 files: 14 data columns plus the 'Year' column added below.
turnout_columns = ['Region', 'VEP Total Ballots Counted', 'VEP Highest Office', 'VAP Highest Office', 'Total Ballots Counted', 'Highest Office', 'Voting-Eligible Population (VEP)', 'Voting-Age Population (VAP)', '% Non-citizen', 'Prison', 'Probation', 'Parole', 'Total Ineligible Felon', 'Overseas Eligible', 'Year']

# Read every even-year file from 2000 through 2010 and tag its rows with the election year.
yearly_frames = []
for year in range(2000, 2012, 2):
  csv_temp = pd.read_csv("./Voter_Turnouts/"+str(year)+" November General Election - Turnout Rates.csv",
                         header = None,
                         skiprows = 2)#first two rows are headers in csv
  csv_temp['Year'] = year
  yearly_frames.append(csv_temp)
print(len(yearly_frames) == 6)#True when all six files (2000-2010) were loaded

# Concatenate once instead of growing the frame inside the loop.
csv_final = pd.concat(yearly_frames, ignore_index = True)
# Name the columns on csv_final itself: the following cells keep concatenating
# onto csv_final and look columns up by name, so it must not be left with the
# positional 0..13 labels that header=None produces.
csv_final.columns = turnout_columns
final_df = csv_final.copy()

final_df

<h4>Step 2: Drop State Abbreviations and Excess Rows</h4>


<p>To concatenate Voter Turnout from years 2012-2022, we have to remove the abbreviation column and any excess rows (which are usually methodology notes.)</p>

In [None]:
# 2012 and 2014 share the 2000-2010 layout plus a trailing 'State Abv' column.
names_with_abv = ['Region', 'VEP Total Ballots Counted', 'VEP Highest Office', 'VAP Highest Office', 'Total Ballots Counted', 'Highest Office', 'Voting-Eligible Population (VEP)', 'Voting-Age Population (VAP)', '% Non-citizen', 'Prison', 'Probation', 'Parole', 'Total Ineligible Felon', 'Overseas Eligible', 'State Abv']
for year in range(2012, 2016, 2):
  csv_temp = (
      pd.read_csv("./Voter_Turnouts/"+str(year)+" November General Election - Turnout Rates.csv",
                  header = None,
                  skiprows = 2,#first two rows are headers in csv
                  names = names_with_abv)
      .iloc[:52]                   # keep the first 52 rows; anything after them is excess (notes)
      .drop(columns='State Abv')   # earlier years have no abbreviation column
      .assign(Year=year)
  )
  # Chaining returns new frames, so nothing is written into an iloc slice in place.
  csv_final = pd.concat([csv_final,csv_temp],ignore_index=True)

csv_final

<p>After 2014, Voter Turnout Data Column names and values vary more. As a result, we must clean each individual dataset to concatenate</p>
<p></p>
<h4>2018</h4>

In [None]:
temp_18 = pd.read_csv("./Voter_Turnouts/2018 November General Election - Turnout Rates.csv",
                      names =['Region', 'Estimated or Actual 2018 Total Ballots Counted VEP Turnout Rate', '2018 Vote for Highest Office VEP Turnout Rate', 'Status', 'Source', 'Estimated or Actual 2018 Total Ballots Counted', '2018 Vote for Highest Office', 'Voting-Eligible Population (VEP)', 'Voting-Age Population (VAP)', '% Non-citizen', 'Prison', 'Probation', 'Parole', 'Total Ineligible Felon', 'Overseas Eligible', 'State Abv'],
                      skiprows=2,
                      header = None)
# These three columns have no counterpart in the accumulated frame.
temp_18 = temp_18.drop(columns=['Source', 'Status', 'State Abv'])
# The 2018 file carries no 'VAP Highest Office' column, so it is removed from the
# accumulated frame as well. errors='ignore' lets this cell be re-run after the
# column is already gone instead of raising KeyError.
csv_final = csv_final.drop(columns='VAP Highest Office', errors='ignore')
# Rename the remaining 2018 columns to the shared names (same order as in the file).
temp_18.columns = ['Region', 'VEP Total Ballots Counted', 'VEP Highest Office',
       'Total Ballots Counted', 'Highest Office',
       'Voting-Eligible Population (VEP)', 'Voting-Age Population (VAP)',
       '% Non-citizen', 'Prison', 'Probation', 'Parole',
       'Total Ineligible Felon', 'Overseas Eligible']
temp_18['Year']=2018
temp_18 = temp_18.iloc[:52]  # rows after the first 52 are notes
csv_final = pd.concat([csv_final,temp_18],ignore_index = True)
csv_final


In [None]:
# Scratch helper: turn a header line copied from a CSV into a list of column names.
l = 'Region,Source,Status,Total Ballots Counted (Estimate),Vote for Highest Office (President),VEP Turnout Rate (Total Ballots Counted),VEP Turnout Rate (Highest Office),Voting-Eligible Population (VEP),Voting-Age Population (VAP),% Non-citizen,Prison,Probation,Parole,Total Ineligible Felon,Overseas Eligible,State Abv'
lis = l.split(sep=',')
print(lis)


<h4>2016</h4>

In [None]:
# Load 2016, discard the columns the accumulated frame does not have, and tag the year.
temp_16 = (
    pd.read_csv("./Voter_Turnouts/2016 November General Election - Turnout Rates.csv",
                names =['Region', 'State Results Website', 'Status', 'VEP Total Ballots Counted', 'VEP Highest Office', 'VAP Highest Office', 'Total Ballots Counted (Estimate)', 'Highest Office', 'Voting-Eligible Population (VEP)', 'Voting-Age Population (VAP)', '% Non-citizen', 'Prison', 'Probation', 'Parole', 'Total Ineligible Felon', 'Overseas Eligible', 'State Abv'],
                skiprows=2,
                header = None)
    .drop(columns=['Status', 'State Results Website', 'State Abv', 'VAP Highest Office'])
    .assign(Year=2016)
)
# Column positions now line up with csv_final, so its names are adopted wholesale.
temp_16.columns = csv_final.columns
temp_16 = temp_16.iloc[:52]  # only the first 52 rows are kept
csv_final = pd.concat([csv_final, temp_16], ignore_index=True)
csv_final


<h4>2020</h4>

In [None]:
# The 2020 file lists the raw counts before the turnout rates, so the columns are
# selected in the order used by the accumulated frame. Selecting by name also
# leaves out 'Source', 'Status' and 'State Abv', so no separate drops are needed.
temp_20 = (
    pd.read_csv("./Voter_Turnouts/2020 November General Election - Turnout Rates.csv",
                names =['Region', 'Source', 'Status', 'Total Ballots Counted (Estimate)', 'Vote for Highest Office (President)', 'VEP Turnout Rate (Total Ballots Counted)', 'VEP Turnout Rate (Highest Office)', 'Voting-Eligible Population (VEP)', 'Voting-Age Population (VAP)', '% Non-citizen', 'Prison', 'Probation', 'Parole', 'Total Ineligible Felon', 'Overseas Eligible', 'State Abv'],
                skiprows=2,
                header = None)
    [['Region','VEP Turnout Rate (Total Ballots Counted)','VEP Turnout Rate (Highest Office)','Total Ballots Counted (Estimate)', 'Vote for Highest Office (President)', 'Voting-Eligible Population (VEP)', 'Voting-Age Population (VAP)', '% Non-citizen', 'Prison', 'Probation', 'Parole', 'Total Ineligible Felon', 'Overseas Eligible']]
    .assign(Year=2020)                       # new frame: no write into a column subset
    .set_axis(csv_final.columns, axis=1)     # adopt the shared column names positionally
    .iloc[:52]                               # only the first 52 rows are kept
)
csv_final = pd.concat([csv_final,temp_20],ignore_index = True)
csv_final

In [None]:
# Remove every '*' from the region names (values are stringified first).
states_cleaned = [str(region).replace('*', '') for region in csv_final.Region]
csv_final['Region'] = states_cleaned

pd.unique(csv_final.Region)

<h2>Radicalized Individuals in the United States</h2><p>This data set is collected on the individual level. Because we are examining trends on the state level, we will save this Data grouped to the individuals' origin states.</p>

<h4>1. Loading the PIRUS Data</h4><p>The PIRUS data measures 145 categorical and quantitative variables that do not load nicely into Colab. We have taken the first header line from the CSV and split it into a list that can be passed as column names for the CSV.</p>

In [None]:
# Header line of the PIRUS CSV. It is written as adjacent string literals so that no
# name is split by a physical line break (a plain quoted string cannot span lines,
# and a break inside "Close_Family" would corrupt that column name).
a = (
    "Subject_ID,Loc_Plot_State1,Loc_Plot_City1,Loc_Plot_State2,Loc_Plot_City2,Date_Exposure,Plot_Target1,Plot_Target2,Plot_Target3,Attack_Preparation,Op_Security,Changing_Target,Anticp_Fatals_Targ,Internet_Use_Plot,Extent_Plot,Violent,Criminal_Severity,Criminal_Charges,Indict_Arrest,Current_Status,"
    "Group_Membership,Terrorist_Group_Name1,Terrorist_Group_Name2,Terrorist_Group_Name3,Actively_Recruited,Recruiter1,Recruiter2,Recruiter3,Actively_Connect,Group_Competition,Role_Group,Length_Group,Clique,Clique_Radicalize,Clique_Connect,"
    "Internet_Radicalization,Media_Radicalization,Social_Media,Social_Media_Frequency,Social_Media_Platform1,Social_Media_Platform2,Social_Media_Platform3,Social_Media_Platform4,Social_Media_Platform5,Social_Media_Activities1,Social_Media_Activities2,Social_Media_Activities3,Social_Media_Activities4,Social_Media_Activities5,Social_Media_Activities6,Social_Media_Activities7,"
    "Radicalization_Islamist,Radicalization_Far_Right,Radicalization_Far_Left,Radicalization_Single_Issue,Ideological_Sub_Category1,Ideological_Sub_Category2,Ideological_Sub_Category3,Loc_Habitation_State1,Loc_Habitation_City1,Loc_Habitation_State2,Loc_Habitation_City2,Itinerant,External_Rad,Rad_duration,Radical_Behaviors,Radical_Beliefs,"
    "US_Govt_Leader,Foreign_Govt_Leader,Event_Influence1,Event_Influence2,Event_Influence3,Event_Influence4,Beliefs_Trajectory,Behaviors_Trajectory,Radicalization_Sequence,Radicalization_Place,Prison_Radicalize,"
    "Broad_Ethnicity,Age,Marital_Status,Children,Age_Child,Gender,Religious_Background,Convert,Convert_Date,Reawakening,Reawakening_Date,Citizenship,Residency_Status,Nativity,Time_US_Months,Immigrant_Generation,Immigrant_Source,Language_English,Diaspora_Ties,"
    "Education,Student,Education_Change,Employment_Status,Change_Performance,Work_History,Military,Foreign_Military,Social_Stratum_Childhood,Social_Stratum_Adulthood,Aspirations,Abuse_Child,Abuse_Adult,Abuse_type1,Abuse_Type2,Abuse_Type3,Psychological,Alcohol_Drug,Absent_Parent,Overseas_Family,Close_Family,"
    "Family_Religiosity,Family_Ideology,Family_Ideological_Level,Prison_Family_Friend,Crime_Family_Friend,Radical_Friend,Radical_Family,Radical_Signif_Other,Relationship_Troubles,Platonic_Troubles,Unstructured_Time,Friendship_Source1,Friendship_Source2,Friendship_Source3,Kicked_Out,"
    "Previous_Criminal_Activity,Previous_Criminal_Activity_Type1,Previous_Criminal_Activity_Type2,Previous_Criminal_Activity_Type3,Previous_Criminal_Activity_Age,Gang,Gang_Age_Joined,Trauma,Other_Ideologies,Angry_US,Group_Grievance,Standing"
)
def listify(mis_string):
  """Split a comma-separated header string into a list of column names."""
  return mis_string.split(",")
pirus_headlist = listify(a)
print(pirus_headlist)

In [None]:
# Load the PIRUS export using the hand-built column names.
# NOTE(review): with header=1 pandas treats the file's second line as the header
# row, so the first line is skipped as well - confirm the CSV really has two
# non-data lines at the top, otherwise the first record is being dropped.
pirus_temp = pd.read_csv(
    "./PIRUS_May2020/PIRUS_Public_May2020.csv",
    names=pirus_headlist,
    header=1,
)
pirus_temp

<h4>2. Filtering and Grouping By States</h4><p> We will only examine individuals radicalized since 2000 because the voter data covers 2000-2020 and the protest data covers 2017-2021. (This data starts in 1948 and goes through 2018.)

We will also use value_counts to determine the states from which the radicalized individuals originate. We are assuming that this is also the state in which the individual is most likely to engage in popular protest and vote.</p>

In [None]:
#Date_Exposure is stored as text, so it cannot be compared as a date directly.
#Derive an integer 'Year' from whatever follows the last '/' in each value.
# NOTE(review): the <= 22 cut-off below assumes that trailing piece is a two-digit year - confirm.
exposure_years = [int(stamp.split('/')[-1]) for stamp in pirus_temp['Date_Exposure']]
pirus_temp['Year'] = exposure_years
#Rows whose 'Year' is at most 22 are treated as 2000s cases and kept for this study
in_scope = pirus_temp['Year'] <= 22
pirus_temp = pirus_temp[in_scope]
#Count individuals per habitation state
pirus_states_since_2000 = pirus_temp.value_counts('Loc_Habitation_State1')
pirus_states_since_2000.plot(kind='bar', figsize=(20, 10))

<h2>Protests in the United States</h2><p>This data set is collected on the event level. Because we are examining trends on the state level, we will save this data grouped by the protest event location. It is worth noting that popular protest often spreads. This data is harvested by web-crawling for news articles and similar media referencing the protest at a location. Therefore, protests that happened in waves, such as those in response to George Floyd's murder, will appear multiple times. However, we will still count these as separate events, even if such events are related.</p><p>Because this data set only covers 4 years, we do not have to filter it. We will only group it by state.</p>

In [None]:
# Load the Count Love protest export with explicit column names.
# NOTE(review): header=1 makes pandas skip the file's first line and replace the
# second with these names - confirm the CSV has two non-data lines at the top.
protests_temp = pd.read_csv(
    "./Protests/data.csv",
    names=['Date', 'Location', 'Attendees',
           'Event (legacy; see tags)', 'Tags',
           'Curated', 'Source', 'Total_Articles'],
    header=1,
)
protests_temp.head(20)

In [None]:
#To group by state we need a state attribute (extracted like the year was), so first build a dictionary mapping each abbreviation to its state name.
states = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado","Connecticut","District of Columbia","Delaware","Florida","Georgia","Guam","Hawaii","Idaho","Illinois","Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland","Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana","Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York","North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania","Puerto Rico","Rhode Island","South Carolina","South Dakota","Tennessee","Texas","United States","Utah", "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]
abbrev = ["AL","AK","AZ","AR","CA","CO","CT","DC","DE","FL","GA","GU","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","PR","RI","SC","SD","TN","TX","US","UT","VT","VA","WA","WV","WI","WY"]
# The two lists are parallel: position k of abbrev names position k of states.
states_dict = dict(zip(abbrev, states))
print(states_dict)
#create list of state names
protests_temp


In [None]:
#Create a list that can be added as a column to the DataFrame, representing the state the protest took place in.

# Locations without a usable state suffix, resolved by hand.
special_locations = {
    'La Porte County Courthouse in La Porte': 'Indiana',
    'Space': 'New York',
}

def location_to_state(location):
    """Return the state name for one 'Location' value, or None if it cannot be resolved.

    Locations containing a comma are expected to end in a two-letter state
    abbreviation; an unknown abbreviation there raises KeyError so that bad data
    is noticed. Locations without a comma are resolved through
    `special_locations`, or through a trailing upper-case two-letter word.
    """
    if not isinstance(location, str):
        return None  # missing location
    parts = location.split(',')
    if len(parts) >= 2:
        return states_dict[parts[-1].strip()[-2:].upper()]
    text = location.strip()
    if text in special_locations:
        return special_locations[text]
    # Only accept a stand-alone, upper-case suffix so that ordinary words ending
    # in e.g. "ma" are not mistaken for a state abbreviation.
    suffix = re.search(r'(?:^|\s)([A-Z]{2})$', text)
    return states_dict.get(suffix.group(1)) if suffix else None

# Exactly one entry per row, so the new column always lines up with the frame.
l = [location_to_state(val) for val in protests_temp['Location']]
protests_temp['State'] = l
#protests_temp

In [None]:
protests_by_state = protests_temp.value_counts('State')
protests_by_state.plot.barh(figsize=(10,15))

<p>From this bar chart, we can see that California has the highest number of protests. California also had the highest number of radicalized individuals.</p>

In [None]:
protests_by_state.mean()

We can also see that the mean number of protests by state is 718. 

<h1>Population Data</h1>
Source: Census Bureau. 
Notes: The years 2020 and 2021 were in different files, so had to join them.

In [None]:
pop20_21 = (
    pd.read_csv('./Population/2020-2021 Census Bureau Population.csv')
    #Rename columns due to header reading error
    .rename(columns={'Population Estimate\n (as of July 1)':'2020','Unnamed: 3':'2021'})
    #Drop the first 6 rows because they are aggregates
    .iloc[6:]
    # Own copy, so the column assignment below does not write into a slice of the
    # original frame. The original row labels are kept on purpose.
    .copy()
)
# Drop the first character of every area name.
# NOTE(review): presumably a leading '.' in the Census export - confirm against the file.
pop20_21['Geographic Area'] = [area[1:] for area in pop20_21['Geographic Area']]
pop20_21.head()

In [None]:
pop10_19 = (
    pd.read_csv('./Population/nst-est2019-01.csv')
    #Rename columns due to header reading error (each source column listed once)
    .rename(columns={'Population Estimate (as of July 1)':'2010','Unnamed: 2':'Estimates Base','Unnamed: 4':'2011','Unnamed: 5':'2012','Unnamed: 6':'2013','Unnamed: 7':'2014','Unnamed: 8':'2015','Unnamed: 9':'2016','Unnamed: 10':'2017','Unnamed: 11':'2018','Unnamed: 12': '2019'})
    #Drop the first 6 rows because they are aggregates
    .iloc[6:]
    # Own copy, so the column assignment below does not write into a slice of the
    # original frame. The original row labels are kept on purpose.
    .copy()
)
# Drop the first character of every area name.
# NOTE(review): presumably a leading '.' in the Census export - confirm against the file.
pop10_19['Geographic Area'] = [area[1:] for area in pop10_19['Geographic Area']]
pop10_19.head()

In [None]:
# Build a new frame instead of aliasing pop10_19: the 2010-2019 table stays intact
# and this cell can be re-run without a KeyError on the already-dropped columns.
# The 2020/2021 values are matched to rows by index label, as a column assignment would.
total_population = (
    pop10_19
    .drop(columns=['April 1, 2010', 'Estimates Base'])
    .assign(**{'2020': pop20_21['2020'], '2021': pop20_21['2021']})
)

In [None]:
total_population.columns

In [None]:
def df_creation(row):
  """Turn one row of total_population into a two-column (Population, Year) frame.

  The row's first entry (the area name) is skipped; the remaining values are
  paired positionally with total_population's remaining column labels.
  """
  values = list(row)
  return pd.DataFrame({'Population': values[1:],
                       'Year': total_population.columns[1:]})

# One small frame per area, keyed by the area name in the row's first position.
S_pop = {list(row)[0]: df_creation(row) for _, row in total_population.iterrows()}



In [None]:
ax=S_pop['Alabama'].plot(x='Year',y='Population')

In [None]:
total_population

In [None]:
years = list(range(2010, 2022))
year_cols = [str(year) for year in years]

# Growth of every year relative to the 2010 estimate, as a fraction (2010 itself is 0).
per_population_growth = total_population.copy()
per_population_growth[year_cols] = total_population[year_cols].div(total_population['2010'], axis=0) - 1
per_population_growth = per_population_growth.drop(columns=['2010'])

per_population_growth.set_index('Geographic Area').transpose().plot(figsize=(10,15))
plt.xlabel("Years")
# The plotted values are growth ratios, not head counts.
plt.ylabel("Population Growth Relative to 2010")
plt.title("Population Growth Over Time For Each State")
plt.grid(linestyle=':')

# Legend entries sorted by 2021 growth, largest first. Handles come back in row
# order, so the sorted row labels are translated to row positions explicitly
# rather than by assuming the index starts at a fixed offset.
handles, labels = plt.gca().get_legend_handles_labels()
ranked_labels = per_population_growth['2021'].sort_values(ascending=False).index
order = per_population_growth.index.get_indexer(ranked_labels)

plt.legend([handles[idx] for idx in order],[labels[idx] for idx in order],bbox_to_anchor=(1., 1.0), fancybox=True, shadow=True, ncol=1)



**Percent Population Growth by State**

Above is the population growth of each state relative to its population in 2010, expressed as a fraction (0.05 means 5% growth). Every state would sit at 0 for the year 2010, so 2010 is not included. The legend is sorted by the final (2021) value, so it's easier to compare each state's line and to see which state grew the most proportionally over the decade. This is important because the population growth of a state will affect the number of protests and the number of radicalized individuals, and therefore the number of protests per radicalized individual.

In [None]:
pd.pivot_table(total_population, index='Geographic Area').head()

<h1>Merging Data</h1><p>Both protest and radicalization measure resistance to social or governmental structures. Therefore, it makes sense to join aspects of the data into a very simple table to compare radicalization and protest activity. We will not merge the full datasets on the 'State' attribute, because both have such a large number of variables that the resulting table would be unwieldy.</p>

In [None]:
# Start from the per-state radicalization counts (their index becomes the frame's
# index), then add the protest counts matched on that index.
resistance_data = (
    pirus_states_since_2000
    .to_frame('Radicalized_num')
    .assign(Protest_num=protests_by_state)
)
resistance_data.head()

<p>We can look at the relationship now between the number of protests in a state and the number of radicalized individuals in a state. Unsurprisingly, there is a visually obvious correlation. 

In [None]:
# Name the index 'State'. rename() with a plain mapping targets index *labels*,
# so it never touched the index name; rename_axis is the call that does.
resistance_data = resistance_data.rename_axis('State')
resistance_data.plot(kind='scatter',
                     y='Radicalized_num',
                     x='Protest_num',
                     ylabel = "Number of People Radicalized",
                     xlabel = "Number of Protests",
                     figsize=(10,8),
                     alpha=0.4,
                     color='purple',
                     s=30)

<p>We can compute the correlation between these two variables as follows:</p>

In [None]:
resistance_data['Protest_num'].corr(resistance_data['Radicalized_num'])

<p>This is a significant, but unsurprising 
correlation. We can represent the population size of the state through the dot size. </p>

In [None]:
# Attach each state's 2021 population. The right join keeps every area in the
# population table, including those with no radicalization/protest counts.
resistance_data_merged = (
    resistance_data
    .reset_index()
    .rename(columns={'Loc_Habitation_State1':'State'})
    .merge(
        total_population.rename(columns={'Geographic Area':'State',"2021":"Population"})[["State","Population"]],
        on='State',
        how="right",
    )
    .set_index("State")
)
resistance_data_merged.head()

In [None]:
resistance_data_merged["Population"]

In [None]:
# Log-log scatter of protests vs. radicalized individuals, one labelled dot per state.
resistance_data_merged.plot(kind='scatter',
                     x='Protest_num',
                     y='Radicalized_num',
                     xlabel = "Number of Protests",
                     ylabel = "Number of People Radicalized",
                     title="Number of People Radicalized vs Number of Protests",
                     figsize=(10,8),
                     alpha=0.4,
                     color='purple',
                     s=100)
plt.xscale("log")
plt.yscale("log")
# Flatten once, then read the three parallel lists off the same frame.
flat = resistance_data_merged.reset_index()
x_vals = list(flat["Protest_num"])
y_vals = list(flat["Radicalized_num"])
states = list(flat["State"])
for x_pos, y_pos, label in zip(x_vals, y_vals, states):
    plt.text(x_pos, y_pos, label, fontsize=8)



<h3>Protests and Radicalized Individuals Based On State</h3>
<p>It is unsurprising to see the largest states have both the most radicalized individuals as well as protests, so a better perspective would be to normalize each of those values based on population. Below will give a better view on each state's participation in politics</p>

In [None]:
# Per-capita version of the merged table: both counts divided by the state's population.
resistance_data_normalized = resistance_data_merged.copy()
for count_col in ("Protest_num", "Radicalized_num"):
    resistance_data_normalized[count_col] = resistance_data_normalized[count_col]/resistance_data_normalized["Population"]
resistance_data_normalized.plot(kind='scatter',
                     y='Radicalized_num',
                     x='Protest_num',
                     ylabel = "Number of People Radicalized (Normalized by Population)",
                     xlabel = "Number of Protests (Normalized by Population)",
                     title="Number of People Radicalized vs Number of Protests (Normalized by Population)",
                     figsize=(10,8),
                     alpha=0.4,
                     color='purple',
                     # Marker area scales with population. Plain division replaces
                     # apply({lambda ...}), which passed a *set* to Series.apply.
                     s=(resistance_data_normalized["Population"] / 1e4).to_numpy())
plt.xscale("log")
plt.yscale("log")
x_vals = list(resistance_data_normalized.reset_index()["Protest_num"])
y_vals = list(resistance_data_normalized.reset_index()["Radicalized_num"])
# Labels come from this frame's own index rather than a list left over from another cell.
state_labels = list(resistance_data_normalized.index)

for x_pos, y_pos, label in zip(x_vals, y_vals, state_labels):
    plt.text(x=x_pos, y=y_pos, s=label, fontsize=7)

<h3>Normalized Protests and Radicalized Individuals</h3>
<p>One flaw with the graph above was that it meant nothing. When comparing large sets of data belonging to different areas, it's important to normalize them by some factor, so that the data is proportional instead of its regular value. After normalizing the data, a very interesting but telling picture shows. DC is in the top right, making it the most participatory "state" in the United States. This makes sense because it's home to the White House, and many protests likely occur here by others outside of DC. The number of protests in proportion to its population as just a city make it key for political involvement.</p>

<h1>Models</h1>

<h3>Model 1: Preface</h3><p>The Protest data Attendance column is missing values for many events. We will build a model to predict what the attendance would have been based on the issues the protest addressed, the state protest took place in, and the proportion of the radicalized individuals from that state.<p>First, let's look at what issues people protest about most often.

In [None]:
# Split every protest's ';'-separated tag string once, then derive both summaries from it.
split_tags = [tag.split(';') for tag in protests_temp['Tags']]
count = [len(parts) for parts in split_tags]                        # tags per protest
final = pd.unique([t for parts in split_tags for t in parts])       # distinct raw tags
print(pd.unique(count))

In [None]:
protests_temp.head()

In [None]:
# .copy() makes this an independent frame, so the column edits made to it in later
# cells are not writes into a column subset of protests_temp.
protests_iss = protests_temp[['Date','Location','Event (legacy; see tags)', 'Attendees','State','Tags']].copy()
# Intentionally the same object under a second name: later cells modify
# protests_iss_known and then read the result back through protests_iss.
protests_iss_known = protests_iss

In [None]:
# Column-name placeholders 'Tag1' .. 'Tag8'.
l_tags = ['Tag' + str(position) for position in range(1, 9)]

In [None]:
# protests_iss_known was assigned from protests_iss without a copy, so this
# in-place rename is visible under both names.
protests_iss_known.rename(columns={'Event (legacy; see tags)':'Event'},inplace=True)

In [None]:
# (compiled pattern, canonical label) pairs, tried against every ';'-separated tag.
# Kept as a single table so a pattern and its label cannot drift apart.
_TAG_PATTERNS = [
    (re.compile(r"\s*([Rr]acial)"), "Racial"),
    (re.compile(r'\s*(45)'), "45th President"),
    (re.compile(r"\s*([Gg]un\s[Rr]ights)"), "Gun Rights"),
    (re.compile(r"\s*([Gg]un\s[Cc]ontrol)"), "Gun Control"),
    (re.compile(r"\s*([Oo]ther)"), 'Other'),
    (re.compile(r"\s*([Ee]nvironment)"), 'Environment'),
    (re.compile(r"\s*([Ee]ducation)"), "Education"),
    (re.compile(r'\s*([Hh]ealthcare)'), 'Healthcare'),
    (re.compile(r"\s*([Ii]mmigration)"), 'Immigration'),
    (re.compile(r"\s*([Ee]xecutive)"), 'Executive'),
    (re.compile(r"\s*([Ii]nternational\s[Rr]elations)"), 'International Relations'),
    (re.compile(r"\s*([Ll]egislative)"), 'Legislative'),
    (re.compile(r"\s*([Cc]ivil\s[Rr]ights)"), 'Civil Rights'),
]

def categorizer(word):
    """Map a ';'-separated tag string to the set of broad issue labels it mentions.

    Parameters
    ----------
    word : str
        Raw tag string of one protest, tags separated by ';'.

    Returns
    -------
    set of str
        Every label whose pattern occurs in at least one tag; empty when
        nothing matches.
    """
    labels = set()
    for tag in word.split(';'):
        for pattern, label in _TAG_PATTERNS:
            if pattern.search(tag):
                labels.add(label)
    return labels

In [None]:
# Replace the legacy 'Event' text with the set of broad issue labels found in 'Tags'.
events = [categorizer(tags) for tags in protests_iss_known["Tags"]]
protests_iss_known["Event"] = events
# Sets are unhashable, so label combinations are counted through their frozenset form.
event_combo_counts = protests_iss_known["Event"].map(frozenset).value_counts()
Common_Events = event_combo_counts.head(12).keys()
print(len(event_combo_counts))  # number of distinct label combinations

In [None]:
def overlapping_value_count(df,return_dict):
    """Count how often each label occurs across the 'Event' collections of `df`.

    A row carrying several labels contributes one count to each of them, so the
    counts can sum to more than the number of rows.

    Parameters
    ----------
    df : DataFrame
        Must have an 'Event' column whose entries are iterables of labels.
    return_dict : dict
        Running tally, updated in place; pass ``{}`` to start from zero.

    Returns
    -------
    DataFrame
        Indexed by 'Tag' with a single 'Count' column (empty when there is
        nothing to count).
    """
    for labels in df['Event']:
        for label in labels:
            return_dict[label] = return_dict.get(label, 0) + 1
    # Passing the column names to the constructor keeps the empty-tally case valid;
    # assigning two names to a column-less frame would raise.
    ret_val = pd.DataFrame(list(return_dict.items()), columns=['Tag','Count'])
    return ret_val.set_index('Tag')
tag_counts = overlapping_value_count(protests_iss_known,{})
tag_counts.plot(y='Count',kind='pie',figsize=(10,10),fontsize=10,legend=True,title='Protest Topics',colors=sns.color_palette('tab20'))

<p>People protest many different issues. Let's look at the top 50.</p>

In [None]:
#How to select out certain Protest issues, when Event attribute is saved to a list:
"""'Racial' in protests_iss_known.iloc[0].Event
protests_iss_known[protests_iss_known['Event']&{'Racial'}]"""

<p>Just from looking at this chart, it looks like civil rights, racial justice, guns, and immigration are major issues that people protest about..</p><p>Let's also look at the relationship between the time of year that the protests occur and the number of attendees. We will have to drop rows that do not have attendees listed, and convert the data column to a datetime object. 

In [None]:
# Parse the protest dates, then split the events by whether attendance was recorded.
protests_iss_known['Date'] = pd.to_datetime(protests_iss_known['Date'])
attendees = protests_iss['Attendees']
# A value that is not equal to itself is NaN, i.e. attendance is unknown.
protests_real_test = protests_iss[attendees != attendees]
protests_iss_attendees_known = protests_iss.dropna(subset=['Attendees'])

In [None]:
protests_iss_attendees_known.Date.value_counts().plot(figsize=(15,10))

In [None]:
protests_real_test

<h3>Model 1: Building the Model</h3><p>We previously saved the protests with unknown attendees to the DataFrame protests_real_test. Let's revisit that data.</p>

In [None]:
tag_unknown = overlapping_value_count(protests_real_test,{})
tag_unknown.plot(y='Count',kind='pie',figsize=(8,8),colors=sns.color_palette('tab20'))

<p>We can build a K-nearest neighbor predictor of the number of Attendees at a protest based on the issues the protest addressed, the state protest took place in, and the proportion of the radicalized individuals from that state</p>

In [None]:
protests_iss_known

In [None]:
#How to select out certain Protest issues, when the Event attribute holds a set of labels:
def issue_search(issue):
    """Return the rows of protests_iss_known whose 'Event' collection contains `issue`."""
    # Explicit per-row membership test; `Series & {issue}` does not produce a boolean row mask.
    has_issue = protests_iss_known['Event'].map(lambda labels: issue in labels).astype(bool)
    return protests_iss_known[has_issue]
def state_date(row):
    """Return the (Date, State) pair of a protest row."""
    when, where = row.Date, row.State
    return (when, where)
def state_year(row):
    """Return the (year of Date, State) pair of a protest row."""
    year_of_event = row.Date.year
    return (year_of_event, row.State)
def vote_pcnt(tuple):
    """Return the 'VEP Highest Office' turnout (in percent) for a (year, state) pair.

    Odd years are mapped to the preceding even year before the lookup.

    Parameters
    ----------
    tuple : sequence
        ``(year, state)`` where `state` is matched against csv_final.Region.

    Returns
    -------
    float
        The turnout value with any '%' sign removed, or NaN when csv_final has
        no row for that state and year or the value cannot be read as a number.
    """
    year, state = tuple[0], tuple[1]
    if year % 2 != 0:
        year -= 1
    matches = csv_final.loc[(csv_final.Region == state) & (csv_final.Year == year), 'VEP Highest Office']
    if matches.empty:
        # A real NaN (not the string 'NaN') keeps the resulting column numeric.
        return float('nan')
    # Read the cell itself rather than searching the printed form of the Series.
    raw = matches.iloc[0]
    if isinstance(raw, str):
        raw = raw.strip().rstrip('%')
    try:
        return float(raw)
    except (TypeError, ValueError):
        return float('nan')
# Parsed Date_Exposure values, remembered for the pirus_temp object they came from
# so the text dates are not re-parsed on every call.
_exposure_cache = {"frame": None, "dates": None}

def _pirus_exposure_dates():
    """Return pirus_temp.Date_Exposure as datetimes (unparseable values become NaT)."""
    if _exposure_cache["frame"] is not pirus_temp:
        _exposure_cache["dates"] = pd.to_datetime(pirus_temp.Date_Exposure, errors='coerce')
        _exposure_cache["frame"] = pirus_temp
    return _exposure_cache["dates"]

def get_rads_by_population(tuple):
    """Radicalized individuals per resident for a (date, state) pair.

    Counts the pirus_temp rows whose plot state equals `state` and whose
    exposure date lies after 2000-01-01 and before `date`, then divides that
    count by the state's population estimate for the year of `date`.

    Parameters
    ----------
    tuple : sequence
        ``(date, state)`` with `date` a timestamp and `state` a name matched
        against total_population['Geographic Area'].

    Returns
    -------
    Series
        One element holding the ratio; a one-element Series holding NaN when the
        state or the year is not present in total_population.
    """
    date, state = tuple[0], tuple[1]
    unavailable = pd.Series([np.nan])
    if state not in pd.unique(total_population['Geographic Area']):
        return unavailable
    year_col = str(date.year)
    if year_col not in total_population.columns:
        return unavailable
    # Compare real datetimes; Date_Exposure is stored as text.
    exposure = _pirus_exposure_dates()
    in_window = (exposure > pd.Timestamp('2000-01-01')) & (exposure < date)
    # Number of matching individuals (rows), not rows times columns.
    radicals = int((in_window & (pirus_temp.Loc_Plot_State1 == state)).sum())
    population = total_population.loc[total_population['Geographic Area'] == state, year_col]
    # Radicals divided by population, as the function name says; this orientation
    # also stays finite when a state has no radicalized individuals.
    return radicals / population
def to_raw(string):
    """Render ``string`` through a raw f-string and return the resulting str.

    The ``r`` prefix only affects how the literal's own text is parsed; the
    interpolated value is inserted exactly as ``format()`` renders it.
    """
    rendered = fr"{string}"
    return rendered

# Voter turnout for each protest's (year, state). Iterating over the frame's
# own rows avoids depending on a hard-coded row count.
votes = [
    vote_pcnt(state_year(row))
    for row in protests_iss_known[['Date', 'State']].itertuples(index=False)
]
protests_iss_known['State_voters'] = votes

# Issue tags to one-hot encode; each tag is listed once.
all_tags = [
    'Racial', '45th President', 'Gun Rights', 'Gun Control', 'Other',
    'Environment', 'Education', 'Healthcare', 'Immigration', 'Executive',
    'International Relations', 'Legislative', 'Civil Rights',
]
for t in all_tags:
    # Start every protest at 0, then flag the rows issue_search returns for
    # this tag in a single label-based assignment.
    protests_iss_known[t] = 0
    protests_iss_known.loc[issue_search(t).index, t] = 1

In [None]:
# Radicalization feature for each protest's (date, state). Iterating over the
# frame's own rows avoids depending on a hard-coded row count.
rads = []
for row in protests_iss_known[['Date', 'State']].itertuples(index=False):
    rad = get_rads_by_population(state_date(row))
    if isinstance(rad, str):
        # A string sentinel means "no data"; taking its first character
        # would store a letter in the column.
        rads.append(np.nan)
        continue
    values = list(rad)
    # An empty result has no first element; record it as missing.
    rads.append(values[0] if values else np.nan)
protests_iss_known['Radicals'] = rads

In [None]:
# Inspect protests_iss_known after the feature columns have been added.
protests_iss_known

In [None]:
# Rebuild the split from protests_iss_known.
# Rows whose Attendees value is missing: the query keeps rows where Attendees
# is not equal to itself, which holds for NaN.
protests_real_test = protests_iss_known.query('Attendees != Attendees')
# Rows with a non-missing Attendees value.
protests_iss_attendees_known = protests_iss_known.dropna(subset='Attendees')

In [None]:
# Display the protests with a recorded Attendees value.
protests_iss_attendees_known

In [None]:
# Display the protests whose Attendees value is missing.
protests_real_test

In [None]:
"""feats = ["Date", "Event", "State"]
X_dict = protests_iss_known[feats].to_dict(orient="records")
y = protests_iss_known["Attendees"]

# specify the pipeline
kays = []
errors = []
cvs = []
vec = DictVectorizer(sparse=False)
scaler = StandardScaler()
for num in range(10,50,3):
  model = KNeighborsRegressor(n_neighbors=num)
  pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])
  scores = cross_val_score(pipeline, X_dict, y, 
                         cv=5, scoring="neg_mean_squared_error")
  i = 1
  errors.append(np.sqrt(np.mean(-scores)))
  for error in scores:
    kays.append(num)
    cvs.append(i)
    i+=1

for_plot = pd.DataFrame()
for_plot['K-value'] = kays
for_plot['Division'] = cvs
for_plot['Error'] = errors

print(for_plot)"""
#for_plot.groupby('Division')['Error'].plot(kind='line',x='K-value',y='Error',legend=True,figsize=(15,8))


<h3>Model 2</h3><p>Radicalization and Protests Over Time: We will look at the correlation between radicalized individuals and protests over time. Perhaps there are relationships between radicalization on certain issues and more protests on certain issues. For example, we know that internet searches for "Straight pride" peak each year during June, which is Pride Month for LGBTQ+ folks. (https://trends.google.com/trends/explore?date=all&geo=US&q=straight%20pride) Perhaps more discussion around an issue in the form of protests causes more radicalization on the opposing side. We will use time data and issue categories for both radicalized individuals from the PIRUS data and protest events.</p><p>As an exploratory exercise, let's plot both the PIRUS and the protests data over time to see the spikes in activity.</p>

In [None]:
# Scatter of attendance against date for protests with a recorded attendee count.
protests_iss_attendees_known.plot.scatter(
    x='Date',
    y='Attendees',
    figsize=(20, 10),
    s=8,
    c='red',
    alpha=0.2,
)

<p>We see clear spikes in protest participation and protest size.</p>

In [None]:
# Parse Date_Exposure into pandas datetimes.
pirus_temp['Date_Exposure'] = pd.to_datetime(pirus_temp['Date_Exposure'])


In [None]:
# Inspect pirus_temp.
pirus_temp

In [None]:
# Number of pirus_temp rows per exposure date, in chronological order.
# value_counts groups by Date_Exposure itself, so no index change is needed
# beforehand; the explicit sort_index() makes the ordering unambiguous.
rad_counts = pirus_temp.value_counts('Date_Exposure', sort=False).sort_index()
# A Series is plotted against its own index (the dates).
rad_counts.plot(figsize=(20, 10))

#rad_counts.plot(x='Date_Exposure', y = '0')

<p>Now we can plot protests and radicalization on the same axis, though our protest data only starts in 2017. We will have to filter the radicalization data to match.</p>

In [None]:
pirus_temp.Year = pd.to_numeric(pirus_temp.Year)
# Keep exposures from 2017 onward. The filter uses the calendar year of
# Date_Exposure: comparing Year against 17 is only right when Year holds
# two-digit values that all fall in the 2000s.
since_17 = pirus_temp.loc[pd.to_datetime(pirus_temp.Date_Exposure).dt.year >= 2017]
# Display only: the date-indexed view is shown, while since_17 itself keeps
# Date_Exposure as a regular column.
since_17.set_index('Date_Exposure')

In [None]:
# Row counts per exposure date for the since_17 subset.
rad_counts2 = (
    since_17
    .sort_index()
    .value_counts(subset='Date_Exposure', sort=False)
)

In [None]:
# Name the date index "Date" and the counts "freq" explicitly. The default
# name of the counts column after reset_index() differs between pandas
# versions (0 vs "count"), so renaming by that default is fragile.
rad_counts2 = rad_counts2.rename_axis("Date").reset_index(name="freq")

In [None]:
# Inner-join attendance with exposure counts on the dates present in both.
merged_data = protests_iss_attendees_known[["Date", "Attendees"]].merge(
    rad_counts2[["Date", "freq"]],
    how='inner',
    on='Date',
)
merged_data

In [None]:
# Two y-axes over a shared date axis: attendance on the left (blue) and
# exposure counts on the right (red).
fig, ax1 = plt.subplots(figsize=(18.5, 10.5))
ax2 = ax1.twinx()

ax1.scatter(merged_data["Date"], merged_data["Attendees"], c='blue', s=50, alpha=0.4)
ax2.scatter(merged_data["Date"], merged_data["freq"], c='red', s=50, alpha=0.15)
ax1.set_title("Protest Attendance and Radicalization Exposures by Date")
ax1.set_xlabel("Date")
# Colour each y-label like its markers so the two series can be told apart.
ax1.set_ylabel("Attendees", color='blue')
ax2.set_ylabel("Frequency", color='red')

plt.show()


<p>This chart shows the relationship between protest attendance/number and the number of individuals radicalized. We will have to code protests by issue and radicalized individuals by issue to get a better idea of the relationships between radicalization and protests.</p>

<h1>Project Strategy</h1><p>Beyond looking at the aggregate state data, the team will look at relationships between political participation, resistance, and violence over time by year for each state. Additionally, if time permits, the group will look at factors such as party affiliation and interest group affiliation.</p>