## Imports & Data

In [1]:
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, StandardScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
# unzipping the dataset
with zipfile.ZipFile('data/dataverse_files.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

In [3]:
# import data
protests_df = pd.read_csv('data/mmALL_073120_csv.csv')
protests_df.head(2)

Unnamed: 0,id,country,ccode,year,region,protest,protestnumber,startday,startmonth,startyear,...,protesterdemand4,stateresponse1,stateresponse2,stateresponse3,stateresponse4,stateresponse5,stateresponse6,stateresponse7,sources,notes
0,201990001,Canada,20,1990,North America,1,1,15.0,1.0,1990.0,...,,ignore,,,,,,,1. great canadian train journeys into history;...,canada s railway passenger system was finally ...
1,201990002,Canada,20,1990,North America,1,2,25.0,6.0,1990.0,...,,ignore,,,,,,,1. autonomy s cry revived in quebec the new yo...,protestors were only identified as young peopl...


## EDA

In [4]:
protests_df.shape

(17145, 31)

In [5]:
protests_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17145 entries, 0 to 17144
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     17145 non-null  int64  
 1   country                17145 non-null  object 
 2   ccode                  17145 non-null  int64  
 3   year                   17145 non-null  int64  
 4   region                 17145 non-null  object 
 5   protest                17145 non-null  int64  
 6   protestnumber          17145 non-null  int64  
 7   startday               15239 non-null  float64
 8   startmonth             15239 non-null  float64
 9   startyear              15239 non-null  float64
 10  endday                 15239 non-null  float64
 11  endmonth               15239 non-null  float64
 12  endyear                15239 non-null  float64
 13  protesterviolence      15758 non-null  float64
 14  location               15218 non-null  object 
 15  pa

In [6]:
#Years included in the project
print(protests_df['year'].unique())
#Total number of years
print('Total:', len(protests_df['year'].unique()), 'years')

[1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003
 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
 2018 2019 2020]
Total: 31 years


In [7]:
# Show all Regions
protests_df['region'].unique()

array(['North America', 'Central America', 'South America', 'Europe',
       'Africa', 'MENA', 'Asia', 'Oceania'], dtype=object)

In [8]:
#Finf all data under MENA region (Middle East & South Africa)
#protests_df[protests_df['region'].str.contains("MENA")]

In [9]:
#Exploring participants_category feature
protests_df['participants_category'].value_counts()

100-999       3204
50-99         2508
2000-4999     1580
>10000        1470
5000-10000     642
1000-1999      483
Name: participants_category, dtype: int64

In [10]:
#How many countries included in the duniqueset
len(protests_df['country'].unique())

166

In [11]:
# Exploring protesterviolence feature
print(protests_df['protesterviolence'].value_counts() )

print(protests_df['protesterviolence'].unique())

0.0    11723
1.0     4035
Name: protesterviolence, dtype: int64
[ 0.  1. nan]


In [12]:
# Look at 'protesterdemand1' values
protests_df['protesterdemand1'].value_counts()

political behavior, process    9680
labor wage dispute             1710
price increases, tax policy    1087
removal of politician          1011
police brutality                825
land farm issue                 467
social restrictions             458
Name: protesterdemand1, dtype: int64

#### Exploring state response 1-7 (stateresponse) features

In [13]:
# stateresponse1
print(protests_df['stateresponse1'].value_counts())
print ('Number of categories:', len(protests_df['stateresponse1'].unique()))

ignore             8239
crowd dispersal    3935
arrests            1088
accomodation        956
shootings           424
beatings            303
killings            263
Name: stateresponse1, dtype: int64
Number of categories: 8


In [14]:
protests_df['stateresponse1'].unique()

array(['ignore', 'accomodation', 'crowd dispersal', 'arrests',
       'shootings', nan, 'beatings', 'killings'], dtype=object)

In [15]:
# stateresponse2
print(protests_df['stateresponse2'].value_counts())
print ('Number of categories:', len(protests_df['stateresponse2'].unique()))

arrests            764
crowd dispersal    585
shootings          408
beatings           391
accomodation       389
killings           315
ignore              36
Name: stateresponse2, dtype: int64
Number of categories: 8


In [16]:
# stateresponse3
print(protests_df['stateresponse3'].value_counts())
print ('Number of categories:', len(protests_df['stateresponse3'].unique()))

arrests            231
crowd dispersal    223
killings           199
accomodation       100
beatings            85
shootings           81
ignore              11
Name: stateresponse3, dtype: int64
Number of categories: 8


In [17]:
# stateresponse4
print(protests_df['stateresponse4'].value_counts())
print ('Number of categories:', len(protests_df['stateresponse4'].unique()))

accomodation       65
arrests            52
crowd dispersal    45
killings           43
shootings          20
beatings           19
Name: stateresponse4, dtype: int64
Number of categories: 7


In [18]:
# stateresponse5
print(protests_df['stateresponse5'].value_counts())
print ('Number of categories:', len(protests_df['stateresponse5'].unique()))

.                  796
accomodation        16
arrests             14
crowd dispersal      9
killings             7
beatings             4
shootings            3
Name: stateresponse5, dtype: int64
Number of categories: 8


In [19]:
# stateresponse6
print(protests_df['stateresponse6'].value_counts())
print ('Number of categories:', len(protests_df['stateresponse6'].unique()))

accomodation       9
crowd dispersal    3
killings           2
arrests            1
beatings           1
Name: stateresponse6, dtype: int64
Number of categories: 6


In [20]:
# stateresponse7
print(protests_df['stateresponse7'].unique())
print(protests_df['stateresponse7'].value_counts())
print ('Number of categories:', len(protests_df['stateresponse7'].unique()))

[nan '.' 'accomodation' 'arrests' 'killings' 'beatings']
.               913
accomodation      3
arrests           2
killings          1
beatings          1
Name: stateresponse7, dtype: int64
Number of categories: 6


## Data cleaning & preprocessing

In [21]:
# Replacing all '.' with nan
protests_df = protests_df.replace('.', None)

In [22]:
print(protests_df['stateresponse7'].value_counts())
print ('Number of categories:', len(protests_df['stateresponse7'].unique()))

accomodation    3
arrests         2
killings        1
beatings        1
Name: stateresponse7, dtype: int64
Number of categories: 5


In [23]:
# Display stateresponse columns only
resp_column_img = protests_df.iloc[:, 22:29].head(10)
resp_column_img

Unnamed: 0,stateresponse1,stateresponse2,stateresponse3,stateresponse4,stateresponse5,stateresponse6,stateresponse7
0,ignore,,,,,,
1,ignore,,,,,,
2,ignore,,,,,,
3,accomodation,,,,,,
4,crowd dispersal,arrests,accomodation,,,,
5,crowd dispersal,shootings,,,,,
6,ignore,,,,,,
7,ignore,,,,,,
8,arrests,,,,,,
9,ignore,,,,,,


In [24]:
# stateresponse1-7 EDA
all_responses = pd.DataFrame({'stateresponse1': protests_df['stateresponse1'].value_counts(),
                   'stateresponse2': protests_df['stateresponse2'].value_counts(),
                   'stateresponse3': protests_df['stateresponse3'].value_counts(),
                   'stateresponse4': protests_df['stateresponse4'].value_counts(),
                   'stateresponse5': protests_df['stateresponse5'].value_counts(),
                   'stateresponse6': protests_df['stateresponse6'].value_counts(),
                   'stateresponse7': protests_df['stateresponse7'].value_counts()
                             
})
all_responses

Unnamed: 0,stateresponse1,stateresponse2,stateresponse3,stateresponse4,stateresponse5,stateresponse6,stateresponse7
accomodation,956,389,100,65.0,16.0,9.0,3.0
arrests,1088,764,231,52.0,14.0,1.0,2.0
beatings,303,391,85,19.0,4.0,1.0,1.0
crowd dispersal,3935,585,223,45.0,9.0,3.0,
ignore,8239,36,11,,,,
killings,263,315,199,43.0,7.0,2.0,1.0
shootings,424,408,81,20.0,3.0,,


In [25]:
all_responses_transpose = all_responses.transpose()
#all_responses_transpose['response_number'] = all_responses_transpose.index
all_responses_transpose

Unnamed: 0,accomodation,arrests,beatings,crowd dispersal,ignore,killings,shootings
stateresponse1,956.0,1088.0,303.0,3935.0,8239.0,263.0,424.0
stateresponse2,389.0,764.0,391.0,585.0,36.0,315.0,408.0
stateresponse3,100.0,231.0,85.0,223.0,11.0,199.0,81.0
stateresponse4,65.0,52.0,19.0,45.0,,43.0,20.0
stateresponse5,16.0,14.0,4.0,9.0,,7.0,3.0
stateresponse6,9.0,1.0,1.0,3.0,,2.0,
stateresponse7,3.0,2.0,1.0,,,1.0,


In [26]:
all_responses_transpose.rename(columns={'accomodation': 'accommodation'}, inplace=True)

In [27]:
# drop nulls from stateresponse1
protests_df.dropna(subset=['stateresponse1'], how='all', inplace=True)

In [28]:
protests_df.shape
#1937 rows removed.

(15208, 31)

#### Make one column with all state responses

In [29]:
# drop all of the dots
protests_df[protests_df.columns[22:29]] = protests_df[protests_df.columns[22:29]].replace('.', '')

In [30]:
# drop all os the commas
protests_df[protests_df.columns[22:29]] = protests_df[protests_df.columns[22:29]].replace(',', '')

In [31]:
protests_df['stateresponse3'].value_counts()

arrests            231
crowd dispersal    223
killings           199
accomodation       100
beatings            85
shootings           81
ignore              11
Name: stateresponse3, dtype: int64

In [32]:
protests_df[protests_df.columns[22:29]].apply(
    lambda x: ', '.join(x.dropna().astype(str)),
    axis=1
)[:10]

0                                    ignore
1                                    ignore
2                                    ignore
3                              accomodation
4    crowd dispersal, arrests, accomodation
5                crowd dispersal, shootings
6                                    ignore
7                                    ignore
8                                   arrests
9                                    ignore
dtype: object

In [33]:
# Creating a new column combining all state responses:
# protests_df.iloc[:, 22:29]
protests_df['all_responses'] = protests_df[protests_df.columns[22:29]].apply(
    lambda x: ','.join(x.dropna().astype(str).str.strip()),
    axis=1
)

protests_df.head(5)

Unnamed: 0,id,country,ccode,year,region,protest,protestnumber,startday,startmonth,startyear,...,stateresponse1,stateresponse2,stateresponse3,stateresponse4,stateresponse5,stateresponse6,stateresponse7,sources,notes,all_responses
0,201990001,Canada,20,1990,North America,1,1,15.0,1.0,1990.0,...,ignore,,,,,,,1. great canadian train journeys into history;...,canada s railway passenger system was finally ...,ignore
1,201990002,Canada,20,1990,North America,1,2,25.0,6.0,1990.0,...,ignore,,,,,,,1. autonomy s cry revived in quebec the new yo...,protestors were only identified as young peopl...,ignore
2,201990003,Canada,20,1990,North America,1,3,1.0,7.0,1990.0,...,ignore,,,,,,,1. quebec protest after queen calls for unity ...,"the queen, after calling on canadians to remai...",ignore
3,201990004,Canada,20,1990,North America,1,4,12.0,7.0,1990.0,...,accomodation,,,,,,,1. indians gather as siege intensifies; armed ...,canada s federal government has agreed to acqu...,accomodation
4,201990005,Canada,20,1990,North America,1,5,14.0,8.0,1990.0,...,crowd dispersal,arrests,accomodation,,,,,1. dozens hurt in mohawk blockade protest the ...,protests were directed against the state due t...,"crowd dispersal,arrests,accomodation"


In [34]:
# Show values' type
type(protests_df['all_responses'].iloc[0])  #.apply(lambda x: len(x))

str

In [35]:
protests_df['all_responses'].value_counts()[:10]

ignore                          7989
crowd dispersal                 2453
accomodation                     885
arrests                          633
crowd dispersal,arrests          554
arrests,crowd dispersal          262
crowd dispersal,accomodation     169
ignore,accomodation              141
crowd dispersal,beatings         138
beatings                         125
Name: all_responses, dtype: int64

In [36]:
protests_df[['all_responses']].head(8)

Unnamed: 0,all_responses
0,ignore
1,ignore
2,ignore
3,accomodation
4,"crowd dispersal,arrests,accomodation"
5,"crowd dispersal,shootings"
6,ignore
7,ignore


In [37]:
#protests_df['responses_list'] = protests_df['all_responses'].apply(lambda x: x.split(','))


In [38]:
#protests_df['responses_list'].apply(lambda x: len(x)).sort_values()

### Binarize state responses: violent vs. non-violent
Violent responses: beating, shooting, killing  
Non-violent responses: accommodation, ignore, crowd dispersal, arrests

In [39]:
def violent_response(string):
    keywords = ['beating', 'shooting', 'killing']
    if any(keyword in string for keyword in keywords):
        return 1
    else:
        return 0

# Add a new column 'binarization' to the dataframe by applying the function to the 'strings' column
protests_df['state_violence'] = protests_df['all_responses'].apply(violent_response)

protests_df[['state_violence']].head()

Unnamed: 0,state_violence
0,0
1,0
2,0
3,0
4,0


In [40]:
# Show state_violence values
protests_df['state_violence'].value_counts()

0    13310
1     1898
Name: state_violence, dtype: int64

###### State violence was used in 1,898 of 15,208 protests (12.48%).

#### Checking the statistics of binarization (violence/no-violence) vs differet vatiables (years, participants category, region, ect.)

In [41]:
violence_by_country = protests_df.groupby(['country']).state_violence.describe()
violence_by_country.head(3)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Afghanistan,39.0,0.128205,0.338688,0.0,0.0,0.0,0.0,1.0
Albania,78.0,0.192308,0.396664,0.0,0.0,0.0,0.0,1.0
Algeria,105.0,0.095238,0.294951,0.0,0.0,0.0,0.0,1.0


In [42]:
violence_by_country_sorted = violence_by_country.sort_values(by=['mean'], ascending=False)
violence_by_country_sorted

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
South Sudan,1.0,1.000000,,1.0,1.00,1.0,1.00,1.0
Djibouti,10.0,0.700000,0.483046,0.0,0.25,1.0,1.00,1.0
Serbia and Montenegro,2.0,0.500000,0.707107,0.0,0.25,0.5,0.75,1.0
Bhutan,2.0,0.500000,0.707107,0.0,0.25,0.5,0.75,1.0
Guinea,108.0,0.490741,0.502245,0.0,0.00,0.0,1.00,1.0
...,...,...,...,...,...,...,...,...
Qatar,1.0,0.000000,,0.0,0.00,0.0,0.00,0.0
Suriname,51.0,0.000000,0.000000,0.0,0.00,0.0,0.00,0.0
Croatia,26.0,0.000000,0.000000,0.0,0.00,0.0,0.00,0.0
Singapore,15.0,0.000000,0.000000,0.0,0.00,0.0,0.00,0.0


In [43]:
protests_df.groupby(['participants_category']).state_violence.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
participants_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100-999,3200.0,0.1175,0.322065,0.0,0.0,0.0,0.0,1.0
1000-1999,483.0,0.115942,0.320487,0.0,0.0,0.0,0.0,1.0
2000-4999,1580.0,0.111392,0.314717,0.0,0.0,0.0,0.0,1.0
50-99,2506.0,0.103751,0.304998,0.0,0.0,0.0,0.0,1.0
5000-10000,640.0,0.103125,0.30436,0.0,0.0,0.0,0.0,1.0
>10000,1468.0,0.075613,0.264468,0.0,0.0,0.0,0.0,1.0


In [44]:
protests_df.groupby(['protesterviolence']).state_violence.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
protesterviolence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,11181.0,0.056256,0.230426,0.0,0.0,0.0,0.0,1.0
1.0,4027.0,0.315123,0.464623,0.0,0.0,0.0,1.0,1.0


In [45]:
protests_df.groupby(['region']).state_violence.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Africa,3177.0,0.203966,0.403007,0.0,0.0,0.0,0.0,1.0
Asia,3118.0,0.156831,0.3637,0.0,0.0,0.0,0.0,1.0
Central America,448.0,0.151786,0.359214,0.0,0.0,0.0,0.0,1.0
Europe,4991.0,0.03867,0.192826,0.0,0.0,0.0,0.0,1.0
MENA,1257.0,0.244232,0.429802,0.0,0.0,0.0,0.0,1.0
North America,527.0,0.113852,0.317933,0.0,0.0,0.0,0.0,1.0
Oceania,38.0,0.078947,0.273276,0.0,0.0,0.0,0.0,1.0
South America,1652.0,0.078692,0.26934,0.0,0.0,0.0,0.0,1.0


In [46]:
protests_df.groupby(['region']).state_violence.agg(['mean', 'std'])

Unnamed: 0_level_0,mean,std
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,0.203966,0.403007
Asia,0.156831,0.3637
Central America,0.151786,0.359214
Europe,0.03867,0.192826
MENA,0.244232,0.429802
North America,0.113852,0.317933
Oceania,0.078947,0.273276
South America,0.078692,0.26934


In [47]:
protests_df.groupby(['year']).state_violence.describe().head(3)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1990,569.0,0.219684,0.414397,0.0,0.0,0.0,0.0,1.0
1991,371.0,0.199461,0.400135,0.0,0.0,0.0,0.0,1.0
1992,409.0,0.173594,0.379224,0.0,0.0,0.0,0.0,1.0


In [48]:
protests_df.describe()

Unnamed: 0,id,ccode,year,protest,protestnumber,startday,startmonth,startyear,endday,endmonth,endyear,protesterviolence,state_violence
count,15208.0,15208.0,15208.0,15208.0,15208.0,15208.0,15208.0,15208.0,15208.0,15208.0,15208.0,15208.0,15208.0
mean,4315554000.0,431.354813,2006.334561,1.0,8.336731,15.45384,6.225671,2006.334561,15.578248,6.241255,2006.337322,0.264795,0.124803
std,2342708000.0,234.270748,8.958449,0.0,12.271631,8.816169,3.461818,8.958449,8.801587,3.461707,8.959696,0.441238,0.330506
min,201990000.0,20.0,1990.0,1.0,1.0,1.0,1.0,1990.0,1.0,1.0,1990.0,0.0,0.0
25%,2201993000.0,220.0,1999.0,1.0,2.0,8.0,3.0,1999.0,8.0,3.0,1999.0,0.0,0.0
50%,3852000000.0,385.0,2007.0,1.0,4.0,15.0,6.0,2007.0,16.0,6.0,2007.0,0.0,0.0
75%,6452015000.0,645.0,2014.0,1.0,10.0,23.0,9.0,2014.0,23.0,9.0,2014.0,1.0,0.0
max,9102017000.0,910.0,2020.0,1.0,143.0,31.0,12.0,2020.0,31.0,12.0,2020.0,1.0,1.0


In [50]:
# Creating a new data frame consisted of year, region, country, participants category, protester violence and state violence

stateviolence_df = protests_df[['year', 'region', 'country', 'participants_category', 'protesterviolence', 'state_violence']]
stateviolence_df

Unnamed: 0,year,region,country,participants_category,protesterviolence,state_violence
0,1990,North America,Canada,,0.0,0
1,1990,North America,Canada,,0.0,0
2,1990,North America,Canada,,0.0,0
3,1990,North America,Canada,,1.0,0
4,1990,North America,Canada,,1.0,0
...,...,...,...,...,...,...
17136,2014,Oceania,Papua New Guinea,100-999,1.0,1
17138,2016,Oceania,Papua New Guinea,1000-1999,1.0,1
17139,2017,Oceania,Papua New Guinea,50-99,0.0,0
17140,2017,Oceania,Papua New Guinea,50-99,1.0,0


In [51]:
stateviolence_df.isnull().sum()

year                        0
region                      0
country                     0
participants_category    5331
protesterviolence           0
state_violence              0
dtype: int64

In [52]:
#df.to_csv('example.csv', index=False)
stateviolence_df.to_csv('./data/stateviolence.csv', index=False)