# Crimes in Europe and Correlation with Social Conditions

## 1. Libraries & Utility Functions

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from scipy import signal
from functools import reduce 
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import unittest

In [2]:
# Constants
COUNTRIES = ['Belgium',
 'Czechia',
 'France',
 'Germany (until 1990 former territory of the FRG)',
 'Greece',
 'Hungary',
 'Italy',
 'Netherlands',
 'Poland',
 'Portugal',
 'Romania',
 'Spain',
 'Sweden',
 'United Kingdom']

CRIME_TYPOLOGIES = [
    'Intentional homicide',
    'Robbery',
    'Theft of a motorized land vehicle',
    'Unlawful acts involving controlled drugs or precursors']

MAX_P_VALUE = 0.05

In [12]:
type({}) == int or float

float

In [25]:
def normalize_value(x, min, max):
    
    if type(x) != int and type(x) != float:
        return None
    if type(min) != int and type(x) != float:
        return None
    if type(max) != int and type(x) != float:
        return None
    if max == min:
        return None
    if x < min:
        return None
    if x > max:
        return None
    
    return (x - min) / (max - min)


def normalize_list(l):
    min = np.min(l)
    max = np.max(l)
    
    return [normalize_value(x,min,max) for x in l]

def get_best_lag(v1,v2,mode):
    corr = signal.correlate(v1, v2, mode=mode)
    lags = signal.correlation_lags(len(v1), len(v2), mode=mode)
    corr /= np.max(corr)
    return lags[np.argmax(np.abs(corr))]

def shift_array(v1, v2, shift):
    if shift == 0:
        return v1, v2
    
    if shift < 0:
        return v1[:best_lag], v2[-best_lag:]
    
    if shift > 0:
        return v1[best_lag:], v2[:-best_lag]
    
def filter_p_value(d, cut_off, lag):
    p_value_col = 'p_value_lag' if lag else 'p_value'
    return [x for x in d if x[p_value_col] <= cut_off]

In [43]:
type([])

list

In [45]:
# Unit test
class TestNormalizeValue(unittest.TestCase):
    
    def test_returnType(self):
        result = normalize_value(1,0,2)
        self.assertTrue((type(result) == int) or (type(result) == float))
    
    def test_values(self):
        result = normalize_value(1,1,2)
        self.assertEqual(result, 0)
        result = normalize_value(2,1,2)
        self.assertEqual(result, 1)
        result = normalize_value(1,0,2)
        self.assertEqual(result, 0.5)
    
    
    def test_notNumber(self):
        result = normalize_value('a',{},21)
        self.assertEqual(result, None)
        
        result = normalize_value(False, False, [1,2,3])
        self.assertEqual(result, None)

    def test_minEqualsMax(self):
        result = normalize_value(0,2,2)
        self.assertEqual(result, None)
    
    def test_xGreaterThanMax(self):
        result = normalize_value(4,2,3)
        self.assertEqual(result, None)
    
    def test_xLessThanMin(self):
        result = normalize_value(1,2,3)
        self.assertEqual(result, None)
        
class TestNormalizeList(unittest.TestCase):
    def setUp(self):
        self.standard_input= [1,2,3]
        self.standard_output = [0, 0.5, 1]
        
    def test_return(self):
        result = normalize_list(self.standard_input)
        self.assertTrue(type(result) == list)
    
    def test_value(self):
        result = normalize_list(self.standard_input)
        self.assertEqual(result, self.standard_output)
    
        
if __name__ == '__main__':
    unittest.main(argv=['ignored', '-v'], exit=False)


test_return (__main__.TestNormalizeList) ... ok
test_value (__main__.TestNormalizeList) ... FAIL
test_minEqualsMax (__main__.TestNormalizeValue) ... ok
test_notNumber (__main__.TestNormalizeValue) ... ok
test_returnType (__main__.TestNormalizeValue) ... ok
test_values (__main__.TestNormalizeValue) ... ok
test_xGreaterThanMax (__main__.TestNormalizeValue) ... ok
test_xLessThanMin (__main__.TestNormalizeValue) ... ok

FAIL: test_value (__main__.TestNormalizeList)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-45-fbcdbf1881b4>", line 47, in test_value
    self.assertEqual(result, self.standard_output)
AssertionError: Lists differ: [None, None, None] != [0, 0.5, 1]

First differing element 0:
None
0

- [None, None, None]
+ [0, 0.5, 1]

----------------------------------------------------------------------
Ran 8 tests in 0.007s

FAILED (failures=1)


## 2. Dataset & Normalization

In [5]:
raw_data = pd.read_csv("dataset_output/rep_crimes.csv")

raw_data.drop('Unnamed: 0', axis=1, inplace=True)

raw_data.head()


Unnamed: 0,Country,Crime Typology,Crimes Reported,Year,Pop on 1st Jan,Crimes per 100k,Unemp Perc,Early Leavers Perc
0,Belgium,Intentional homicide,216,2000,10239085.0,2.109564,7.0,13.8
1,Belgium,Intentional homicide,282,2001,10263414.0,2.747624,6.6,13.8
2,Belgium,Intentional homicide,320,2002,10309725.0,3.103866,7.5,14.1
3,Belgium,Intentional homicide,229,2003,10355844.0,2.211312,8.2,14.3
4,Belgium,Intentional homicide,267,2004,10396421.0,2.568191,8.4,13.1


In [6]:
data_t1 = raw_data.copy()
norm_cols = data_t1.groupby(['Country','Crime Typology'])['Crimes per 100k','Unemp Perc','Early Leavers Perc'].transform(normalize_list)

data_t1['Crimes per 100k'] = norm_cols['Crimes per 100k']
data_t1['Unemp Perc'] = norm_cols['Unemp Perc']
data_t1['Early Leavers Perc'] = norm_cols['Early Leavers Perc']

data_t1

  norm_cols = data_t1.groupby(['Country','Crime Typology'])['Crimes per 100k','Unemp Perc','Early Leavers Perc'].transform(normalize_list)


AttributeError: 'float' object has no attribute 'isnumeric'

## 3. Criminality vs Unemployment Rates

In [None]:
g = sns.lmplot(
    data=data_t1, 
    x='Unemp Perc', 
    y='Crimes per 100k', 
    hue='Crime Typology',
    col='Country', 
    col_wrap=4, 
    legend=False)

g.set(ylim=(0, 1.2))
plt.legend(bbox_to_anchor=(2, 5))

In [None]:
data_t2 = []
for country in COUNTRIES:
    for crime_type in CRIME_TYPOLOGIES:
        df_split = data_t1[(data_t1['Country'] == country) & (data_t1['Crime Typology'] == crime_type)]
        data = {
            'country': country,
            'crime_type': crime_type,
            'crime_values': list(df_split['Crimes per 100k']),
            'unemp_values': list(df_split['Unemp Perc'])
        }
        data_t2.append(data)
        
data_t2

In [None]:
for i,d in enumerate(data_t2):
    corr_idx, p_value = stats.pearsonr(d['crime_values'], d['unemp_values'])
    data_t2[i]['pearson'] = corr_idx
    data_t2[i]['p_value'] = p_value
    
data_t2

In [None]:
data_t3 = filter_p_value(data_t2, MAX_P_VALUE, False)

data_t4 = pd.DataFrame(data_t3)
data_t4.sort_values(['pearson'], ascending=False).reset_index()

## 4. Criminality vs Early Leavers from School Rates

In [None]:
g = sns.lmplot(
    data=data_t1, 
    x='Early Leavers Perc', 
    y='Crimes per 100k', 
    hue='Crime Typology',
    col='Country', 
    col_wrap=4, 
    legend=False)

g.set(ylim=(0, 1.2))
plt.legend(bbox_to_anchor=(2, 5))

In [None]:
data_t5 = []
for country in COUNTRIES:
    for crime_type in CRIME_TYPOLOGIES:
        df_split = data_t1[(data_t1['Country'] == country) & (data_t1['Crime Typology'] == crime_type)]
        data = {
            'country': country,
            'crime_type': crime_type,
            'crime_values': list(df_split['Crimes per 100k']),
            'early_leavers_values':list(df_split['Early Leavers Perc'])
        }
        data_t5.append(data)
        
data_t5

In [None]:
for i,d in enumerate(data_t5):
    corr_idx, p_value = stats.pearsonr(d['crime_values'], d['early_leavers_values'])
    data_t5[i]['pearson'] = corr_idx
    data_t5[i]['p_value'] = p_value
data_t5

## 4.1 Looking for lag in causation

In [None]:
data_t6 = filter_p_value(data_t5, MAX_P_VALUE, False)

data_t7 = pd.DataFrame(data_t6)
data_t7.sort_values(['pearson'], ascending=False).reset_index()[['country','crime_type','pearson','p_value']]

In [None]:
for i,d in enumerate(data_t5):    
    best_lag = get_best_lag(d['crime_values'],d['early_leavers_values'],'same')
    data_t5[i]['best_lag'] = best_lag
data_t5

In [None]:
for i,d in enumerate(data_t5):
    v1, v2 = shift_array(d['crime_values'], d['early_leavers_values'], d['best_lag'])
    
    corr_lag,p_value_lag = stats.pearsonr(v1, v2)
    data_t5[i]['pearson_lag'] = corr_lag
    data_t5[i]['p_value_lag'] = p_value_lag
    
data_t5

In [None]:
data_t8 = filter_p_value(data_t5, MAX_P_VALUE, True)
data_t9 = pd.DataFrame(data_t8)
data_t9.sort_values(['pearson'], ascending=False).reset_index()[['country','crime_type','pearson','p_value','pearson_lag','p_value_lag','best_lag']]

## Criminality Prediction

In [None]:
italy_crim_data = pd.read_csv("italy_next_decade/estat_crim_off_cat.tsv", sep="\t")
italy_unemp_data = pd.read_csv("italy_next_decade/estat_une_rt_a.tsv", sep="\t")
italy_early_data = pd.read_csv("italy_next_decade/estat_edat_lfse_14.tsv", sep="\t")

italy_crim_data = list(italy_crim_data.drop('2010 ', axis=1).values[0][1:])
italy_unemp_data = list(italy_unemp_data.drop('2020 ', axis=1).values[0][1:])
italy_early_data = list(italy_early_data.drop('2020 ', axis=1).values[0][1:])
italy_early_data = [15.0 if x=='15.0 b' else x for x in italy_early_data]


#y_test = list(italy_crim_data.values[0][1:])

#list(zip(list(italy_unemp_data.values[0][1:]),list(italy_early_data.values[0][1:])))
y_test = italy_crim_data
X_test = [list(x) for x in zip(italy_unemp_data, italy_early_data)]


In [None]:
italy_data = raw_data[(raw_data['Country'] == 'Italy') & (raw_data['Crime Typology'] == 'Intentional homicide')]

y_train = list(italy_data['Crimes Reported'].values)
X_train = [list(x) for x in zip(italy_data['Unemp Perc'].values, italy_data['Early Leavers Perc'].values)]

print(X_train)
print(X_test)

print(y_train)
print(y_test)

In [None]:
y = italy_crim_data + list(italy_data['Crimes Reported'].values)
X = [list(x) for x in zip(italy_unemp_data, italy_early_data)] + [list(x) for x in zip(italy_data['Unemp Perc'].values, italy_data['Early Leavers Perc'].values)]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)


In [None]:
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

crim_predictions = regr.predict(X_test)
print(crim_predictions)
print(y_test)

score=r2_score(y_test,crim_predictions)
score