In [1]:
import glob
import os
import re
from io import StringIO
from urllib.request import urlopen

import html5lib
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopandas import GeoDataFrame, GeoSeries

In [2]:
# crime data
def combine_csv(path, filename):
    """Read every CSV matching a glob pattern under ``path`` into one DataFrame.

    Parameters
    ----------
    path : str
        Directory the pattern is resolved against.
    filename : str
        Glob pattern relative to ``path``. ``**`` matches any number of
        nested directories (the search is recursive).

    Returns
    -------
    pandas.DataFrame
        Rows of all matching files stacked in sorted-path order, with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file matches the pattern.
    """
    pattern = os.path.join(path, filename)

    # recursive=True is what lets "**" span directory levels; without it glob
    # treats "**" like a single "*". Sorting makes the row order reproducible,
    # because glob returns paths in arbitrary order.
    file_list = sorted(glob.glob(pattern, recursive=True))
    if not file_list:
        raise ValueError(f"No files match pattern: {pattern!r}")

    frames = [pd.read_csv(csv_path) for csv_path in file_list]
    return pd.concat(frames, ignore_index=True)

# shapefile for policing areas
def combine_geo(path, filename):
    """Read every geo file matching a glob pattern under ``path`` into one frame.

    Parameters
    ----------
    path : str
        Directory the pattern is resolved against.
    filename : str
        Glob pattern relative to ``path``. ``**`` matches any number of
        nested directories (the search is recursive).

    Returns
    -------
    The result of ``pd.concat`` over one ``GeoDataFrame.from_file`` result per
    matching file, in sorted-path order, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file matches the pattern.
    """
    pattern = os.path.join(path, filename)

    # recursive=True is what lets "**" span directory levels; without it glob
    # treats "**" like a single "*". Sorting makes the row order reproducible,
    # because glob returns paths in arbitrary order.
    file_list = sorted(glob.glob(pattern, recursive=True))
    if not file_list:
        raise ValueError(f"No files match pattern: {pattern!r}")

    frames = [GeoDataFrame.from_file(geo_path) for geo_path in file_list]
    return pd.concat(frames, ignore_index=True)
    
# function for web scraping
def webscrap_data(str_url, str_class, timeout=30):
    """Fetch a page and return the first HTML table inside a given element.

    Parameters
    ----------
    str_url : str
        URL of the page to download.
    str_class : str
        CSS class used to locate the element that holds the table
        (passed to ``BeautifulSoup.find(class_=...)``).
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        The first table parsed from the located element.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    ValueError
        If no element with ``str_class`` exists, or it contains no table.
    """
    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(str_url, timeout=timeout)
    # Fail on error pages here instead of trying to parse them as data.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'lxml')
    table = soup.find(class_=str_class)
    if table is None:
        raise ValueError(
            f"No element with class {str_class!r} found at {str_url}"
        )

    # read_html expects a file-like object; passing literal HTML is deprecated.
    return pd.read_html(StringIO(str(table)))[0]

In [3]:
# Root folder holding the raw downloads. Defaults to the original location;
# set the PREDICTIVE_POLICING_DIR environment variable to run the notebook
# on another machine without editing this cell.
path = os.environ.get("PREDICTIVE_POLICING_DIR", "/Users/jlee/Predictive_Policing")

# Glob patterns relative to `path`:
#   [0] crime CSVs, [1] shapefile for the policing areas
file_name = ["WestMidlands/**/*street.csv", "WMP_areas/**/england_lsoa_2011.shx"]

wm_crime = combine_csv(path, file_name[0])
wmp_areas = combine_geo(path, file_name[1])

In [4]:
# Web scraping weather for WMP areas, 2017-11 to 2020-10 (36 months).
# `url` is a template; month and year are filled in per request.
url = "https://www.timeanddate.com/weather/uk/birmingham/historic?month={month}&year={year}"
str_class = 'eight columns'

# date index
# Single source of truth for the months covered: both the request URLs and
# the DataFrame index are derived from it, so they cannot fall out of step.
dti = pd.Series(pd.period_range('2017-11-01', freq='M', periods=36))

# collecting urls (one per month, in chronological order)
urls = [url.format(month=period.month, year=period.year) for period in dti]

# collecting avg temp
monthly_tables = [webscrap_data(month_url, str_class) for month_url in urls]
# Row 2 of each table is read as the average row -- TODO confirm against the
# page layout. '\xa0' (non-breaking space) is stripped from the scraped text.
avg_temp = [
    table.iloc[2]['Temperature'].replace('\xa0', '')
    for table in monthly_tables
]

avg_temp = pd.DataFrame(avg_temp, index = dti, columns = ['Avg_temp'])

In [5]:
# Median property price data (sheet 'Data'; column names are on row index 5)
med_house = pd.read_excel('med_house_lsoa.xls', sheet_name='Data', header=5)

# Keep the relevant columns only: everything from position 4 up to (not
# including) the 13th-from-last column is dropped, as are the last three.
all_columns = med_house.columns
cols_drop = all_columns[4:-13].append(all_columns[-3:])
med_house = med_house.drop(columns=cols_drop)

In [6]:
# Get the relevant locations: keep only rows whose LSOA name mentions one of
# the WMP districts, and discard every other location.
district_list = ["Birmingham", "Coventry", "Dudley", "Sandwell", "Solihull", "Walsall", "Wolverhampton"]

# One vectorised substring test instead of a per-row regex loop. District
# names are escaped so they are matched literally, and rows with a missing
# name (NaN) count as "not WMP" instead of raising.
wmp_pattern = '|'.join(re.escape(district) for district in district_list)
is_wmp = med_house['LSOA name'].str.contains(wmp_pattern, regex=True, na=False)

# Reassign rather than drop in place; the original row labels are preserved.
med_house = med_house[is_wmp].copy()

In [7]:
# Universal credit data (sheet 'Data Sheet 0'; column names are on row index 9)
uni_credit = pd.read_excel('people_uc.xlsx', sheet_name='Data Sheet 0', header=9)

# drop cols
uni_credit = uni_credit.drop(columns='Month')

# drop rows: positions 1681-1692 plus the first row
# (presumably non-data header/footer rows of the export -- TODO confirm)
idx_list = list(range(1681, 1693)) + [0]
rows_to_drop = uni_credit.index[idx_list]
uni_credit = uni_credit.drop(rows_to_drop)

uni_credit = uni_credit.rename(columns={'Unnamed: 1': 'LSOA'})

In [8]:
# Write every prepared dataset to CSV in the working directory.
# Each entry: (frame, output file, whether the index is written as a column).
csv_exports = [
    (wm_crime, 'wm_crime.csv', False),
    (wmp_areas, 'wmp_areas.csv', False),
    (avg_temp, 'avg_temp.csv', True),    # index holds the month
    (med_house, 'med_house.csv', True),
    (uni_credit, 'uni_credit.csv', False),
]

for frame, csv_name, write_index in csv_exports:
    frame.to_csv(csv_name, index=write_index)