# Cleaning the Raw Data

## This Notebook is to clean the parks and crime data. Both crime datasets will be cleansed simultaneously.

In [1]:
import os
import pandas as pd
import re

#### Cleaning the parks file (1 of 3)

In [2]:
parks= os.path.join("../Resources/Raw/city-of-austin-parks-1.csv")

In [3]:
parks_df= pd.read_csv(parks)

parks_df = parks_df[["PARK_ID","PARK_ACRES","SHAPE_AREA","SHAPE_LEN","ZIP_CODE"]]
parks_df.head()

Unnamed: 0,PARK_ID,PARK_ACRES,SHAPE_AREA,SHAPE_LEN,ZIP_CODE
0,215,1.152581,50206.24,912.073535,78731
1,313,4.943708,215347.0,2456.140724,78703
2,371,77.390907,3371134.0,9262.547633,78746
3,289,2.248317,97936.3,1665.906556,78703
4,315,91.228997,3973919.0,35791.080093,78745


In [4]:
zip_parks_df = parks_df.groupby(["ZIP_CODE"])["PARK_ID"].count().reset_index()
zip_parks_df= zip_parks_df.rename(columns={"ZIP_CODE": "Zip Code","PARK_ID": "Number of Parks"})

zip_parks_df.head()

Unnamed: 0,Zip Code,Number of Parks
0,78613,1
1,78617,5
2,78652,2
3,78660,1
4,78664,1


In [5]:
zip_parks_df.to_csv("../Resources/CLean/parks_by_zip.csv")

#### Cleaning Crime Data

In [6]:
crime_2014= os.path.join("../Resources/Raw/Annual_Crime_2014.csv")
crime_2016= os.path.join("../Resources/Raw/2016_Annual_Crime_Data.csv.")

In [7]:
crime_2014_df = pd.read_csv(crime_2014)
crime_2016_df = pd.read_csv(crime_2016)

In [8]:
crime_2014_df = crime_2014_df[["GO Location Zip","GO Highest Offense Desc","Highest NIBRS/UCR Offense Description"]]
crime_2014_df= crime_2014_df.rename(columns={"GO Location Zip": "Zip Code", "Highest NIBRS/UCR Offense Description":"UCR Description"})

In [9]:
crime_2014_df["UCR Description"].unique()

array(['Robbery', 'Burglary / \nBreaking & Entering', 'Auto Theft',
       'Homicide: Murder & Nonnegligent Manslaughter', 'Rape',
       'Aggravated Assault', 'Theft: Shoplifting',
       'Theft: Pocket Picking', 'Theft: Purse Snatching',
       'Theft: from Building', 'Theft: Coin Op Machine', 'Theft: BOV',
       'Theft: Auto Parts', 'Theft: All Other Larceny'], dtype=object)

In [10]:
crime_2016_df= crime_2016_df[["GO Location Zip","GO Highest Offense Desc","Highest NIBRS/UCR Offense Description"]]
crime_2016_df= crime_2016_df.rename(columns={"GO Location Zip": "Zip Code", "Highest NIBRS/UCR Offense Description":"UCR Description"})

In [11]:
crime_2016_df["UCR Description"].unique()

array(['Agg Assault', 'Theft', 'Robbery', 'Rape', 'Burglary',
       'Auto Theft', 'Murder'], dtype=object)

#### Write a function that will make both 2016 & 2014's Crime Type columns to match

In [12]:
def cleanNIBRS(offense):
    offenses = ['Agg Assault', 'Theft', 'Robbery', 'Rape', 'Burglary',
       'Auto Theft', 'Murder']
    for off in offenses:
        if re.search(off, offense):
            return off
        elif offense == "Aggravated Assault":
            return 'Agg Assault'
        else:
            continue

In [13]:
crime_2014_df["UCR Description"] = crime_2014_df["UCR Description"].apply(lambda row: cleanNIBRS(row))

In [14]:
crime_2014_df["UCR Description"].unique()

array(['Robbery', 'Burglary', 'Theft', 'Murder', 'Rape', 'Agg Assault'],
      dtype=object)

In [15]:
crime_2014_df.head()

Unnamed: 0,Zip Code,GO Highest Offense Desc,UCR Description
0,78753,AGG ROBBERY/DEADLY WEAPON,Robbery
1,78723,ROBBERY BY ASSAULT,Robbery
2,78702,ROBBERY BY THREAT,Robbery
3,78723,AGG ROBBERY/DEADLY WEAPON,Robbery
4,78702,AGG ROBBERY/DEADLY WEAPON,Robbery


In [16]:
crime_2016_df.head()

Unnamed: 0,Zip Code,GO Highest Offense Desc,UCR Description
0,78735.0,AGG ASLT ENHANC STRANGL/SUFFOC,Agg Assault
1,78701.0,THEFT,Theft
2,78753.0,AGG ROBBERY/DEADLY WEAPON,Robbery
3,78701.0,THEFT,Theft
4,78753.0,SEXUAL ASSAULT W/ OBJECT,Rape


In [17]:
crime_2016_df.to_csv("../Resources/CLean/crime_2016_clean.csv")
crime_2014_df.to_csv("../Resources/CLean/crime_2014_clean.csv")