# Cleaning up Portland('s data)

## Objectives:
- Find and remove records with null values
- Harmonize multiple variations of the same offense type
- Display a filtered view of a DataFrame

### Import Dependencies

In [1]:
import pandas as pd

### Load the data file

In [2]:
# Reference the file where the CSV is located
crime_csv_path = "Resources/crime_incident_data2017.csv"

# Import the data into a Pandas DataFrame
crime_df = pd.read_csv(crime_csv_path)
crime_df.head()

Unnamed: 0,Address,Case Number,Crime Against,Neighborhood,Number of Records,Occur Date,Occur Month Year,Occur Time,Offense Category,Offense Count,Offense Type,Open Data Lat,Open Data Lon,Open Data X,Open Data Y,Report Date,Report Month Year
0,,17-X4762181,Person,,1,1/1/96,1/1/96,800,Sex Offenses,1,Rape,,,,,1/26/17,1/1/17
1,,17-X4757824,Property,Centennial,1,1/20/00,1/1/00,1615,Fraud Offenses,1,Identity Theft,,,,,1/20/17,1/1/17
2,200 BLOCK OF SE 78TH AVE,17-900367,Property,Montavilla,1,12/1/03,12/1/03,800,Fraud Offenses,1,False Pretenses/Swindle/Confidence Game,45.5207,-122.583,7668150.0,682825.0,1/9/17,1/1/17
3,,17-X4748982,Property,Southwest Hills,1,1/1/10,1/1/10,0,Fraud Offenses,1,Identity Theft,,,,,1/5/17,1/1/17
4,,17-X4748982,Property,Southwest Hills,1,1/1/10,1/1/10,0,Larceny Offenses,1,All Other Larceny,,,,,1/5/17,1/1/17


### Look for missing values

In [3]:
crime_df.count()

Address              37365
Case Number          41032
Crime Against        41032
Neighborhood         39712
Number of Records    41032
Occur Date           41032
Occur Month Year     41032
Occur Time           41032
Offense Category     41032
Offense Count        41032
Offense Type         41032
Open Data Lat        36712
Open Data Lon        36712
Open Data X          36712
Open Data Y          36712
Report Date          41032
Report Month Year    41032
dtype: int64

### Drop null rows

In [4]:
no_null_crime_df = crime_df.dropna(how='any')

### Verify counts

In [5]:
no_null_crime_df.count()

Address              36146
Case Number          36146
Crime Against        36146
Neighborhood         36146
Number of Records    36146
Occur Date           36146
Occur Month Year     36146
Occur Time           36146
Offense Category     36146
Offense Count        36146
Offense Type         36146
Open Data Lat        36146
Open Data Lon        36146
Open Data X          36146
Open Data Y          36146
Report Date          36146
Report Month Year    36146
dtype: int64

### Check to see if there are any values with mispelled or similar values in "Offense Type"

In [6]:
no_null_crime_df["Offense Type"].value_counts()

Theft From Motor Vehicle                       6947
Motor Vehicle Theft                            4689
All Other Larceny                              4558
Vandalism                                      3863
Burglary                                       2824
Shoplifting                                    2259
Identity Theft                                 1794
Simple Assault                                 1216
Drug/Narcotic Violations                       1095
Theft of Motor Vehicle Parts or Accessories    1073
Intimidation                                    900
Theft From Building                             895
False Pretenses/Swindle/Confidence Game         870
Aggravated Assault                              839
Robbery                                         608
Counterfeiting/Forgery                          448
Weapons Law Violations                          266
Credit Card/ATM Fraud                           226
Arson                                           200
Prostitution

### Combining similar offenses together

In [7]:
no_null_crime_df["Offense Type"] = no_null_crime_df["Offense Type"].replace({"Commercial Sex Acts": "Prostitution", "Assisting or Promoting Prostitution": "Prostitution"})
no_null_crime_df.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Address,Case Number,Crime Against,Neighborhood,Number of Records,Occur Date,Occur Month Year,Occur Time,Offense Category,Offense Count,Offense Type,Open Data Lat,Open Data Lon,Open Data X,Open Data Y,Report Date,Report Month Year
2,200 BLOCK OF SE 78TH AVE,17-900367,Property,Montavilla,1,12/1/03,12/1/03,800,Fraud Offenses,1,False Pretenses/Swindle/Confidence Game,45.5207,-122.583,7668150.0,682825.0,1/9/17,1/1/17
5,5400 BLOCK OF NE MALLORY AVE,17-900129,Property,King,1,11/28/10,11/1/10,1612,Fraud Offenses,1,Identity Theft,45.5625,-122.664,7647987.0,698581.0,1/3/17,1/1/17
6,5000 BLOCK OF NE 19TH AVE,17-901079,Property,Vernon,1,11/8/13,11/1/13,1200,Fraud Offenses,1,False Pretenses/Swindle/Confidence Game,45.5594,-122.646,7652567.0,697337.0,1/26/17,1/1/17
7,5000 BLOCK OF NE 19TH AVE,17-901079,Property,Vernon,1,11/8/13,11/1/13,1200,Fraud Offenses,1,Identity Theft,45.5594,-122.646,7652567.0,697337.0,1/26/17,1/1/17
8,12000 BLOCK OF SE PINE ST,17-900253,Property,Hazelwood,1,1/6/14,1/1/14,805,Fraud Offenses,1,Credit Card/ATM Fraud,45.5204,-122.539,7679522.0,682404.0,1/6/17,1/1/17
9,12000 BLOCK OF SE PINE ST,17-900253,Property,Hazelwood,1,1/6/14,1/1/14,805,Fraud Offenses,1,Identity Theft,45.5204,-122.539,7679522.0,682404.0,1/6/17,1/1/17
10,12200 BLOCK OF N JANTZEN DR,17-11435,Property,Hayden Island,1,3/6/14,3/1/14,900,Fraud Offenses,1,Identity Theft,45.6116,-122.676,7645287.0,716588.0,1/12/17,1/1/17
11,1600 BLOCK OF NE 148TH PL,17-901114,Property,Wilkes,1,5/1/14,5/1/14,1120,Fraud Offenses,1,Identity Theft,45.5352,-122.508,7687592.0,687631.0,1/27/17,1/1/17
12,12800 BLOCK OF NE AIRPORT WAY,17-30672,Property,Argay,1,10/16/14,10/1/14,0,Embezzlement,1,Embezzlement,45.5639,-122.53,7682294.0,698212.0,1/31/17,1/1/17
15,800 BLOCK OF N WEBSTER ST,17-901179,Property,Humboldt,1,4/28/15,4/1/15,1620,Fraud Offenses,1,Identity Theft,45.5598,-122.676,7644953.0,697679.0,1/28/17,1/1/17


### Create a new DataFrame that looks into a specific neighborhood

In [None]:
vernon_crime_df = no_null_crime_df.loc[no_null_crime_df["Neighborhood"] == "Vernon"]
vernon_crime_df.head(20)

In [8]:
no_null_crime_df["Neighborhood"] == "Vernon"

2        False
5        False
6         True
7         True
8        False
9        False
10       False
11       False
12       False
15       False
16       False
17       False
18       False
19       False
20       False
21       False
22       False
23       False
24       False
25       False
26       False
27       False
30       False
31       False
32       False
33       False
34       False
35       False
36       False
37       False
         ...  
41001    False
41002    False
41003    False
41004    False
41005    False
41006    False
41007    False
41008    False
41009    False
41010    False
41011    False
41012    False
41013    False
41014    False
41015    False
41016    False
41017    False
41018    False
41019    False
41020    False
41021    False
41022    False
41023    False
41024    False
41025    False
41026    False
41027    False
41028    False
41029    False
41031    False
Name: Neighborhood, Length: 36146, dtype: bool