## Load Libraries and Data

In [1]:
import numpy as np
import pandas as pd
import sqlite3 as sql
from modules.cleaning_utils import multiple_descriptions, replace_description
from modules.sql_utils import create_table

In [2]:
arrest_df = pd.concat([
    pd.read_parquet('../data/cleaned/arrest_data_historic_cleaned_1.parquet.gz'),
    pd.read_parquet('../data/cleaned/arrest_data_historic_cleaned_2.parquet.gz')
])
complaint_df = pd.concat([
    pd.read_parquet('../data/cleaned/complaint_data_historic_cleaned_1.parquet.gz'),
    pd.read_parquet('../data/cleaned/complaint_data_historic_cleaned_2.parquet.gz')
])
shooting_df = pd.read_parquet('../data/cleaned/shooting_data_historic_cleaned.parquet.gz')
population_df = pd.read_parquet('../data/cleaned/population_data_cleaned.parquet.gz')

In [3]:
arrest_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5684630 entries, 0 to 2842314
Data columns (total 16 columns):
 #   Column             Dtype         
---  ------             -----         
 0   ARREST_KEY         int64         
 1   ARREST_DATE        datetime64[ns]
 2   PD_CD              int64         
 3   PD_DESC            category      
 4   KY_CD              int64         
 5   OFNS_DESC          category      
 6   LAW_CODE           category      
 7   LAW_CAT_CD         category      
 8   ARREST_BORO        category      
 9   ARREST_PRECINCT    int64         
 10  JURISDICTION_CODE  int64         
 11  AGE_GROUP          category      
 12  PERP_SEX           category      
 13  PERP_RACE          category      
 14  Latitude           float64       
 15  Longitude          float64       
dtypes: category(8), datetime64[ns](1), float64(2), int64(5)
memory usage: 444.6 MB


In [4]:
complaint_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8859411 entries, 0 to 4429705
Data columns (total 22 columns):
 #   Column             Dtype         
---  ------             -----         
 0   CMPLNT_NUM         int64         
 1   CMPLNT_FR_DT       datetime64[ns]
 2   CMPLNT_FR_TM       object        
 3   ADDR_PCT_CD        int64         
 4   RPT_DT             datetime64[ns]
 5   KY_CD              int64         
 6   OFNS_DESC          category      
 7   PD_CD              int64         
 8   PD_DESC            category      
 9   CRM_ATPT_CPTD_CD   category      
 10  LAW_CAT_CD         category      
 11  BORO_NM            category      
 12  JURIS_DESC         category      
 13  JURISDICTION_CODE  int64         
 14  SUSP_AGE_GROUP     category      
 15  SUSP_RACE          category      
 16  SUSP_SEX           category      
 17  Latitude           float64       
 18  Longitude          float64       
 19  VIC_AGE_GROUP      category      
 20  VIC_RACE           category  

In [5]:
shooting_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28501 entries, 0 to 28500
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   INCIDENT_KEY             28501 non-null  int64         
 1   OCCUR_DATE               28501 non-null  datetime64[ns]
 2   OCCUR_TIME               28501 non-null  object        
 3   BORO                     28501 non-null  category      
 4   PRECINCT                 28501 non-null  int64         
 5   JURISDICTION_CODE        28501 non-null  float64       
 6   STATISTICAL_MURDER_FLAG  28501 non-null  bool          
 7   PERP_AGE_GROUP           28501 non-null  category      
 8   PERP_SEX                 28501 non-null  category      
 9   PERP_RACE                28501 non-null  category      
 10  VIC_AGE_GROUP            28501 non-null  category      
 11  VIC_SEX                  28501 non-null  category      
 12  VIC_RACE                 28501 n

In [6]:
population_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Borough  6 non-null      object
 1   1950     6 non-null      int64 
 2   1960     6 non-null      int64 
 3   1970     6 non-null      int64 
 4   1980     6 non-null      int64 
 5   1990     6 non-null      int64 
 6   2000     6 non-null      int64 
 7   2010     6 non-null      int64 
 8   2020     6 non-null      int64 
 9   2030     6 non-null      int64 
 10  2040     6 non-null      int64 
dtypes: int64(10), object(1)
memory usage: 656.0+ bytes


## Identify Matching Columns and Normalize

### Offense Code and Description (KY_CD, OFNS_DESC)

From the documentation:
- KY_CD: Three digit offense classification code
- OFNS_DESC: Description of offense corresponding with key code

In [7]:
cols = ['KY_CD', 'OFNS_DESC']
offense_code_df = pd.concat([arrest_df[cols], complaint_df[cols]]).value_counts().reset_index()[cols].sort_values(cols[0])

In [8]:
# Multiple descriptions for different codes
multi_dict = multiple_descriptions(offense_code_df, cols[0], cols [1])
multi_dict

{103: ['HOMICIDE-NEGLIGENT,UNCLASSIFIE', 'HOMICIDE-NEGLIGENT,UNCLASSIFIED'],
 121: ['CRIMINAL MISCHIEF & RELATED OF',
  'CRIMINAL MISCHIEF & RELATED OFFENSES'],
 125: ['NYS LAWS-UNCLASSIFIED FELONY', 'VEHICLE AND TRAFFIC LAWS'],
 233: ['FORCIBLE TOUCHING', 'SEX CRIMES'],
 343: ['OTHER OFFENSES RELATED TO THEFT', 'THEFT OF SERVICES'],
 347: ['INTOXICATED & IMPAIRED DRIVING', 'INTOXICATED/IMPAIRED DRIVING'],
 349: ['DISRUPTION OF A RELIGIOUS SERV', 'DISRUPTION OF A RELIGIOUS SERVICE'],
 351: ['CRIMINAL MISCHIEF & RELATED OF',
  'CRIMINAL MISCHIEF & RELATED OFFENSES'],
 359: ['OFFENSES AGAINST PUBLIC ADMINI',
  'OFFENSES AGAINST PUBLIC ADMINISTRATION'],
 361: ['HARASSMENT', 'OFF. AGNST PUB ORD SENSBLTY &'],
 362: ['OFFENSES AGAINST MARRIAGE UNCL',
  'OFFENSES AGAINST MARRIAGE UNCLASSIFIED'],
 455: ['UNLAWFUL POSS. WEAP. ON SCHOOL',
  'UNLAWFUL POSS. WEAP. ON SCHOOL GROUNDS'],
 571: ['LOITERING/GAMBLING (CARDS, DIC',
  'LOITERING/GAMBLING (CARDS, DICE, ETC)'],
 577: ['UNDER THE INFLUENCE O

In [9]:
# Replace differing descriptions with normalized version, then drop duplicates
# Each row is now unique
index_map = [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0]
replace_description(offense_code_df, multi_dict, index_map, cols[0], cols[1])
offense_code_df = offense_code_df.drop_duplicates()
offense_code_df

Unnamed: 0,KY_CD,OFNS_DESC
46,101,MURDER & NON-NEGL. MANSLAUGHTER
75,102,HOMICIDE-NEGLIGENT-VEHICLE
77,103,"HOMICIDE-NEGLIGENT,UNCLASSIFIED"
40,104,RAPE
7,105,ROBBERY
...,...,...
78,685,ADMINISTRATIVE CODES
54,880,MOVING INFRACTIONS
22,881,OTHER TRAFFIC INFRACTION
79,882,PARKING OFFENSES


In [10]:
# Drop description columns from arrest and complaint data, as they can use
# KY_CD as a foreign key into the offense description table
arrest_df.drop(cols[1], axis = 1, inplace = True)
complaint_df.drop(cols[1], axis = 1, inplace = True)

### Internal Classification Code and Description (PD_CD, PD_DESC)

From the documentation:
- PD_CD: Three digit internal classification code (more granular than Key Code)
- PD_DESC: Description of internal classification corresponding with PD code (more granular than Offense Description)

In [11]:
# Multiple descriptions for different codes
cols = ['PD_CD', 'PD_DESC']
internal_code_df = pd.concat([arrest_df[cols], complaint_df[cols]]).value_counts().reset_index()[cols].sort_values(cols[0])

In [12]:
# Multiple descriptions for different codes
multi_dict = multiple_descriptions(internal_code_df, cols[0], cols [1])
multi_dict

{104: ['VEHICULAR ASSAULT (INTOX DRIVE', 'VEHICULAR ASSAULT (INTOX DRIVER)'],
 107: ['END WELFARE VULNERABLE ELDERLY PERSON',
  'ENDANGERING VULNERABLE ELDERLY'],
 112: ['MENACING 1ST DEGREE (VICT NOT',
  'MENACING 1ST DEGREE (VICT NOT PEACE OFFICER)'],
 122: ['HOMICIDE, NEGLIGENT, VEHICLE,',
  'HOMICIDE, NEGLIGENT, VEHICLE, INTOX DRIVER'],
 125: ['HOMICIDE,NEGLIGENT,UNCLASSIFIE', 'HOMICIDE,NEGLIGENT,UNCLASSIFIED'],
 178: ['FAC. SEXUAL OFFENSE W/CONTROLL',
  'FAC. SEXUAL OFFENSE W/CONTROLLED SUBSTANCE'],
 180: ['COURSE OF SEXUAL CONDUCT AGAIN',
  'COURSE OF SEXUAL CONDUCT AGAINST A CHILD'],
 201: ['TRESPASS 4,CRIMINAL', 'TRESPASS 4,CRIMINAL SUB 2'],
 244: ['BURGLARY,UNCLASSIFIED,UNKNOWN', 'BURGLARY,UNCLASSIFIED,UNKNOWN TIME'],
 248: ['RADIO DEVICES,UNLAWFUL POSSESS', 'RADIO DEVICES,UNLAWFUL POSSESSION'],
 254: ['MISCHIEF, CRIMINAL 4, OF MOTOR',
  'MISCHIEF, CRIMINAL 4, OF MOTOR VEHICLE'],
 258: ['CRIMINAL MISCHIEF 4TH, GRAFFIT', 'CRIMINAL MISCHIEF 4TH, GRAFFITI'],
 267: ['MISCHIEF, CRI

In [13]:
# Replace differing descriptions with normalized version, then drop duplicates
# Each row is now unique
index_map = [
    1, 1, 1, 1, 1,
    1, 1, 0, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 0,
    1, 1, 1, 1, 1,
    1, 0, 1, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1,
    1, 1, 1, 1, 1
]
replace_description(internal_code_df, multi_dict, index_map, cols[0], cols[1])
internal_code_df = internal_code_df.drop_duplicates()
internal_code_df

Unnamed: 0,PD_CD,PD_DESC
514,11,"NY CITY,TRAFFIC SUMMONS WARRANT"
460,100,STALKING COMMIT SEX OFFENSE
0,101,ASSAULT 3
416,102,ASSAULT SCHOOL SAFETY AGENT
399,103,ASSAULT TRAFFIC AGENT
...,...,...
489,970,SPILLBACK
410,972,SEAT BELTS
388,973,USE OF CELLULAR TELEPHONE WHILE DRIVING
552,975,ENVIRONMENTAL CONTROL BOARD


In [14]:
# Drop description columns from arrest and complaint data, as they can use
# PD_CD as a foreign key into the internal description table
arrest_df.drop(cols[1], axis = 1, inplace = True)
complaint_df.drop(cols[1], axis = 1, inplace = True)

## Standardizing Labels and Column Names

### Law Categories

Both the arrest and complaints table have law categories, these should be normalized.

In [15]:
arrest_df.LAW_CAT_CD.unique()

['F', 'M', 'V', 'I']
Categories (4, object): ['F', 'I', 'M', 'V']

In [16]:
complaint_df.LAW_CAT_CD.unique()

['FELONY', 'VIOLATION', 'MISDEMEANOR']
Categories (3, object): ['FELONY', 'MISDEMEANOR', 'VIOLATION']

In [17]:
# We will use the single letter categories as used in the arrest df
# F = Felony, M = Misdemeneor, V = Violation, I = Traffic Infractions
complaint_df.LAW_CAT_CD = complaint_df.LAW_CAT_CD.cat.rename_categories(
    {'FELONY': 'F', 'VIOLATION': 'V', 'MISDEMEANOR': 'M'}
)

### Borough Names

Borough names should be standardized to single letter codes, to be used as a join on the population table. Names of columns should also be standardized.

- B = Bronx
- K = Brooklyn
- M = Manhattan
- Q = Queens
- S = Staten Island

In [18]:
arrest_df.ARREST_BORO.unique()

['B', 'K', 'S', 'Q', 'M']
Categories (5, object): ['B', 'K', 'M', 'Q', 'S']

In [19]:
# Already has correct codes, just rename the column
arrest_df.rename({'ARREST_BORO': 'BORO'}, axis = 1, inplace = True)

In [20]:
complaint_df.BORO_NM.unique()

['MANHATTAN', 'BROOKLYN', 'BRONX', 'QUEENS', 'STATEN ISLAND']
Categories (5, object): ['BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND']

In [21]:
# Need to replace long-form names with codes and rename column
complaint_df.BORO_NM = complaint_df.BORO_NM.cat.rename_categories(
    {'BRONX': 'B', 'BROOKLYN': 'K', 'MANHATTAN': 'M', 'QUEENS': 'Q', 'STATEN ISLAND': 'S'}
)
complaint_df.rename({'BORO_NM': 'BORO'}, axis = 1, inplace = True)

In [22]:
shooting_df.BORO.unique()

['MANHATTAN', 'BRONX', 'QUEENS', 'BROOKLYN', 'STATEN ISLAND']
Categories (5, object): ['BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND']

In [23]:
# Need to replace long-form names with codes
shooting_df.BORO = shooting_df.BORO.cat.rename_categories(
    {'BRONX': 'B', 'BROOKLYN': 'K', 'MANHATTAN': 'M', 'QUEENS': 'Q', 'STATEN ISLAND': 'S'}
)

In [24]:
population_df.Borough.unique()

array(['NYC Total', '   Bronx', '   Brooklyn', '   Manhattan',
       '   Queens', '   Staten Island'], dtype=object)

In [25]:
# Need to rename the column and fix values, as well as add a code column for joining
population_df.Borough = ['NYC Total', 'Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island']
population_df.rename({'Borough': 'BORO_DESC'}, axis = 1, inplace = True)
population_df.insert(0, 'BORO_CD', ['T', 'B', 'K', 'M', 'Q', 'S'])
population_df

Unnamed: 0,BORO_CD,BORO_DESC,1950,1960,1970,1980,1990,2000,2010,2020,2030,2040
0,T,NYC Total,7891957,7781984,7894862,7071639,7322564,8008278,8242624,8550971,8821027,9025145
1,B,Bronx,1451277,1424815,1471701,1168972,1203789,1332650,1385108,1446788,1518998,1579245
2,K,Brooklyn,2738175,2627319,2602012,2230936,2300664,2465326,2552911,2648452,2754009,2840525
3,M,Manhattan,1960101,1698281,1539233,1428285,1487536,1537195,1585873,1638281,1676720,1691617
4,Q,Queens,1550849,1809578,1986473,1891325,1951598,2229379,2250002,2330295,2373551,2412649
5,S,Staten Island,191555,221991,295443,352121,378977,443728,468730,487155,497749,501109


### Cleaning up remaining column names and order

In [26]:
# Standardizing naming conventions
arrest_df.rename({'AGE_GROUP': 'PERP_AGE_GROUP',
                  'ARREST_PRECINCT': 'PRECINCT_CD',
                  'Latitude': 'LATITUDE',
                  'Longitude': 'LONGITUDE'}, axis = 1, inplace = True)
complaint_df.rename({'CMPLNT_NUM': 'CMPLNT_KEY',
                     'CMPLNT_FR_DT': 'CMPLNT_DATE',
                     'CMPLNT_FR_TM': 'CMPLNT_TIME',
                     'ADDR_PCT_CD': 'PRECINCT_CD',
                     'Latitude': 'LATITUDE',
                     'Longitude': 'LONGITUDE'}, axis = 1, inplace = True)
shooting_df.rename({'PRECINCT': 'PRECINCT_CD',
                    'Latitude': 'LATITUDE',
                    'Longitude': 'LONGITUDE'}, axis = 1, inplace = True)

In [27]:
# Standardizing column order
ordered_col = ['ARREST_KEY', 'ARREST_DATE', 'KY_CD', 'PD_CD',
               'LAW_CAT_CD', 'LAW_CODE', 'PRECINCT_CD', 'JURISDICTION_CODE',
               'BORO', 'PERP_AGE_GROUP', 'PERP_SEX', 'PERP_RACE', 'LATITUDE', 'LONGITUDE']

arrest_df = arrest_df.reindex(columns = ordered_col)

ordered_col = ['CMPLNT_KEY', 'CMPLNT_DATE', 'CMPLNT_TIME', 'RPT_DT', 'KY_CD', 'PD_CD',
               'LAW_CAT_CD', 'PRECINCT_CD', 'JURISDICTION_CODE', 'JURIS_DESC',
               'BORO', 'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX',
               'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX', 'LATITUDE', 'LONGITUDE']

complaint_df = complaint_df.reindex(columns = ordered_col)

ordered_col = ['INCIDENT_KEY', 'OCCUR_DATE', 'OCCUR_TIME', 'PRECINCT_CD',
               'JURISDICTION_CODE', 'BORO', 'STATISTICAL_MURDER_FLAG',
               'PERP_AGE_GROUP', 'PERP_SEX', 'PERP_RACE', 'VIC_AGE_GROUP',
               'VIC_SEX', 'VIC_RACE', 'LATITUDE', 'LONGITUDE']

shooting_df = shooting_df.reindex(columns = ordered_col)

In [28]:
arrest_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5684630 entries, 0 to 2842314
Data columns (total 14 columns):
 #   Column             Dtype         
---  ------             -----         
 0   ARREST_KEY         int64         
 1   ARREST_DATE        datetime64[ns]
 2   KY_CD              int64         
 3   PD_CD              int64         
 4   LAW_CAT_CD         category      
 5   LAW_CODE           category      
 6   PRECINCT_CD        int64         
 7   JURISDICTION_CODE  int64         
 8   BORO               category      
 9   PERP_AGE_GROUP     category      
 10  PERP_SEX           category      
 11  PERP_RACE          category      
 12  LATITUDE           float64       
 13  LONGITUDE          float64       
dtypes: category(6), datetime64[ns](1), float64(2), int64(5)
memory usage: 428.4 MB


In [29]:
complaint_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8859411 entries, 0 to 4429705
Data columns (total 19 columns):
 #   Column             Dtype         
---  ------             -----         
 0   CMPLNT_KEY         int64         
 1   CMPLNT_DATE        datetime64[ns]
 2   CMPLNT_TIME        object        
 3   RPT_DT             datetime64[ns]
 4   KY_CD              int64         
 5   PD_CD              int64         
 6   LAW_CAT_CD         category      
 7   PRECINCT_CD        int64         
 8   JURISDICTION_CODE  int64         
 9   JURIS_DESC         category      
 10  BORO               category      
 11  SUSP_AGE_GROUP     category      
 12  SUSP_RACE          category      
 13  SUSP_SEX           category      
 14  VIC_AGE_GROUP      category      
 15  VIC_RACE           category      
 16  VIC_SEX            category      
 17  LATITUDE           float64       
 18  LONGITUDE          float64       
dtypes: category(9), datetime64[ns](2), float64(2), int64(5), object(1)
m

In [30]:
shooting_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28501 entries, 0 to 28500
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   INCIDENT_KEY             28501 non-null  int64         
 1   OCCUR_DATE               28501 non-null  datetime64[ns]
 2   OCCUR_TIME               28501 non-null  object        
 3   PRECINCT_CD              28501 non-null  int64         
 4   JURISDICTION_CODE        28501 non-null  float64       
 5   BORO                     28501 non-null  category      
 6   STATISTICAL_MURDER_FLAG  28501 non-null  bool          
 7   PERP_AGE_GROUP           28501 non-null  category      
 8   PERP_SEX                 28501 non-null  category      
 9   PERP_RACE                28501 non-null  category      
 10  VIC_AGE_GROUP            28501 non-null  category      
 11  VIC_SEX                  28501 non-null  category      
 12  VIC_RACE                 28501 n

## SQLite Database Creation

In [31]:
# Establish connection and cursor (automatically creates database if it does not exist)
conn = sql.connect('../data/database/NYC_CRIME_DATABASE.db')
cur = conn.cursor()

# Enable foreign keys
if cur.execute('PRAGMA foreign_keys;').fetchone()[0] == 0:
    cur.execute('PRAGMA foreign_keys = ON;')

### Table Creation and Population

In [32]:
# Create internal code descriptions table and populate
res = create_table(cursor = cur, table_name = 'INTERNAL_CODES', df = internal_code_df, primary_key = 'PD_CD')
internal_code_df.to_sql('INTERNAL_CODES', conn, if_exists = 'replace', index = False)

CREATE TABLE IF NOT EXISTS INTERNAL_CODES(PD_CD INTEGER PRIMARY KEY, PD_DESC TEXT);
Table INTERNAL_CODES successfully created.


475

In [33]:
# Create offense code descriptions table and populate
res = create_table(cursor = cur, table_name = 'OFFENSE_CODES', df = offense_code_df, primary_key = 'KY_CD')
offense_code_df.to_sql('OFFENSE_CODES', conn, if_exists = 'replace', index = False)

CREATE TABLE IF NOT EXISTS OFFENSE_CODES(KY_CD INTEGER PRIMARY KEY, OFNS_DESC TEXT);
Table OFFENSE_CODES successfully created.


79

In [34]:
# Create arrest table and populate
res = create_table(cursor = cur, table_name = 'ARRESTS', df = arrest_df,
                   primary_key = 'ARREST_KEY',
                   foreign_keys = {
                       'PD_CD': ('INTERNAL_CODES', 'PD_CD'),
                       'KY_CD': ('OFFENSE_CODES', 'KY_CD')})
arrest_df.to_sql('ARRESTS', conn, if_exists = 'replace', index = False)

CREATE TABLE IF NOT EXISTS ARRESTS(ARREST_KEY INTEGER PRIMARY KEY, ARREST_DATE TEXT, KY_CD INTEGER, PD_CD INTEGER, LAW_CAT_CD TEXT, LAW_CODE TEXT, PRECINCT_CD INTEGER, JURISDICTION_CODE INTEGER, BORO TEXT, PERP_AGE_GROUP TEXT, PERP_SEX TEXT, PERP_RACE TEXT, LATITUDE REAL, LONGITUDE REAL, FOREIGN KEY (PD_CD) REFERENCES INTERNAL_CODES(PD_CD), FOREIGN KEY (KY_CD) REFERENCES OFFENSE_CODES(KY_CD));
Table ARRESTS successfully created.


5684630

In [35]:
# Create complaints table and populate
res = create_table(cursor = cur, table_name = 'COMPLAINTS', df = complaint_df,
                   primary_key = 'CMPLNT_KEY',
                   foreign_keys = {
                       'PD_CD': ('INTERNAL_CODES', 'PD_CD'),
                       'KY_CD': ('OFFENSE_CODES', 'KY_CD')})
complaint_df.to_sql('COMPLAINTS', conn, if_exists = 'replace', index = False)

CREATE TABLE IF NOT EXISTS COMPLAINTS(CMPLNT_KEY INTEGER PRIMARY KEY, CMPLNT_DATE TEXT, CMPLNT_TIME TEXT, RPT_DT TEXT, KY_CD INTEGER, PD_CD INTEGER, LAW_CAT_CD TEXT, PRECINCT_CD INTEGER, JURISDICTION_CODE INTEGER, JURIS_DESC TEXT, BORO TEXT, SUSP_AGE_GROUP TEXT, SUSP_RACE TEXT, SUSP_SEX TEXT, VIC_AGE_GROUP TEXT, VIC_RACE TEXT, VIC_SEX TEXT, LATITUDE REAL, LONGITUDE REAL, FOREIGN KEY (PD_CD) REFERENCES INTERNAL_CODES(PD_CD), FOREIGN KEY (KY_CD) REFERENCES OFFENSE_CODES(KY_CD));
Table COMPLAINTS successfully created.


8859411

In [36]:
# Create shootings table and populate
res = create_table(cursor = cur, table_name = 'SHOOTINGS', df = shooting_df, primary_key = 'INCIDENT_KEY')
shooting_df.to_sql('SHOOTINGS', conn, if_exists = 'replace', index = False)

CREATE TABLE IF NOT EXISTS SHOOTINGS(INCIDENT_KEY INTEGER PRIMARY KEY, OCCUR_DATE TEXT, OCCUR_TIME TEXT, PRECINCT_CD INTEGER, JURISDICTION_CODE REAL, BORO TEXT, STATISTICAL_MURDER_FLAG INTEGER, PERP_AGE_GROUP TEXT, PERP_SEX TEXT, PERP_RACE TEXT, VIC_AGE_GROUP TEXT, VIC_SEX TEXT, VIC_RACE TEXT, LATITUDE REAL, LONGITUDE REAL);
Table SHOOTINGS successfully created.


28501

In [37]:
# Create borough table and populate
res = create_table(cursor = cur, table_name = 'BOROUGHS', df = population_df, primary_key = 'BORO_CD')
population_df.to_sql('BOROUGHS', conn, if_exists = 'replace', index = False)

CREATE TABLE IF NOT EXISTS BOROUGHS(BORO_CD TEXT PRIMARY KEY, BORO_DESC TEXT, "1950" INTEGER, "1960" INTEGER, "1970" INTEGER, "1980" INTEGER, "1990" INTEGER, "2000" INTEGER, "2010" INTEGER, "2020" INTEGER, "2030" INTEGER, "2040" INTEGER);
Table BOROUGHS successfully created.


6

In [38]:
# Close cursor and connection
cur.close()
conn.close()