In [1]:
import numpy as np
import pandas as pd
import psycopg2

### Connect to Postgres DB

In [2]:
import json

# Read the database connection settings from config.json. Only the JSON
# parsing happens while the file is open; the individual settings are pulled
# out of the parsed dict afterwards.
with open('config.json') as f:
    conf = json.load(f)

host, database, user, passw = (
    conf[key] for key in ('host', 'database', 'user', 'passw')
)

In [3]:
# Build the libpq connection string with make_dsn rather than str.format:
# make_dsn quotes/escapes each value, so a password (or any other setting)
# containing spaces, quotes or backslashes still yields a valid DSN.
# NOTE: conn_str contains the password -- do not display or log it.
conn_str = psycopg2.extensions.make_dsn(
    host=host,
    dbname=database,
    user=user,
    password=passw,
)
conn = psycopg2.connect(conn_str)

### Initial Query

In [4]:
# SQL for the initial victim/offender extract.
#
# Starting from nibrs_victim, the query attaches the victim type, the
# offenders and offenses of the same incident (both joined on incident_id
# only, so each victim row is paired with every offender and every offense of
# that incident), the offense type, the location type, the incident itself,
# the reporting agency and its FIPS/county lookup. Only victims with
# victim_type_id = 4 are kept.
#
# The clauses are joined with 12 spaces so the resulting single-line string
# is the same text the original backslash-continued literal produced.
Vic_Off_query = (" " * 12).join([
    "SELECT vic.victim_id AS VICTIM_ID,",
    "vic.incident_id AS INCIDENT_ID,",
    "vic.victim_type_id AS VICTIM_TYPE_ID,",
    "ty.victim_type_name AS VICTIM_TYPE,",
    "vic.age_range_low_num AS AGE_RANGE_LOW,",
    "vic.age_range_high_num AS AGE_RANGE_HIGH,",
    "vic.sex_code AS VICTIM_SEX,",
    "oft.crime_against AS CRIME_AGAINST,",
    "oft.offense_name AS OFFENSE,",
    "oft.offense_category_name AS OFFENSE_CATEGORY,",
    "oft.offense_group AS OFFENSE_GROUP,",
    "ofr.offender_id AS OFFENDER_ID,",
    "ofr.age_num AS OFFENDER_AGE,",
    "ofr.sex_code AS OFFENDER_SEX,",
    "off.location_id,",
    "loc.location_name,",
    "inc.ddocname,",
    "ori.fips,",
    "ori.countyname,",
    "ori.name",
    "FROM nibrs_victim as vic",
    "JOIN nibrs_victim_type as ty",
    "ON vic.victim_type_id = ty.victim_type_id",
    "JOIN nibrs_offender as ofr",
    "ON ofr.incident_id = vic.incident_id",
    "JOIN nibrs_offense as off",
    "ON off.incident_id = vic.incident_id",
    "JOIN nibrs_offense_type as oft",
    "ON oft.offense_type_id = off.offense_type_id",
    "JOIN nibrs_location_type as loc",
    "ON off.location_id = loc.location_id",
    "JOIN nibrs_incident as inc",
    "ON inc.incident_id = vic.incident_id",
    "JOIN cde_agencies as ags",
    "ON ags.agency_id = inc.agency_id",
    "JOIN ori_to_fips as ori",
    "ON ori.ori9 = ags.ori",
    "WHERE vic.victim_type_id = 4;",
])

### Load into Pandas DataFrame

In [5]:
# Load the extract from the local CSV cache when it is present. If the cache
# is missing (e.g. a fresh checkout), run Vic_Off_query on the open
# connection and write the cache so later runs skip the database round-trip.
# NOTE(review): this assumes init_query.csv is an export of Vic_Off_query
# (the column names match the query's select list) -- confirm.
try:
    Vic_Off_df = pd.read_csv('init_query.csv')
except FileNotFoundError:
    Vic_Off_df = pd.read_sql(Vic_Off_query, conn)
    # index=False: do not write the RangeIndex, so reloading the cache yields
    # the same columns as the query result.
    Vic_Off_df.to_csv('init_query.csv', index=False)
Vic_Off_df.head()

Unnamed: 0,victim_id,incident_id,victim_type_id,victim_type,age_range_low,age_range_high,victim_sex,crime_against,offense,offense_category,offense_group,offender_id,offender_age,offender_sex,location_id,location_name,ddocname,fips,countyname,name
0,57023969,52643345,4,Individual,,,M,Property,Theft of Motor Vehicle Parts or Accessories,Larceny/Theft Offenses,A,59269093,,U,18,Parking/Drop Lot/Garage,2010_07_TX2201200_100074401_INC_NIBRS,48439,TARRANT,FORT WORTH POLICE DEPARTMENT
1,57023969,52643345,4,Individual,,,M,Property,Theft of Motor Vehicle Parts or Accessories,Larceny/Theft Offenses,A,59269093,,U,18,Parking/Drop Lot/Garage,2010_07_TX2201200_100074401_INC_NIBRS,48439,TARRANT,FORT WORTH POLICE DEPARTMENT
2,57023969,52643345,4,Individual,,,M,Property,Theft of Motor Vehicle Parts or Accessories,Larceny/Theft Offenses,A,59269093,,U,18,Parking/Drop Lot/Garage,2010_07_TX2201200_100074401_INC_NIBRS,48439,TARRANT,FORT WORTH POLICE DEPARTMENT
3,57023969,52643345,4,Individual,,,M,Property,Theft of Motor Vehicle Parts or Accessories,Larceny/Theft Offenses,A,59269093,,U,18,Parking/Drop Lot/Garage,2010_07_TX2201200_100074401_INC_NIBRS,48439,TARRANT,FORT WORTH POLICE DEPARTMENT
4,57023969,52643345,4,Individual,,,M,Property,Theft of Motor Vehicle Parts or Accessories,Larceny/Theft Offenses,A,59269093,,U,18,Parking/Drop Lot/Garage,2010_07_TX2201200_100074401_INC_NIBRS,48439,TARRANT,FORT WORTH POLICE DEPARTMENT


### Count of NaNs by column

In [6]:
# Per-column tally of missing values (isnull is the alias of isna).
Vic_Off_df.isnull().sum()

victim_id                 0
incident_id               0
victim_type_id            0
victim_type               0
age_range_low       8332860
age_range_high      8332860
victim_sex                0
crime_against             0
offense                   0
offense_category          0
offense_group             0
offender_id               0
offender_age        4539013
offender_sex        2291595
location_id               0
location_name             0
ddocname                  0
fips                      0
countyname                0
name                      0
dtype: int64

In [7]:
# Frequency of each offender sex code; value_counts() leaves out missing
# values by default, so NaN rows are not part of this tally.
Vic_Off_df['offender_sex'].value_counts()

M    4290919
U    1727647
F    1560047
Name: offender_sex, dtype: int64

In [8]:
# Frequency of each offender age, most common first; missing ages are
# excluded by value_counts() by default.
Vic_Off_df['offender_age'].value_counts()

22.0    225823
18.0    224907
19.0    220802
20.0    214282
25.0    213399
21.0    212574
17.0    209290
23.0    182973
24.0    179607
27.0    177338
16.0    167717
30.0    167269
26.0    161633
28.0    149465
32.0    145722
29.0    140668
15.0    138475
31.0    129450
33.0    125733
35.0    121336
34.0    110449
14.0    101877
37.0     97185
40.0     95459
36.0     95336
38.0     81775
39.0     80374
42.0     74773
45.0     74072
41.0     72795
         ...  
73.0      1858
74.0      1756
7.0       1575
76.0      1210
77.0      1153
88.0       981
78.0       914
79.0       836
80.0       806
6.0        796
81.0       665
5.0        608
82.0       566
83.0       493
84.0       481
85.0       343
4.0        273
87.0       264
86.0       229
90.0       158
89.0       153
2.0        118
3.0         48
94.0        42
91.0        35
96.0        28
98.0        28
95.0        21
92.0        21
93.0        13
Name: offender_age, Length: 97, dtype: int64

In [9]:
# Pairwise Pearson correlation of the numeric columns.
# The numeric columns are selected explicitly because a bare DataFrame.corr()
# raises on pandas >= 2.0 when the frame also holds string columns (older
# versions dropped them silently); this form behaves the same on both.
# NOTE(review): the numeric set includes identifier-like columns (victim_id,
# incident_id, offender_id, location_id, fips) and victim_type_id, which is
# constant after the query's filter and therefore yields NaN -- consider
# restricting the matrix to the age columns.
Vic_Off_df.select_dtypes(include='number').corr()

Unnamed: 0,victim_id,incident_id,victim_type_id,age_range_low,age_range_high,offender_id,offender_age,location_id,fips
victim_id,1.0,0.996071,,-7.7e-05,0.003392,0.999997,0.058586,0.041547,0.023948
incident_id,0.996071,1.0,,-6.6e-05,0.003396,0.996097,0.058194,0.041297,0.01279
victim_type_id,,,,,,,,,
age_range_low,-7.7e-05,-6.6e-05,,1.0,-0.034574,-5.3e-05,0.277042,-0.036352,-0.011188
age_range_high,0.003392,0.003396,,-0.034574,1.0,0.003403,-0.002626,-0.001916,-0.012405
offender_id,0.999997,0.996097,,-5.3e-05,0.003403,1.0,0.058595,0.041542,0.023558
offender_age,0.058586,0.058194,,0.277042,-0.002626,0.058595,1.0,-0.030647,0.019437
location_id,0.041547,0.041297,,-0.036352,-0.001916,0.041542,-0.030647,1.0,-0.02428
fips,0.023948,0.01279,,-0.011188,-0.012405,0.023558,0.019437,-0.02428,1.0


In [None]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [None]:
# TF-IDF features from the 'content' column: English stop words are removed,
# terms found in fewer than 2 documents are dropped (min_df=2), and so are
# terms found in more than half of the documents (max_df=0.5).
# NOTE(review): `article_df` is not defined anywhere above in this notebook,
# so this cell raises NameError on Restart & Run All -- confirm which frame
# and text column were intended.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.5,
)
X = vectorizer.fit_transform(article_df['content'])

In [None]:
# Fit an 8-cluster k-means model on the TF-IDF matrix. k-means++ seeding and
# a fixed random_state keep the run repeatable; fit() returns the estimator
# itself, so `kmeans` is the fitted model.
kmeans = KMeans(
    random_state=0,
    init='k-means++',
    n_clusters=8,
).fit(X)

In [None]:
# Vocabulary terms in column order of the TF-IDF matrix.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() when the installed version provides it, and fall
# back to the old method otherwise. list() keeps feat_names a plain list, as
# the old method returned, instead of a NumPy array.
if hasattr(vectorizer, 'get_feature_names_out'):
    feat_names = list(vectorizer.get_feature_names_out())
else:
    feat_names = vectorizer.get_feature_names()

In [None]:
# Cluster index the fitted k-means model assigned to each row of X;
# the bare name on the last line displays the array as the cell output.
labels = kmeans.labels_
labels

In [None]:
# Centroids of the fitted k-means model: one row per cluster, expressed in
# the same feature space as X.
K_centers = kmeans.cluster_centers_