# SciKit-Learn Machine Learning

In [1]:
import pandas as pd
# import matplotlib.pyplot as plt
from datetime import datetime, date, time
# from collections import OrderedDict
from sklearn import cluster, datasets, preprocessing
# from sklearn.cluster import AgglomerativeClustering


from lapd_codes.crime_codes import crime_codes
from lapd_codes.mo_codes import mo_codes

In [2]:
# max_rows = 2000000
max_rows = 10
url = 'https://data.lacity.org/resource/7fvc-faax.csv?$limit=' + str(max_rows)
df = pd.read_csv(url)

In [3]:
print(df.keys())

Index(['area_id', 'area_name', 'crm_cd', 'crm_cd_1', 'crm_cd_2', 'crm_cd_3',
       'crm_cd_4', 'crm_cd_desc', 'cross_street', 'date_occ', 'date_rptd',
       'dr_no', 'location', 'location_1', 'location_1_address',
       'location_1_city', 'location_1_state', 'location_1_zip', 'mocodes',
       'premis_cd', 'premis_desc', 'rpt_dist_no', 'status', 'status_desc',
       'time_occ', 'vict_age', 'vict_descent', 'vict_sex', 'weapon_desc',
       'weapon_used_cd'],
      dtype='object')


In [4]:
print(df['location_1'])

0    POINT (-118.2459 34.0401)
1    POINT (-118.2655 34.0599)
2    POINT (-118.3742 34.0375)
3    POINT (-118.3555 34.0761)
4    POINT (-118.3451 34.0649)
5    POINT (-118.3614 34.0761)
6     POINT (-118.3765 34.073)
7    POINT (-118.3219 34.0516)
8    POINT (-118.3353 34.0399)
9    POINT (-118.3353 34.0399)
Name: location_1, dtype: object


In [10]:
def remove_columns(ml_df, columns):
    # Potentially useful: mocodes, premis_cd, weapon_used_cd
    for column in columns:
        ml_df = ml_df.drop(column, 1)
    return ml_df  

def remove_partial_rows(ml_df):
    null_indicies  = ml_df.index[ml_df.isnull().any(axis=1)].tolist()
    return ml_df.drop(null_indicies)
        
def format_occ_datetime(ml_df):
    date_occ_index = list(ml_df.keys()).index('date_occ')
    time_occ_index = list(ml_df.keys()).index('time_occ')
    datetimes = []
    for row in ml_df.values:
        date_occured = datetime.strptime(row[date_occ_index], '%Y-%m-%dT%H:%M:%S.%f')
        time_occured = str(row[time_occ_index])

        if len(time_occured) == 4:
            hour = int(time_occured[0:2])
            minute = int(time_occured[2:len(time_occured)])
        elif len(time_occured) == 3:
            hour = int(time_occured[0])
            minute = int(time_occured[1:len(time_occured)])
        else:
            hour = 0
            minute = int(time_occured[0:len(time_occured)])

        datetime_occured  = date_occured.replace(hour=hour, minute=minute)
        datetimes.append(datetime_occured)

    ml_df = remove_columns(ml_df, ['date_occ', 'time_occ'])
    ml_df['year_occ'] = pd.Series([int(date.year) for date in datetimes], index=ml_df.index)
    ml_df['month_occ'] = pd.Series([int(date.month) for date in datetimes], index=ml_df.index)
    ml_df['day_occ'] = pd.Series([int(date.day) for date in datetimes], index=ml_df.index)
    ml_df['hour_occ'] = pd.Series([int(date.hour) for date in datetimes], index=ml_df.index)
    ml_df['minute_occ'] = pd.Series([int(date.minute) for date in datetimes], index=ml_df.index)
    
    return ml_df

def format_rptd_datetime(ml_df):
    date_rptd_index = list(ml_df.keys()).index('date_rptd')
    datetimes = []
    for row in ml_df.values:
        date_reported = datetime.strptime(row[date_rptd_index], '%Y-%m-%dT%H:%M:%S.%f')
        datetimes.append(date_reported)
    
    ml_df = remove_columns(ml_df, ['date_rptd'])
    ml_df['year_rptd'] = pd.Series([int(date.year) for date in datetimes], index=ml_df.index)
    ml_df['month_rptd'] = pd.Series([int(date.month) for date in datetimes], index=ml_df.index)
    ml_df['day_rptd'] = pd.Series([int(date.day) for date in datetimes], index=ml_df.index)
    return ml_df

# def convert_column_to_int(ml_df, column_title, mapping):
#     column = ml_df[column_title]
#     column = [mapping[element] for element in column]
    
#     ml_df = ml_df.drop(column_title, 1)
#     ml_df[column_title] = pd.Series(column, index=ml_df.index)
#     return ml_df

def convert_location_to_lat_long(ml_df):
    locations = ml_df['location_1']
    longs = []
    lats = []
    for location in locations:
        location = location[7:len(location)-1]
        long, lat = location.split(' ')
        longs.append(float(long))
        lats.append(float(lat))
    
    ml_df = remove_columns(ml_df, ['location_1'])
    ml_df['location_long'] = pd.Series(longs, index=ml_df.index)
    ml_df['location_lat'] = pd.Series(lats, index=ml_df.index)
    return ml_df

def encode_labels(ml_df, column_titles):
    for column_title in column_titles:
        lb = preprocessing.LabelBinarizer()
        ml_df[column_title] = lb.fit_transform(ml_df[column_title]).tolist()
    return ml_df


columns_remove = ['area_name', 'crm_cd_1', 'crm_cd_2', 'crm_cd_3', 'crm_cd_4', 'crm_cd_desc', 'cross_street', 'dr_no', 'location', 'location_1_address', 'location_1_city', 'location_1_state', 'location_1_zip', 'mocodes', 'premis_cd', 'premis_desc', 'status', 'status_desc', 'weapon_desc', 'weapon_used_cd']

ml_df = remove_columns(df, columns_remove)
ml_df = remove_partial_rows(ml_df)
ml_df = format_occ_datetime(ml_df)
ml_df = format_rptd_datetime(ml_df)
ml_df = convert_location_to_lat_long(ml_df)

encode_labels(ml_df, ['vict_sex', 'vict_descent', 'crm_cd', 'rpt_dist_no'])
# sex_mapping = {'F':0, 'M':1,}
# ml_df = convert_column_to_int(ml_df, 'vict_sex', sex_mapping)
# descent_mapping = {'H':0, 'B':1, 'O':2, 'K':3}
# ml_df = convert_column_to_int(ml_df, 'vict_descent', descent_mapping)


ml_array = ml_df.values
print(ml_df)
# print(ml_array)

   area_id           crm_cd               rpt_dist_no  vict_age  vict_descent  \
0        1  [0, 0, 0, 1, 0]  [1, 0, 0, 0, 0, 0, 0, 0]      30.0  [0, 1, 0, 0]   
1        2  [0, 0, 1, 0, 0]  [0, 1, 0, 0, 0, 0, 0, 0]      11.0  [0, 1, 0, 0]   
2        7  [0, 1, 0, 0, 0]  [0, 0, 0, 0, 0, 0, 1, 0]      49.0  [0, 1, 0, 0]   
3        7  [0, 0, 0, 0, 1]  [0, 0, 0, 1, 0, 0, 0, 0]      41.0  [1, 0, 0, 0]   
4        7  [0, 1, 0, 0, 0]  [0, 0, 0, 0, 1, 0, 0, 0]      27.0  [0, 0, 0, 1]   
6        7  [0, 0, 0, 0, 1]  [0, 0, 1, 0, 0, 0, 0, 0]      20.0  [0, 1, 0, 0]   
7        7  [0, 1, 0, 0, 0]  [0, 0, 0, 0, 0, 1, 0, 0]      35.0  [0, 0, 0, 1]   
8        7  [1, 0, 0, 0, 0]  [0, 0, 0, 0, 0, 0, 0, 1]      43.0  [0, 0, 1, 0]   
9        7  [1, 0, 0, 0, 0]  [0, 0, 0, 0, 0, 0, 0, 1]      44.0  [0, 1, 0, 0]   

  vict_sex  year_occ  month_occ  day_occ  hour_occ  minute_occ  year_rptd  \
0      [1]      2010         11       15        20          45       2010   
1      [0]      2010          3    

## Machine Learning

In [6]:
k_means = cluster.KMeans(n_clusters=3)
k_means.fit_predict(ml_df) 

print(k_means.labels_[::10])
print(k_means.cluster_centers_)

# print(y_iris[::10])

ValueError: setting an array element with a sequence.

In [None]:
ml_df.columns.to_series().groupby(ml_df.dtypes).groups

In [7]:
ward = cluster.AgglomerativeClustering(n_clusters=3)
ward.fit_predict(ml_df) 

ValueError: setting an array element with a sequence.