# SciKit-Learn Machine Learning

In [1]:
import pandas as pd
# import matplotlib.pyplot as plt
from datetime import datetime, date, time
# from collections import OrderedDict
from sklearn import cluster, datasets, preprocessing
# from sklearn.cluster import AgglomerativeClustering
from sklearn.neural_network import BernoulliRBM

from lapd_codes.crime_codes import crime_codes
from lapd_codes.mo_codes import mo_codes

In [2]:
# Upper bound on the number of rows requested through the `$limit` query
# parameter. Lower it (e.g. to 10) for a quick smoke test of the notebook.
max_rows = 2000000
url = ''.join(('https://data.lacity.org/resource/7fvc-faax.csv?$limit=', str(max_rows)))

# Download the CSV straight into a DataFrame.
df = pd.read_csv(url)

In [3]:
# List the column labels of the raw download.
print(df.columns)

Index(['area_id', 'area_name', 'crm_cd', 'crm_cd_1', 'crm_cd_2', 'crm_cd_3',
       'crm_cd_4', 'crm_cd_desc', 'cross_street', 'date_occ', 'date_rptd',
       'dr_no', 'location', 'location_1', 'location_1_address',
       'location_1_city', 'location_1_state', 'location_1_zip', 'mocodes',
       'premis_cd', 'premis_desc', 'rpt_dist_no', 'status', 'status_desc',
       'time_occ', 'vict_age', 'vict_descent', 'vict_sex', 'weapon_desc',
       'weapon_used_cd'],
      dtype='object')


In [5]:
def remove_columns(ml_df, columns):
    """Return a copy of ``ml_df`` without the given columns.

    Args:
        ml_df: Source DataFrame; it is left unmodified.
        columns: Iterable of column labels to drop.

    Returns:
        A new DataFrame that lacks every label in ``columns``.

    Raises:
        KeyError: If a label in ``columns`` is not a column of ``ml_df``.
    """
    # Use the `columns=` keyword instead of the positional axis argument
    # (`drop(label, 1)`): the positional form was deprecated and then removed
    # in pandas 2.0. Dropping everything in one call also avoids building an
    # intermediate frame per column.
    return ml_df.drop(columns=list(columns))

def remove_partial_rows(ml_df):
    """Return a copy of ``ml_df`` without rows that contain any null value.

    Args:
        ml_df: Source DataFrame; it is left unmodified.

    Returns:
        A new DataFrame holding only the rows in which every column is
        non-null. The surviving rows keep their original index labels.
    """
    # dropna() filters rows by position. Collecting the index labels of the
    # incomplete rows and dropping those labels would also discard complete
    # rows that happen to share a label with an incomplete one.
    return ml_df.dropna(axis=0, how='any')
        
def format_occ_datetime(ml_df):
    """Replace ``date_occ`` / ``time_occ`` with integer calendar features.

    ``date_occ`` is parsed with the format ``%Y-%m-%dT%H:%M:%S.%f``.
    ``time_occ`` is read as a clock value packed into one number as HHMM
    (``5`` -> 00:05, ``830`` -> 08:30, ``1430`` -> 14:30); integer, float and
    numeric-string columns are all accepted.

    Args:
        ml_df: DataFrame with ``date_occ`` and ``time_occ`` columns; it is
            left unmodified.

    Returns:
        A new DataFrame without ``date_occ`` and ``time_occ`` and with four
        int64 columns appended: ``year_occ``, ``month_occ``, ``weekday_occ``
        (Monday is 0) and ``hour_occ``.

    Raises:
        ValueError: If a date does not match the expected format or a
            ``time_occ`` value is not a valid HHMM time (hour 0-23,
            minute 0-59).
    """
    occurred_on = pd.to_datetime(ml_df['date_occ'], format='%Y-%m-%dT%H:%M:%S.%f')

    # Work on the number itself rather than its string form: a float column
    # (which is what pandas produces when the raw column contained nulls)
    # renders 1430 as '1430.0', which cannot be sliced into digits.
    hhmm = pd.to_numeric(ml_df['time_occ']).astype('int64')
    hours = hhmm // 100
    minutes = hhmm % 100

    # Reject impossible clock values instead of emitting a bogus hour.
    out_of_range = (hours < 0) | (hours > 23) | (minutes > 59)
    if out_of_range.any():
        raise ValueError(
            'time_occ must be an HHMM clock value between 0 and 2359; got {}'.format(
                hhmm[out_of_range].tolist()[:5]))

    ml_df = ml_df.drop(columns=['date_occ', 'time_occ'])
    # `.values` assigns positionally, so the result does not depend on the
    # index being unique.
    ml_df['year_occ'] = occurred_on.dt.year.astype('int64').values
    ml_df['month_occ'] = occurred_on.dt.month.astype('int64').values
    ml_df['weekday_occ'] = occurred_on.dt.weekday.astype('int64').values
    ml_df['hour_occ'] = hours.values

    return ml_df

def format_rptd_datetime(ml_df):
    """Replace ``date_rptd`` with integer year / month / day columns.

    ``date_rptd`` is parsed with the format ``%Y-%m-%dT%H:%M:%S.%f``.

    Args:
        ml_df: DataFrame with a ``date_rptd`` column; it is left unmodified.

    Returns:
        A new DataFrame without ``date_rptd`` and with three int64 columns
        appended: ``year_rptd``, ``month_rptd`` and ``day_rptd``.

    Raises:
        ValueError: If a date does not match the expected format.
    """
    # Parse the single column in one vectorized call instead of converting
    # the whole frame to an object array and calling strptime per row.
    reported_on = pd.to_datetime(ml_df['date_rptd'], format='%Y-%m-%dT%H:%M:%S.%f')

    ml_df = ml_df.drop(columns=['date_rptd'])
    # `.values` assigns positionally, so the result does not depend on the
    # index being unique.
    ml_df['year_rptd'] = reported_on.dt.year.astype('int64').values
    ml_df['month_rptd'] = reported_on.dt.month.astype('int64').values
    ml_df['day_rptd'] = reported_on.dt.day.astype('int64').values
    return ml_df

def convert_location_to_lat_long(ml_df):
    """Split the ``location_1`` text column into numeric long / lat columns.

    Every value has its first seven characters and its last character cut
    off; what remains is split on a single space into a longitude and a
    latitude (in that order), each converted to ``float``.
    Presumably the values look like ``POINT (<long> <lat>)`` -- TODO confirm
    against the raw data.

    Args:
        ml_df: DataFrame with a ``location_1`` column of strings.

    Returns:
        A DataFrame without ``location_1`` and with ``location_long`` and
        ``location_lat`` float columns appended.
    """
    longitudes, latitudes = [], []
    # The generator keeps the work row by row: trim, split, then convert.
    trimmed_pairs = (point[7:-1].split(' ') for point in ml_df['location_1'])
    for lon_text, lat_text in trimmed_pairs:
        longitudes.append(float(lon_text))
        latitudes.append(float(lat_text))

    result = remove_columns(ml_df, ['location_1'])
    result['location_long'] = pd.Series(longitudes, index=result.index)
    result['location_lat'] = pd.Series(latitudes, index=result.index)
    return result
 
def select_year_occ(ml_df, year):
    """Return the rows of ``ml_df`` whose ``year_occ`` equals ``year``."""
    in_requested_year = ml_df['year_occ'].eq(year)
    return ml_df.loc[in_requested_year]

def encode_labels(ml_df, column_titles):
    """One-hot encode the listed columns of ``ml_df``.

    Args:
        ml_df: DataFrame to encode.
        column_titles: Labels of the columns to expand into indicator columns.

    Returns:
        The result of ``pd.get_dummies``: the remaining columns followed by
        one ``<column>_<value>`` indicator column per distinct value.
    """
    encoded = pd.get_dummies(data=ml_df, columns=column_titles)
    return encoded


# Columns left out of the feature matrix.
# Potentially useful later: mocodes, premis_cd, weapon_used_cd, date_rptd, rpt_dist_no
columns_remove = [
    'area_name',
    'crm_cd_1', 'crm_cd_2', 'crm_cd_3', 'crm_cd_4', 'crm_cd_desc',
    'cross_street', 'dr_no',
    'location', 'location_1_address', 'location_1_city', 'location_1_state', 'location_1_zip',
    'mocodes',
    'premis_cd', 'premis_desc',
    'status', 'status_desc',
    'weapon_desc', 'weapon_used_cd',
    'date_rptd', 'rpt_dist_no',
]

# Clean the raw download, derive the time and location features, and keep
# only incidents that occurred in 2017. format_rptd_datetime is not part of
# the chain because 'date_rptd' is among the removed columns.
ml_df = (
    remove_columns(df, columns_remove)
    .pipe(remove_partial_rows)
    .pipe(format_occ_datetime)
    .pipe(convert_location_to_lat_long)
    .pipe(select_year_occ, 2017)
)

# Supervised neural network: the crime code is the target, so it is pulled
# out as a plain list and removed from the features before encoding.
crm_cds = ml_df['crm_cd'].tolist()

# One-hot encode the categorical features. year_occ is left numeric for now
# (idea to try: encode it as well).
ml_df = encode_labels(
    remove_columns(ml_df, ['crm_cd']),
    ['area_id', 'vict_descent', 'vict_sex', 'month_occ', 'weekday_occ'],
)

In [5]:
# Sanity check of the feature matrix: number of columns, then the column
# labels grouped by dtype.
print('ml_df:')
print(len(ml_df.columns))
print(ml_df.columns.to_series().groupby(ml_df.dtypes).groups)

ml_df:
67
{dtype('float64'): Index(['vict_age', 'location_long', 'location_lat'], dtype='object'), dtype('int64'): Index(['year_occ', 'hour_occ'], dtype='object'), dtype('uint8'): Index(['area_id_1', 'area_id_2', 'area_id_3', 'area_id_4', 'area_id_5',
       'area_id_6', 'area_id_7', 'area_id_8', 'area_id_9', 'area_id_10',
       'area_id_11', 'area_id_12', 'area_id_13', 'area_id_14', 'area_id_15',
       'area_id_16', 'area_id_17', 'area_id_18', 'area_id_19', 'area_id_20',
       'area_id_21', 'vict_descent_A', 'vict_descent_B', 'vict_descent_C',
       'vict_descent_D', 'vict_descent_F', 'vict_descent_G', 'vict_descent_H',
       'vict_descent_I', 'vict_descent_J', 'vict_descent_K', 'vict_descent_O',
       'vict_descent_P', 'vict_descent_S', 'vict_descent_U', 'vict_descent_V',
       'vict_descent_W', 'vict_descent_X', 'vict_descent_Z', 'vict_sex_F',
       'vict_sex_H', 'vict_sex_M', 'vict_sex_X', 'month_occ_1', 'month_occ_2',
       'month_occ_3', 'month_occ_4', 'month_occ_5', 'mo

In [36]:
# Clustering input: coordinates plus the one-hot encoded crime code.
# `crm_cd` is no longer a column of `ml_df` (it was split off into `crm_cds`
# when the feature matrix was built), so selecting it from `ml_df` raises a
# KeyError on a fresh top-to-bottom run. Re-attach it from `crm_cds`, which
# has one entry per row of `ml_df` in the same order.
location_df = ml_df[['location_long', 'location_lat']].assign(crm_cd=crm_cds)
location_df = encode_labels(location_df, ['crm_cd'])

## Machine Learning

In [None]:
# Fix the seed so the centroids printed below are reproducible between runs;
# without it every execution starts from different random centres.
k_means = cluster.KMeans(n_clusters=10, random_state=0)
# Only the fitted model is needed here; the per-row cluster assignments
# remain available afterwards as `k_means.labels_`.
k_means.fit(location_df)

# One row per cluster, one value per column of `location_df`.
print(k_means.cluster_centers_)

In [None]:
# Column labels first, so the centroid values printed next can be matched to
# the features they belong to.
print(location_df.columns)
print(k_means.cluster_centers_[3])

In [None]:
# Agglomerative clustering into three groups (scikit-learn's default linkage
# for this estimator is Ward).
ward = cluster.AgglomerativeClustering(n_clusters=3)
# fit() returns the estimator, so its labels are the cell's displayed value.
ward.fit(location_df).labels_

### MLP Classifier

In [54]:
from sklearn.neural_network import MLPClassifier

# `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` returns the same ndarray on every pandas version.
X = ml_df.values
y = crm_cds

# Small network: two hidden layers (5 and 2 units), L-BFGS solver, fixed seed.
# NOTE(review): the features are passed in unscaled (raw year, age and
# coordinates next to 0/1 indicators). MLPs are sensitive to feature scale,
# which may explain the low score recorded below -- worth trying a scaler.
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X, y)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [55]:
# Mean accuracy of the network on the same rows it was fitted on (this is a
# training score, not a held-out evaluation).
clf.score(X=X, y=y)

0.09939342287316594

### Random Forest Classifier

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Features stay a DataFrame here; the target is the list of crime codes.
X, y = ml_df, crm_cds

# Seeded forest with otherwise default settings. fit() returns the estimator
# itself, so it can be bound directly and shown as the cell's value.
clf_tree = RandomForestClassifier(random_state=0).fit(X, y)
clf_tree

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [7]:
# Mean accuracy of the forest on the rows it was fitted on (training score).
clf_tree.score(X=X, y=y)

0.9874858032221427

In [18]:
# Spot check: the first ten true crime codes, followed by the forest's
# predictions for the same ten rows.
print(y[:10])
clf_tree.predict(X[:10])

[626, 624, 236, 624, 930, 626, 230, 930, 930, 230]


array([626, 624, 236, 624, 930, 626, 230, 930, 930, 230])

In [8]:
# Decision path of the second row (position 1) through every tree of the
# forest; the double brackets keep the selection a one-row DataFrame.
clf_tree.decision_path(X=X.iloc[[1]])

(<1x2154026 sparse matrix of type '<class 'numpy.int64'>'
 	with 254 stored elements in Compressed Sparse Row format>,
 array([      0,  216259,  431470,  646927,  862648, 1077705, 1292906,
        1507527, 1722908, 1938633, 2154026]))

In [4]:
# Show the first fitted tree of the forest.
# NOTE(review): the NameError recorded under this cell comes from the cell
# having been executed (In [4]) before the cell that creates `clf_tree`
# (In [6]); run the notebook top to bottom to refresh the output.
print(clf_tree.estimators_[0])

NameError: name 'clf_tree' is not defined