In [122]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
import sklearn

## Get the Data

Note: before completing this step, the following steps should be run:
* ChicagoPDData notebook
* make all in main directory
* make all in AoT_data/

In [5]:
# a_dir = 'AoT_data/'
# node_loc_list = pd.read_csv(a_dir+'node_loc_list.csv',index_col=0)
# full_data= pd.read_csv(a_dir+'fixed_data.csv')

In [6]:
def create_crime_dict():
    crime_dict = {}
    with open('final_dat.csv','r') as file:
        for line in file:
            curr_line=line.strip().split(',')
            if curr_line[0] not in crime_dict.keys():
                crime_dict[curr_line[0]]={'coords':[]}
            crime_dict[curr_line[0]]['coords'].append((float(curr_line[2]),float(curr_line[1])))
    return crime_dict

In [7]:
#crime_dict=create_crime_dict()

In [8]:
# full_data =full_data.loc[full_data['parameter']!='id']

In [9]:
# full_data.shape

In [10]:
# full_data.head()

In [11]:
# full_data.values.tolist()

In [12]:
# len(crime_data.index.unique())

In [13]:
# len(crime_data)

In [14]:
# node_loc_list.head()

# Transform the Data

There is a very large amount of data to handle here, so dictionaries are used for speed

In [15]:
#node_dict=node_loc_list.to_dict('index')
#full_list=full_data.values.tolist()

The function below takes a list version of the full data (obtained from full_data.values.tolist()) and converts it to a dictionary

In [16]:
def get_full_dict(full_list,node_dict):
    full_dict={}
    for index,row in enumerate(full_list):
        timestamp=row[0][0:13]
        node_id=row[1]
        full_id = str(timestamp)+'_'+str(node_id)
        feature_name = str(row[3])+'_'+str(row[4])
        value=row[5]
        node_lat=node_dict[node_id]['lat']
        node_lon=node_dict[node_id]['lon']
        if not full_id in full_dict.keys():
            full_dict[full_id]={}
        full_dict[full_id][feature_name]=value
        full_dict[full_id]['timestamp']=timestamp
        full_dict[full_id]['node_id']=node_id
        full_dict[full_id]['latitude']=node_lat
        full_dict[full_id]['longitude']=node_lon
    return full_dict

In [17]:
# get_full_dict(full)

In [18]:
# len(full_dict)

In [19]:
def add_labels(full_dict,crime_dict,max_dist=2):
    for id in full_dict.keys():
        label=0
        timestamp=full_dict[id]['timestamp']
        node_lat=full_dict[id]['latitude']
        node_lon=full_dict[id]['longitude']
        if timestamp in crime_dict.keys():
            crime_coords=crime_dict[timestamp]['coords']
            label = crime_occurrence_check((node_lat,node_lon),crime_coords,max_dist)
        full_dict[id]['label']=label
    return full_dict

In [20]:
#next(iter(full_dict.values()))

Note, the function below is not mine, it is pulled from [https://stackoverflow.com/questions/15736995/how-can-i-quickly-estimate-the-distance-between-two-latitude-longitude-points]

In [21]:
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    # Radius of earth in kilometers is 6371
    km = 6371* c
    return km

In [22]:
def crime_occurrence_check(node_coords,crime_coods_list,max_dist=2):
    crime_occurred=0
    for crime_coord in crime_coods_list:
        if haversine(node_coords[1],node_coords[0],crime_coord[1],crime_coord[0]) < max_dist:
            crime_occurred=1
            return crime_occurred
    return crime_occurred

In [23]:
# full_data = pd.DataFrame.from_dict(full_dict,orient='index')

To decide an appropriate value for max_dist, let's look at the average minimum distance between the current nodes. We choose this statistic, as it means on average, the closest node is that far away. Thus, by assigning crimes only if they are less than this distance, we will on average only assign the crime to the closest node. 

In [24]:
import statistics
def find_avg_node_dist(node_loc_list):
    distance_avgs=[]
    for index,row in node_loc_list.iterrows():
        base_lat=row[0]
        base_lon=row[1]
        distances=[]
        for index2,row2 in node_loc_list.iterrows():
            if index != index2:
                distances.append(haversine(base_lon,base_lat,row2[1],row2[0]))  
        distance_avgs.append(min([x for x in distances if x>0]))
    return math.ceil(statistics.mean(distance_avgs))

In [25]:
import math
a_dir = 'AoT_data/'
node_loc_list = pd.read_csv(a_dir+'node_loc_list.csv',index_col=0)
max_dist = math.ceil(find_avg_node_dist(node_loc_list))
print(max_dist,"km")

2 km


In [26]:
def get_full_set():
    a_dir = 'AoT_data/'
    node_loc_list = pd.read_csv(a_dir+'node_loc_list.csv',index_col=0)
    max_dist = math.ceil(find_avg_node_dist(node_loc_list))
    full_data= pd.read_csv(a_dir+'fixed_data.csv')
    full_data =full_data.loc[full_data['parameter']!='id']
    full_list=full_data.values.tolist()
    node_dict=node_loc_list.to_dict('index')
    full_dict=get_full_dict(full_list,node_dict)
    crime_dict=create_crime_dict()
    full_dict=add_labels(full_dict,crime_dict,max_dist)
    labeled_data = pd.DataFrame.from_dict(full_dict,orient='index')
    return labeled_data

In [27]:
# NOTE: This cell takes around 2 minutes to run but does all of the data preprocessing
import time
start=time.time()
labeled_data=get_full_set()
print("--- %s seconds ---" % (time.time() - start))

--- 127.46793913841248 seconds ---


In [28]:
print(labeled_data.shape)
labeled_data.head()

(546257, 145)


Unnamed: 0,timestamp,latitude,label,tsl250rd_intensity,pr103j2_temperature,node_id,hih4030_humidity,longitude,spv1840lr5h_b_intensity,mlx75305_intensity,...,microphone_octave_6_intensity,microphone_octave_8_intensity,microphone_octave_9_intensity,net_usb_rx,microphone_octave_7_intensity,microphone_octave_5_intensity,microphone_octave_3_intensity,microphone_octave_1_intensity,microphone_octave_2_intensity,microphone_octave_10_intensity
2017/03/28 17_001e0610ba46,2017/03/28 17,41.878377,1,1.639,28.159,001e0610ba46,47.155,-87.627678,0.0,37.366,...,,,,,,,,,,
2017/03/28 19_001e0610ba46,2017/03/28 19,41.878377,1,1.088,28.889,001e0610ba46,45.425,-87.627678,0.0,18.574,...,,,,,,,,,,
2017/03/28 20_001e0610ba46,2017/03/28 20,41.878377,1,0.863,30.43,001e0610ba46,43.239,-87.627678,0.0,12.016,...,,,,,,,,,,
2017/03/28 21_001e0610ba46,2017/03/28 21,41.878377,0,1.25,30.614,001e0610ba46,42.085,-87.627678,0.0,14.069,...,,,,,,,,,,
2017/03/28 22_001e0610ba46,2017/03/28 22,41.878377,1,3.289,30.834,001e0610ba46,41.477,-87.627678,0.0,23.893,...,,,,,,,,,,


In [29]:
labeled_data =labeled_data.fillna(0)

In [30]:
print(len(labeled_data[labeled_data['label']==0]))
len(labeled_data[labeled_data['label']==1])

409665


136592

## Split out the train, dev, and test set

At this point, we want to split the data into training, development, and test sets

In [31]:
from sklearn.model_selection import train_test_split

We want our X to be all values except the label,timestamp, and node_id as these should be the only non_numerical features

In [100]:
All_X = labeled_data.drop(['label','timestamp','node_id'],axis=1)
All_y = labeled_data['label']

In [101]:
All_X.head()

Unnamed: 0,latitude,tsl250rd_intensity,pr103j2_temperature,hih4030_humidity,longitude,spv1840lr5h_b_intensity,mlx75305_intensity,mma8452q_acceleration_z,ml8511_intensity,htu21d_temperature,...,microphone_octave_6_intensity,microphone_octave_8_intensity,microphone_octave_9_intensity,net_usb_rx,microphone_octave_7_intensity,microphone_octave_5_intensity,microphone_octave_3_intensity,microphone_octave_1_intensity,microphone_octave_2_intensity,microphone_octave_10_intensity
2017/03/28 17_001e0610ba46,41.878377,1.639,28.159,47.155,-87.627678,0.0,37.366,0.0,43.646,27.715,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017/03/28 19_001e0610ba46,41.878377,1.088,28.889,45.425,-87.627678,0.0,18.574,0.0,42.939,28.431,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017/03/28 20_001e0610ba46,41.878377,0.863,30.43,43.239,-87.627678,0.0,12.016,0.0,41.709,29.749,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017/03/28 21_001e0610ba46,41.878377,1.25,30.614,42.085,-87.627678,0.0,14.069,0.0,40.99,29.929,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017/03/28 22_001e0610ba46,41.878377,3.289,30.834,41.477,-87.627678,0.0,23.893,0.0,40.849,30.135,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [102]:
All_y.head()

2017/03/28 17_001e0610ba46    1
2017/03/28 19_001e0610ba46    1
2017/03/28 20_001e0610ba46    1
2017/03/28 21_001e0610ba46    0
2017/03/28 22_001e0610ba46    1
Name: label, dtype: int64

In [103]:
X,X_test,y,y_test=train_test_split(All_X,All_y,test_size=0.2,random_state=42)

We will now never ever look at X_test and y_test, and will leave them until the very end of the process

We do also want a development set, so we split again

In [104]:
X_train,X_dev,y_train,y_dev=train_test_split(X,y,test_size=0.1,random_state=42)

In [105]:
print('X_train: ',X_train.shape,'Y_train: ',y_train.shape)
print('X_dev: ',X_dev.shape, 'Y_dev: ',y_dev.shape)

X_train:  (393304, 142) Y_train:  (393304,)
X_dev:  (43701, 142) Y_dev:  (43701,)


Perfect! Now we have something we can work with!

In [106]:
X_train.describe()

Unnamed: 0,latitude,tsl250rd_intensity,pr103j2_temperature,hih4030_humidity,longitude,spv1840lr5h_b_intensity,mlx75305_intensity,mma8452q_acceleration_z,ml8511_intensity,htu21d_temperature,...,microphone_octave_6_intensity,microphone_octave_8_intensity,microphone_octave_9_intensity,net_usb_rx,microphone_octave_7_intensity,microphone_octave_5_intensity,microphone_octave_3_intensity,microphone_octave_1_intensity,microphone_octave_2_intensity,microphone_octave_10_intensity
count,393304.0,393304.0,393304.0,393304.0,393304.0,393304.0,393304.0,393304.0,393304.0,393304.0,...,393304.0,393304.0,393304.0,393304.0,393304.0,393304.0,393304.0,393304.0,393304.0,393304.0
mean,41.846957,14.619944,11.377222,70.570331,-87.659522,40.748382,455.059155,1.697578,209.773681,26.06331,...,-2.2901710000000002e+32,-2.374399e+32,-2.205548e+32,287375.7,-0.003797,516505.2,1033222.0,-517760.2,1169906.0,-0.014686
std,0.077617,16.318736,13.915346,22.178959,0.054524,27.995619,502.470859,68.010812,385.657505,41.522349,...,1.4362569999999999e+35,1.489079e+35,1.3831869999999998e+35,29695750.0,2.420317,323920800.0,647895000.0,324787000.0,733809700.0,9.210395
min,41.666078,-0.071,-54.9,0.0,-87.982901,0.0,-13.191,-1648.438,-143.631,-124.868,...,-9.007336e+37,-9.338605e+37,-8.67451e+37,0.0,-1517.678,-0.089,-0.089,-203686800000.0,-111308300.0,-5776.203
25%,41.78843,0.075,1.918,66.766,-87.683048,0.0,1.883,-0.977,42.457,1.583,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,41.857797,1.784,10.106,76.229,-87.665685,56.513,330.471,0.0,45.785,11.8495,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,41.912681,34.541,23.312,83.03225,-87.624179,58.977,687.858,15.208,52.005,26.85,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,41.994597,159.907,80.0,154.21,-87.536509,104.85,1449.643,108.793,1135.624,128.86,...,0.044,0.0,0.044,5456206000.0,24.53,203143500000.0,406320600000.0,49600210.0,460201000000.0,0.0


## Scaling the features

In [107]:
from sklearn.preprocessing import StandardScaler
X_train_scaled = StandardScaler().fit_transform(X_train)
X_dev_scaled = StandardScaler().fit_transform(X_dev)

In [39]:
print(len([x for x in y_train if x==1]), len([x for x in y_train if x==0]))

98496 294808


# Trying different algorithms

In [63]:
from sklearn.linear_model import LogisticRegression
#clf = LogisticRegression(C=0.001,random_state=0, solver='lbfgs',max_iter=1000)
clf = LogisticRegression()
clf.fit(X_train_scaled, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

With many models ahead, a function to evaluate our models is extremely helpful

In [87]:
def evaluate_model(clf):
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import recall_score
    from sklearn.metrics import precision_score

    y_train_pred=clf.predict(X_train_scaled)
    y_dev_pred=clf.predict(X_dev_scaled)

    print('train_acc: ', accuracy_score(y_train,y_train_pred),'dev_acc: ', accuracy_score(y_dev,y_dev_pred), '\n',
          'train_rec: ',recall_score(y_train,y_train_pred),'dev_rec: ',recall_score(y_dev,y_dev_pred), '\n',
          'train_pre: ',precision_score(y_train,y_train_pred),'dev_pre: ',precision_score(y_dev,y_dev_pred))  
    return (y_train_pred,y_dev_pred)

In [88]:
y_train_pred, y_dev_pred= evaluate_model(clf)

train_acc:  0.7488202510017696 dev_acc:  0.7450859248072127 
 train_rec:  0.022742040285899934 dev_rec:  0.03306531755352787 
 train_pre:  0.46920821114369504 dev_pre:  0.45579078455790784


In [89]:
print('total train examples: ', len(y_train_pred), '\n',
      'predicted positive train examples: ',len([y for y in y_train_pred if y >0.5]), '\n',
     'actual positive train examples: ',len([y for y in y_train if y >0.5]))

total train examples:  393304 
 predicted positive train examples:  4774 
 actual positive train examples:  98496


Let's see how a decision tree classifier does

In [127]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train_scaled, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [128]:
y_train_pred, y_dev_pred= evaluate_model(clf)

train_acc:  0.9991787523137319 dev_acc:  0.6056154321411409 
 train_rec:  0.9967511371020142 dev_rec:  0.42894570421899 
 train_pre:  0.9999694435673616 dev_pre:  0.30315413101774996


In [129]:
print('total train examples: ', len(y_train_pred), '\n',
      'predicted positive train examples: ',len([y for y in y_train_pred if y >0.5]), '\n',
     'actual positive train examples: ',len([y for y in y_train if y >0.5]))

total train examples:  393304 
 predicted positive train examples:  98179 
 actual positive train examples:  98496


This is clearly overfitting, so let's add some regularization

In [112]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0,max_depth=10)
clf.fit(X_train_scaled, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [113]:
y_train_pred, y_dev_pred= evaluate_model(clf)

train_acc:  0.7611542216707686 dev_acc:  0.7462071806137159 
 train_rec:  0.20025178687459388 dev_rec:  0.18583431204264161 
 train_pre:  0.5653033733627583 dev_pre:  0.4973404255319149


In [114]:
print('total train examples: ', len(y_train_pred), '\n',
      'predicted positive train examples: ',len([y for y in y_train_pred if y >0.5]), '\n',
     'actual positive train examples: ',len([y for y in y_train if y >0.5]))

total train examples:  393304 
 predicted positive train examples:  34891 
 actual positive train examples:  98496


Good! Overfitting was fixed, but now we're back to a pretty low recall. let's see if we can get away with any less regularization

In [118]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0,max_depth=15)
clf.fit(X_train_scaled, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [119]:
y_train_pred, y_dev_pred= evaluate_model(clf)

train_acc:  0.7851992351972011 dev_acc:  0.7280382599940505 
 train_rec:  0.27014294996751137 dev_rec:  0.22919866293251423 
 train_pre:  0.678740880567318 dev_pre:  0.4307300509337861


In [120]:
print('total train examples: ', len(y_train_pred), '\n',
      'predicted positive train examples: ',len([y for y in y_train_pred if y >0.5]), '\n',
     'actual positive train examples: ',len([y for y in y_train if y >0.5]))

total train examples:  393304 
 predicted positive train examples:  39202 
 actual positive train examples:  98496


Okay, that didn't help. Let's try a Random Forest Classifier

In [134]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100,max_depth=10,random_state=0)
clf.fit(X_train_scaled,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [135]:
y_train_pred, y_dev_pred= evaluate_model(clf)

train_acc:  0.753646034619531 dev_acc:  0.7489531132010709 
 train_rec:  0.03190992527615335 dev_rec:  0.02249525702412142 
 train_pre:  0.6712943186672362 dev_pre:  0.6225


In [136]:
print('total train examples: ', len(y_train_pred), '\n',
      'predicted positive train examples: ',len([y for y in y_train_pred if y >0.5]), '\n',
     'actual positive train examples: ',len([y for y in y_train if y >0.5]))

total train examples:  393304 
 predicted positive train examples:  4682 
 actual positive train examples:  98496


Now let's try a neural network

In [151]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
keras_model = Sequential()
keras_model.add(Dense(10, input_shape = (len(X_train_scaled[0]),),activation='relu'))
keras_model.add(Dense(10, activation='relu'))
keras_model.add(Dense(1, activation='sigmoid'))
keras_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [152]:
keras_model.fit(X_train_scaled, y_train, epochs=5,validation_data=(X_dev_scaled,y_dev))

Train on 393304 samples, validate on 43701 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f407d528320>

In [153]:
y_train_pred =keras_model.predict_classes(X_train_scaled)
y_dev_pred = keras_model.predict_classes(X_dev_scaled)

In [155]:
print('train_acc: ', accuracy_score(y_train,y_train_pred),'dev_acc: ', accuracy_score(y_dev,y_dev_pred), '\n',
          'train_rec: ',recall_score(y_train,y_train_pred),'dev_rec: ',recall_score(y_dev,y_dev_pred), '\n',
          'train_pre: ',precision_score(y_train,y_train_pred),'dev_pre: ',precision_score(y_dev,y_dev_pred))

train_acc:  0.7542104834936842 dev_acc:  0.7519507562756002 
 train_rec:  0.08778021442495126 dev_rec:  0.08257295148613245 
 train_pre:  0.5590327169274538 dev_pre:  0.5716072545340838


Let's see if a different loss function helps

In [170]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
keras_model = Sequential()
keras_model.add(Dense(10, input_shape = (len(X_train_scaled[0]),),activation='relu'))
keras_model.add(Dense(10, activation='relu'))
keras_model.add(Dense(1, activation='sigmoid'))
keras_model.compile(loss='mean_squared_error',optimizer='adam',metrics=['accuracy'])

In [171]:
keras_model.fit(X_train_scaled, y_train, epochs=5,validation_data=(X_dev_scaled,y_dev))

Train on 393304 samples, validate on 43701 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4105e0d390>

In [172]:
y_train_pred =keras_model.predict_classes(X_train_scaled)
y_dev_pred = keras_model.predict_classes(X_dev_scaled)

In [173]:
print('train_acc: ', accuracy_score(y_train,y_train_pred),'dev_acc: ', accuracy_score(y_dev,y_dev_pred), '\n',
          'train_rec: ',recall_score(y_train,y_train_pred),'dev_rec: ',recall_score(y_dev,y_dev_pred), '\n',
          'train_pre: ',precision_score(y_train,y_train_pred),'dev_pre: ',precision_score(y_dev,y_dev_pred))

train_acc:  0.7574725911762911 dev_acc:  0.7549712821216906 
 train_rec:  0.13067535737491878 dev_rec:  0.12720209594362633 
 train_pre:  0.5686828966553263 dev_pre:  0.5735234215885947


That's a great boost in recall performance so let's try another

In [164]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
keras_model = Sequential()
keras_model.add(Dense(10, input_shape = (len(X_train_scaled[0]),),activation='relu'))
keras_model.add(Dense(10, activation='relu'))
keras_model.add(Dense(1, activation='sigmoid'))
keras_model.compile(loss='squared_hinge',optimizer='adam',metrics=['accuracy'])

In [165]:
keras_model.fit(X_train_scaled, y_train, epochs=5,validation_data=(X_dev_scaled,y_dev))

Train on 393304 samples, validate on 43701 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f407f2ba710>

In [166]:
y_train_pred =keras_model.predict_classes(X_train_scaled)
y_dev_pred = keras_model.predict_classes(X_dev_scaled)

In [168]:
print('train_acc: ', accuracy_score(y_train,y_train_pred),'dev_acc: ', accuracy_score(y_dev,y_dev_pred), '\n',
          'train_rec: ',recall_score(y_train,y_train_pred),'dev_rec: ',recall_score(y_dev,y_dev_pred), '\n',
          'train_pre: ',precision_score(y_train,y_train_pred),'dev_pre: ',precision_score(y_dev,y_dev_pred))

train_acc:  0.25043223562435163 dev_acc:  0.25328939841193565 
 train_rec:  1.0 dev_rec:  1.0 
 train_pre:  0.25043223562435163 dev_pre:  0.25328939841193565


In [169]:
print('total train examples: ', len(y_train_pred), '\n',
      'predicted positive train examples: ',len([y for y in y_train_pred if y >0.5]), '\n',
     'actual positive train examples: ',len([y for y in y_train if y >0.5]))

total train examples:  393304 
 predicted positive train examples:  393304 
 actual positive train examples:  98496


Let's evaluate our mean_squared_error neural network on the test set

In [177]:
X_test_scaled = StandardScaler().fit_transform(X_test)
y_test_pred= keras_model.predict_classes(X_test_scaled)
print('test_acc: ', accuracy_score(y_test,y_test_pred), '\n',
          'test_rec: ',recall_score(y_test,y_test_pred), '\n',
          'test_pre: ',precision_score(y_test,y_test_pred))

test_acc:  0.760050159264819 
 test_rec:  0.12857512857512857 
 test_pre:  0.5661453242098403


## TO-DO
* Look at what the best options for increasing recall are
* look at what other steps might be taken to modify the data in preprocessing
* find a balanced model
* evaluate all on the test set