In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
np.random.seed(42)

from matplotlib import rcParams
rcParams.update({'figure.autolayout':True})

# "presentation" is not one of matplotlib's bundled style sheets; it only
# resolves when a presentation.mplstyle file is installed in the user's
# stylelib. Apply it when available and keep the defaults otherwise, so the
# notebook still runs from a clean environment.
if "presentation" in plt.style.available:
    plt.style.use(["presentation"])

%matplotlib inline

In [2]:
# Load ./assets/test.csv into `prediction_data` (the frame that later cells
# attach distance features to).
prediction_data=pd.read_csv('./assets/test.csv')

In [3]:
# Load ./assets/train.csv; this frame is both split into train/test and passed
# to clean() further down.
working_data=pd.read_csv('./assets/train.csv')

In [4]:
# Load ./assets/weather.csv as `NOAA`.
# Most columns come back as `object` dtype (see the NOAA.info() output), so
# fields such as Tavg or PrecipTotal would need converting before numeric use.
NOAA=pd.read_csv('./assets/weather.csv')

In [5]:
# Load ./assets/spray.csv (columns Date, Time, Latitude, Longitude per the
# head() output).
spray=pd.read_csv('./assets/spray.csv')

In [6]:
# Hold out 20% of the raw training rows, with a fixed random_state so the split
# is repeatable.
# NOTE(review): the split is taken before clean() runs, so `train`/`test` do not
# carry the Location column that clean() adds; cells that read
# train['Location'] need these frames cleaned first.
train, test=train_test_split(working_data, test_size=.20, random_state=523)

In [7]:
spray.head(3)

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163
2,2011-08-29,6:57:18 PM,42.391022,-88.089157


In [8]:
working_data.head(3)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0


In [9]:
NOAA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 22 columns):
Station        2944 non-null int64
Date           2944 non-null object
Tmax           2944 non-null int64
Tmin           2944 non-null int64
Tavg           2944 non-null object
Depart         2944 non-null object
DewPoint       2944 non-null int64
WetBulb        2944 non-null object
Heat           2944 non-null object
Cool           2944 non-null object
Sunrise        2944 non-null object
Sunset         2944 non-null object
CodeSum        2944 non-null object
Depth          2944 non-null object
Water1         2944 non-null object
SnowFall       2944 non-null object
PrecipTotal    2944 non-null object
StnPressure    2944 non-null object
SeaLevel       2944 non-null object
ResultSpeed    2944 non-null float64
ResultDir      2944 non-null int64
AvgSpeed       2944 non-null object
dtypes: float64(1), int64(5), object(16)
memory usage: 506.1+ KB


In [None]:
# ## Disabled draft: resolve the duplicate address rows and aggregate NumMosquitos per trap/date in one groupby step
# traps = working_data[['Trap',
#                'Latitude',
#                'Longitude',
#                'Date',
# #                'Species',
#                'WnvPresent',
#                'NumMosquitos']].groupby(['Trap',
#                                          'Latitude',
#                                          'Longitude',
#                                          'Date', 
#                                          'Species',
#                                          'WnvPresent']).agg({'NumMosquitos':np.sum}).reset_index()

# feats = traps.columns

# # ## get species dummies and add to traps df
# # # traps = pd.get_dummies(traps, columns=['Species'])

# # ## convert to date
# # traps.Date = pd.to_datetime(traps.Date)

In [10]:
from haversine import haversine

In [11]:
def clean(df):
    """Return a cleaned copy of a trap-observation frame.

    The input frame is left untouched; all changes are made on a copy.

    Steps:
      * parse ``Date`` into datetimes;
      * one-hot encode ``Species`` and drop the ``CULEX TERRITANS`` indicator
        when it is present;
      * add ``Location`` as a ``(Latitude, Longitude)`` tuple per row;
      * drop the address/trap identifier columns and the raw ``Species``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``Date``, ``Species``, ``Latitude`` and
        ``Longitude`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.
    """
    # Work on a copy so the caller's frame (e.g. the raw CSV) is not mutated.
    cleaned = df.copy()
    cleaned['Date'] = pd.to_datetime(cleaned['Date'])

    # errors='ignore' keeps this usable on frames where the dropped species
    # never occurs (for example a small sample or a different data split).
    dummies = pd.get_dummies(cleaned['Species']).drop(
        columns=['CULEX TERRITANS'], errors='ignore')
    cleaned = pd.concat([cleaned, dummies], axis=1)

    cleaned['Location'] = list(zip(cleaned['Latitude'], cleaned['Longitude']))

    # Tolerate frames that lack some of these descriptive columns.
    cleaned = cleaned.drop(
        columns=['Address', 'Block', 'Street', 'Trap',
                 'AddressNumberAndStreet', 'AddressAccuracy', 'Species'],
        errors='ignore')

    return cleaned

In [12]:
# Clean the full training frame (not the train/test split made earlier).
df = clean(working_data)

In [13]:
df.head(2)

Unnamed: 0,Date,Latitude,Longitude,NumMosquitos,WnvPresent,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,Location
0,2007-05-29,41.95469,-87.800991,1,0,0,0,1,0,0,0,"(41.95469, -87.800991)"
1,2007-05-29,41.95469,-87.800991,1,0,0,0,0,1,0,0,"(41.95469, -87.800991)"


In [None]:
spray.head(3)

In [14]:
# Pair each spray record's coordinates into a (latitude, longitude) tuple, the
# point format handed to haversine() in the distance cells.
spray['Location']=list(zip(spray['Latitude'],spray['Longitude']))

In [15]:
# Keep one row per distinct spray coordinate (the last occurrence). The index
# keeps its original labels, so it has gaps afterwards, and the Date/Time of
# earlier rows at the same coordinate are discarded.
spray = spray.drop_duplicates(subset=['Location'], keep='last')

In [17]:
df.shape

(10506, 12)

In [18]:
df.head(2)

Unnamed: 0,Date,Latitude,Longitude,NumMosquitos,WnvPresent,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,Location
0,2007-05-29,41.95469,-87.800991,1,0,0,0,1,0,0,0,"(41.95469, -87.800991)"
1,2007-05-29,41.95469,-87.800991,1,0,0,0,0,1,0,0,"(41.95469, -87.800991)"


In [16]:
spray.head(2)



Unnamed: 0,Date,Time,Latitude,Longitude,Location
0,2011-08-29,6:56:58 PM,42.391623,-88.089163,"(42.3916233333333, -88.0891633333333)"
1,2011-08-29,6:57:08 PM,42.391348,-88.089163,"(42.3913483333333, -88.0891633333333)"


In [19]:
# NOTE: despite the name, `traps` holds the spray coordinates
# (spray['Location']), not trap coordinates; measure_to_spray() iterates over it.
traps =spray['Location'].dropna()

In [20]:
def measure_to_spray(input_location, spray_locations=None):
    """Return the distance from ``input_location`` to the nearest spray point.

    Parameters
    ----------
    input_location : tuple
        ``(latitude, longitude)`` pair, passed straight to ``haversine``.
    spray_locations : iterable of tuple, optional
        Candidate ``(latitude, longitude)`` pairs. When omitted, the
        notebook-level ``traps`` variable is used (which, despite its name, is
        built from ``spray['Location']``).

    Returns
    -------
    float
        The smallest ``haversine`` distance found, in haversine's default unit
        (kilometres, per the library's usage example).

    Raises
    ------
    ValueError
        If there are no spray locations to compare against.
    """
    if spray_locations is None:
        # Backward-compatible default: the global the original relied on.
        spray_locations = traps

    # A generator avoids building a throw-away list of every distance.
    nearest = min(
        (haversine(spray_location, input_location)
         for spray_location in spray_locations),
        default=None,
    )
    if nearest is None:
        raise ValueError("no spray locations to measure against")
    return nearest
        

# Many rows share the same trap coordinates, so measure each distinct location
# once and look the result up per row, instead of re-scanning every spray point
# for every row.
# NOTE: despite the column name, this is the distance to the nearest *spray*
# location (`traps` is built from spray['Location']).
nearest_spray = {loc: measure_to_spray(loc) for loc in set(df['Location'])}
df['dist_to_traps']=df['Location'].map(nearest_spray.get)
# test['dist_to_standing_water']=train['Location'].map(measure_to_standing)
# prediction_data['dist_to_standing_water']=train['Location'].map(measure_to_standing)

In [21]:
df.head()

Unnamed: 0,Date,Latitude,Longitude,NumMosquitos,WnvPresent,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,Location,dist_to_traps
0,2007-05-29,41.95469,-87.800991,1,0,0,0,1,0,0,0,"(41.95469, -87.800991)",0.256049
1,2007-05-29,41.95469,-87.800991,1,0,0,0,0,1,0,0,"(41.95469, -87.800991)",0.256049
2,2007-05-29,41.994991,-87.769279,1,0,0,0,0,1,0,0,"(41.994991, -87.769279)",0.336892
3,2007-05-29,41.974089,-87.824812,1,0,0,0,1,0,0,0,"(41.974089, -87.824812)",0.011196
4,2007-05-29,41.974089,-87.824812,4,0,0,0,0,1,0,0,"(41.974089, -87.824812)",0.011196


In [None]:

# For every trap row, store the distance to its nearest spray point in df['Dist'].
#
# The earlier draft paired the i-th spray row with the i-th trap row, passed the
# whole `spray` DataFrame to haversine(), appended to an undefined list and
# wrote through chained indexing into a column that did not exist yet.
spray_points = list(zip(spray['Latitude'], spray['Longitude']))
trap_points = list(zip(df['Latitude'], df['Longitude']))

# Compare each distinct trap coordinate against *all* spray points once.
nearest = {
    trap: min(haversine(spray_point, trap) for spray_point in spray_points)
    for trap in set(trap_points)
}

# Plain column assignment creates 'Dist' and avoids chained-assignment pitfalls.
df['Dist'] = [nearest[trap] for trap in trap_points]

In [None]:
# Distance between each spray record and the one that follows it.
spray_points = list(zip(spray['Latitude'], spray['Longitude']))
n = len(spray_points)
dist = []

# Pair rows positionally: label lookups such as spray['Latitude'][i] fail once
# drop_duplicates has left gaps in the index, and i+1 ran past the last row.
for spray_a, spray_b in zip(spray_points, spray_points[1:]):
    distance = haversine(spray_a, spray_b)
    dist.append(distance)
    print(spray_a, spray_b)

dist
# df_dist = pd.DataFrame(dist)

In [None]:
# Distance between each spray record and the one that follows it.
# Coordinates are pulled into a positional list first, because label lookups
# such as spray['Latitude'][i] fail once drop_duplicates has left gaps in the
# index.
spray_points = list(zip(spray['Latitude'], spray['Longitude']))
n = len(spray_points)
dist = []

# Stop one short of the end: the last record has no successor to pair with.
for i in range(n - 1):
    spray_a = spray_points[i]
    spray_b = spray_points[i + 1]
    distance = haversine(spray_a, spray_b)
    dist.append(distance)
    print(spray_a, spray_b)

dist
# df_dist = pd.DataFrame(dist)

In [None]:
dist

In [None]:
# For every trap row, store the distance to its nearest spray point in df['Dist'].
#
# The earlier draft compared spray row i with trap row i+1 only, passed the
# whole `spray` DataFrame to haversine() and assigned through chained indexing.
n = df.Location.shape[0]
spray_points = list(spray['Location'])

# Compare each distinct trap coordinate against *all* spray points once.
nearest = {
    trap: min(haversine(spray_point, trap) for spray_point in spray_points)
    for trap in set(df['Location'])
}

dist = [nearest[trap] for trap in df['Location']]
df['Dist'] = dist

In [None]:
def measure_to_standing(input_location):
    """Return the smallest haversine distance from ``input_location`` to any
    entry of ``standing``.

    ``standing`` is read from the enclosing notebook namespace and iterated
    as-is; each entry is passed to ``haversine`` together with
    ``input_location``.
    """
    return min([
        haversine(spot, input_location)
        for spot in standing
    ])
        

# Each frame is measured from its *own* Location column. Assigning a Series
# derived from train['Location'] to another frame aligns on index labels, which
# yields NaN or another row's distance rather than that frame's distance.
# NOTE(review): in the visible cells only the output of clean() has a Location
# column; train, test and prediction_data need it added before this runs.
train['dist_to_standing_water']=train['Location'].map(measure_to_standing)
test['dist_to_standing_water']=test['Location'].map(measure_to_standing)
prediction_data['dist_to_standing_water']=prediction_data['Location'].map(measure_to_standing)

In [1]:
def dist_spray_to_trap(input_location, df):
    """Return the distance from ``input_location`` to the nearest location in ``df``.

    Parameters
    ----------
    input_location : tuple
        ``(latitude, longitude)`` pair, passed straight to ``haversine``.
    df : pandas.DataFrame
        Frame whose ``Location`` column holds the ``(latitude, longitude)``
        tuples to measure against (the spray frame, going by the local name
        ``spray_locations``).

    Returns
    -------
    float
        The smallest ``haversine`` distance between ``input_location`` and any
        entry of ``df['Location']``.

    Raises
    ------
    ValueError
        If ``df['Location']`` is empty.
    """
    spray_locations = df['Location']
    # Iterate the locations directly: enumerate() would yield (index, location)
    # pairs, which are not valid points for haversine().
    nearest = min(
        (haversine(spray_location, input_location)
         for spray_location in spray_locations),
        default=None,
    )
    if nearest is None:
        raise ValueError("df['Location'] is empty; nothing to measure against")
    return nearest
        

# Series.map needs a callable. Wrap the call so each row's location is passed
# as `input_location` and the spray frame supplies the candidate locations.
train['dist_spray_to_trap']=train['Location'].map(lambda location: dist_spray_to_trap(location, spray))

NameError: name 'train' is not defined

In [None]:
# Spot-check on the first trap location: `input_location` was never defined,
# and the second argument is the frame of spray locations, not the trap frame.
dist_spray_to_trap(df['Location'].iloc[0], spray)

In [None]:
# NOTE(review): `standing` is not assigned in any cell visible here; it must be
# defined (measure_to_standing() iterates over it) before this cell can run.
standing

In [None]:
# Series.map needs a callable. Wrap the call so each row's location is passed
# as `input_location` and the spray frame supplies the candidate locations.
df['dist_spray_to_trap']=df['Location'].map(lambda location: dist_spray_to_trap(location, spray))

In [None]:
# Each frame is measured from its *own* Location column. Assigning a Series
# derived from train['Location'] to another frame aligns on index labels, which
# yields NaN or another row's distance rather than that frame's distance.
# NOTE(review): in the visible cells only the output of clean() has a Location
# column; train, test and prediction_data need it added before this runs.
train['dist_to_standing_water']=train['Location'].map(measure_to_standing)
test['dist_to_standing_water']=test['Location'].map(measure_to_standing)
prediction_data['dist_to_standing_water']=prediction_data['Location'].map(measure_to_standing)

In [None]:
df.head(2)

In [None]:
spray.head(2)

In [None]:
# NOTE: `trap_location` is an alias for the spray coordinates
# (spray['Location']), not trap coordinates.
trap_location=spray['Location']

In [None]:
from haversine import haversine


In [None]:
from haversine import haversine
# def distance_calc(trap_location ,spray_location):
# For every trap row, store the distance in miles to its nearest spray point.
#
# The earlier draft passed whole Series to haversine(), referenced an undefined
# `spray_location`, handed `miles=True` to list.append() instead of haversine()
# and wrote the result through chained indexing.
# NOTE(review): `miles=True` follows the haversine usage shown in this
# notebook; confirm it matches the installed haversine version.
spray_points = list(trap_location)  # trap_location is spray['Location']

# Compare each distinct trap coordinate against all spray points once.
nearest = {
    trap: min(haversine(trap, spray_point, miles=True)
              for spray_point in spray_points)
    for trap in set(df['Location'])
}

dist = [nearest[trap] for trap in df['Location']]
df['Dist'] = dist
    

In [None]:
# Rebuild the cleaned frame from the full training data.
df = clean(working_data)

In [None]:
traps.head(3)


In [None]:
# Unit sanity check for haversine() on a known city pair. The values are
# computed and displayed rather than pasted in as literals.
from haversine import haversine
lyon = (45.7597, 4.8422)
paris = (48.8567, 2.3508)
km = haversine(lyon, paris)                  # expected about 392.001 (kilometres)
miles = haversine(lyon, paris, miles=True)   # expected about 243.590 (miles)
km, miles