In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
np.random.seed(42)

from matplotlib import rcParams
rcParams.update({'figure.autolayout':True})

plt.style.use(["presentation"])

%matplotlib inline

In [2]:
prediction_data=pd.read_csv('./assets/test.csv')

In [3]:
working_data=pd.read_csv('./assets/train.csv')

In [4]:
NOAA=pd.read_csv('./assets/weather.csv')

In [5]:
spray=pd.read_csv('./assets/spray.csv')

In [6]:
train, test=train_test_split(working_data, test_size=.20, random_state=523)

In [7]:
spray.head(3)

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163
2,2011-08-29,6:57:18 PM,42.391022,-88.089157


In [8]:
working_data.head(3)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0


In [9]:
NOAA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 22 columns):
Station        2944 non-null int64
Date           2944 non-null object
Tmax           2944 non-null int64
Tmin           2944 non-null int64
Tavg           2944 non-null object
Depart         2944 non-null object
DewPoint       2944 non-null int64
WetBulb        2944 non-null object
Heat           2944 non-null object
Cool           2944 non-null object
Sunrise        2944 non-null object
Sunset         2944 non-null object
CodeSum        2944 non-null object
Depth          2944 non-null object
Water1         2944 non-null object
SnowFall       2944 non-null object
PrecipTotal    2944 non-null object
StnPressure    2944 non-null object
SeaLevel       2944 non-null object
ResultSpeed    2944 non-null float64
ResultDir      2944 non-null int64
AvgSpeed       2944 non-null object
dtypes: float64(1), int64(5), object(16)
memory usage: 506.1+ KB


In [10]:
# ## We can tackle the multiple addresses and the grouping in this one step
# traps = working_data[['Trap',
#                'Latitude',
#                'Longitude',
#                'Date',
# #                'Species',
#                'WnvPresent',
#                'NumMosquitos']].groupby(['Trap',
#                                          'Latitude',
#                                          'Longitude',
#                                          'Date', 
#                                          'Species',
#                                          'WnvPresent']).agg({'NumMosquitos':np.sum}).reset_index()

# feats = traps.columns

# # ## get species dummies and add to traps df
# # # traps = pd.get_dummies(traps, columns=['Species'])

# # ## convert to date
# # traps.Date = pd.to_datetime(traps.Date)

In [11]:
from haversine import haversine

In [12]:
def clean(df):
    df['Date']=pd.to_datetime(df['Date'])
    dummies=pd.get_dummies(df['Species']).drop(['CULEX TERRITANS'], axis=1)
    df=pd.concat([df, dummies], axis=1)
    df['Location']=list(zip(df['Latitude'],df['Longitude']))
    df=df.drop(['Address','Block', 'Street', 'Trap', 
                'AddressNumberAndStreet', 'AddressAccuracy','Species'], axis=1)
                    
    return df

In [13]:
df = clean(working_data)

In [14]:
df.head(2)

Unnamed: 0,Date,Latitude,Longitude,NumMosquitos,WnvPresent,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,Location
0,2007-05-29,41.95469,-87.800991,1,0,0,0,1,0,0,0,"(41.95469, -87.800991)"
1,2007-05-29,41.95469,-87.800991,1,0,0,0,0,1,0,0,"(41.95469, -87.800991)"


In [15]:
spray.head(3)

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163
2,2011-08-29,6:57:18 PM,42.391022,-88.089157


In [16]:
spray['Location']=list(zip(spray['Latitude'],spray['Longitude']))

In [17]:
spray.drop_duplicates(['Location'], keep='last',inplace=True)

In [18]:
spray.head(2)



Unnamed: 0,Date,Time,Latitude,Longitude,Location
0,2011-08-29,6:56:58 PM,42.391623,-88.089163,"(42.3916233333333, -88.0891633333333)"
1,2011-08-29,6:57:08 PM,42.391348,-88.089163,"(42.3913483333333, -88.0891633333333)"


In [19]:
df.shape

(10506, 12)

In [20]:
df.head(2)

Unnamed: 0,Date,Latitude,Longitude,NumMosquitos,WnvPresent,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,Location
0,2007-05-29,41.95469,-87.800991,1,0,0,0,1,0,0,0,"(41.95469, -87.800991)"
1,2007-05-29,41.95469,-87.800991,1,0,0,0,0,1,0,0,"(41.95469, -87.800991)"


In [1]:

n = df.Location.shape[0]
dist1 = []

for i in range(n):
    spray1 = (spray['Latitude'][i], spray['Longitude'][i])
    trap = (df['Latitude'][i], df['Longitude'][i])
    distance = haversine(spray, trap)
    dist.append(distance)
#     print(spray_a, spray_b)

# dist
df['Dist'].loc[i] = min(dist)

NameError: name 'df' is not defined

In [3]:
n = spray.shape[0]
dist = []

for i in range(n):
    spray_a = (spray['Latitude'][i], spray['Longitude'][i])
    trap = (spray['Latitude'][i+1], spray['Longitude'][i+1])
    distance = haversine(spray_a, spray_b)
    dist.append(distance)
    print(spray_a, spray_b)

dist
# df_dist = pd.DataFrame(dist)

NameError: name 'spray' is not defined

In [24]:
n = spray.shape[0]
dist = []

for i in range(n):
    spray_a = (spray['Latitude'][i], spray['Longitude'][i])
    spray_b = (spray['Latitude'][i+1], spray['Longitude'][i+1])
    distance = haversine(spray_a, spray_b)
    dist.append(distance)
    print(spray_a, spray_b)

dist
# df_dist = pd.DataFrame(dist)

(42.3916233333333, -88.0891633333333) (42.3913483333333, -88.0891633333333)
(42.3913483333333, -88.0891633333333) (42.3910216666667, -88.0891566666667)
(42.3910216666667, -88.0891566666667) (42.3906366666667, -88.0891583333333)
(42.3906366666667, -88.0891583333333) (42.390409999999996, -88.0888583333333)
(42.390409999999996, -88.0888583333333) (42.390395, -88.08831500000001)
(42.390395, -88.08831500000001) (42.3906733333333, -88.0880016666667)
(42.3906733333333, -88.0880016666667) (42.391026666666704, -88.0880016666667)
(42.391026666666704, -88.0880016666667) (42.391403333333294, -88.0880033333333)
(42.391403333333294, -88.0880033333333) (42.391718333333294, -88.08799499999999)
(42.391718333333294, -88.08799499999999) (42.392038333333296, -88.0879883333333)
(42.392038333333296, -88.0879883333333) (42.39241, -88.0880783333333)
(42.39241, -88.0880783333333) (42.3928, -88.0883316666667)
(42.3928, -88.0883316666667) (42.3932566666667, -88.08837)
(42.3932566666667, -88.08837) (42.3936, -88.

KeyError: 484

In [25]:
dist

[0.030578604827916633,
 0.03632780187907335,
 0.042810265567791175,
 0.035245646636518475,
 0.04465243716891745,
 0.04024942082374431,
 0.03928887408906678,
 0.04188364600814101,
 0.035033086960579544,
 0.03558658822846801,
 0.041983156185129086,
 0.048098128269290955,
 0.05087650135408001,
 0.03983125768754378,
 0.046650455554722366,
 0.04658030973880674,
 0.03600772157830849,
 0.04124553508529052,
 0.046764593452226014,
 0.0328955120811162,
 0.03655431994245814,
 0.038613762190426976,
 0.033802472174902565,
 0.04898910911321127,
 0.058868146808056375,
 0.03272018528916501,
 0.044227839781246656,
 0.05282932171193629,
 0.04560794267711418,
 0.036321565112449855,
 0.038530584239526405,
 0.042706137280479506,
 0.041570491948115115,
 0.042489024137801014,
 0.03608560087702006,
 0.042032220943050015,
 0.04055436589108848,
 0.03675401788530221,
 0.038695037647749904,
 0.03249165430525294,
 0.08526745688634588,
 0.03361602491466193,
 0.035193130581190474,
 0.03632470743935922,
 0.0382491588

In [None]:
n = df.Location.shape[0]
dist = []

for i in range(n):
    spray1 = (spray['Location'][i])
    trap = (df['Location'][i+1])
    distance = haversine(spray, trap)
    dist.append(distance)
#     print(spray_a, spray_b)

# dist
df['Dist'].loc[i] = min(dist)

In [5]:
def measure_to_standing(input_location):
    distances=[]
    for standing_spot_location in standing:
        standing_spot_distance=haversine(standing_spot_location, input_location)
        distances+=[standing_spot_distance]
    return min(distances)
        

train['dist_to_standing_water']=train['Location'].map(measure_to_standing)
test['dist_to_standing_water']=train['Location'].map(measure_to_standing)
prediction_data['dist_to_standing_water']=train['Location'].map(measure_to_standing)

NameError: name 'train' is not defined

In [None]:
def dist_spray_to_trap(input_location, df):
    distances=[]     
    spray_locations=df['Location']
    for location in enumerate(spray_locations):   
        
        spray_distance=haversine(location, locations)
        distances+=[spray_distance]
    return min(distances)
        

# df['dist_spray_to_trap']=df['Location'].map(dist_spray_to_trap(input_location))

In [None]:
dist_spray_to_trap(input_location, df)

In [4]:
standing

NameError: name 'standing' is not defined

In [None]:
df['dist_spray_to_trap']=df['Location'].map(dist_spray_to_trap(input_location))

In [None]:
train['dist_to_standing_water']=train['Location'].map(measure_to_standing)
test['dist_to_standing_water']=train['Location'].map(measure_to_standing)
prediction_data['dist_to_standing_water']=train['Location'].map(measure_to_standing)

In [None]:
df.head(2)

In [None]:
spray.head(2)

In [None]:
trap_location=spray['Location']

In [None]:
from haversine import haversine


In [None]:
from haversine import haversine
# def distance_calc(trap_location ,spray_location):
dist = []
for i in trap_location:
    for j in range(0,len(trap_location)):
#             spray_lat = spray_temp.Latitude.iloc[j] 
#             spray_long = spray_temp.Longitude.iloc[j]
#             a = (trap_lat, trap_long) # trap coordinates
#             b = (spray_lat, spray_long) # spray coordinates
        dist.append(haversine(trap_location, spray_location),miles=True) # calculate the distance between the points
df['Dist'].loc[i] = min(dist)
    

In [None]:
df= clean(working_data
         )

In [None]:
traps.head(3)


In [None]:
>>> from haversine import haversine
>>> lyon = (45.7597, 4.8422)
>>> paris = (48.8567, 2.3508)
>>> haversine(lyon, paris)
392.00124794121825  # in kilometers
>>> haversine(lyon, paris, miles=True)
243.589575470673  # in miles