In [1]:
import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians

# Train dataset

In [274]:
train_df = pd.read_csv("./datasets/train.csv")

In [275]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Date                    10506 non-null  object 
 1   Address                 10506 non-null  object 
 2   Species                 10506 non-null  object 
 3   Block                   10506 non-null  int64  
 4   Street                  10506 non-null  object 
 5   Trap                    10506 non-null  object 
 6   AddressNumberAndStreet  10506 non-null  object 
 7   Latitude                10506 non-null  float64
 8   Longitude               10506 non-null  float64
 9   AddressAccuracy         10506 non-null  int64  
 10  NumMosquitos            10506 non-null  int64  
 11  WnvPresent              10506 non-null  int64  
dtypes: float64(2), int64(4), object(6)
memory usage: 985.1+ KB


In [276]:
train_df.shape

(10506, 12)

In [277]:
#~94.8% of rows were negative for WNV (West Nile virus)
train_df.WnvPresent.value_counts(normalize=True)

0    0.947554
1    0.052446
Name: WnvPresent, dtype: float64

In [278]:
train_df["Species"].value_counts(normalize=True)

CULEX PIPIENS/RESTUANS    0.452313
CULEX RESTUANS            0.260803
CULEX PIPIENS             0.256901
CULEX TERRITANS           0.021131
CULEX SALINARIUS          0.008186
CULEX TARSALIS            0.000571
CULEX ERRATICUS           0.000095
Name: Species, dtype: float64

In [279]:
#count rows with exactly 50 mosquitos (NumMosquitos == 50); presumably 50 is the per-row cap - TODO confirm
len(train_df[train_df['NumMosquitos'] == 50])

1019

In [280]:
# Every column except the mosquito count becomes a grouping key.
groupby_cols = train_df.columns[train_df.columns != "NumMosquitos"].tolist()

In [281]:
# Collapse rows that agree on every key column into one row whose
# NumMosquitos is the sum over the collapsed rows.
train_df = train_df.groupby(groupby_cols, as_index=False)["NumMosquitos"].sum()

In [282]:
train_df.shape

(8610, 12)

In [283]:
train_df.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,WnvPresent,NumMosquitos
0,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX PIPIENS/RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,0,1
1,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,0,2
2,2007-05-29,"1100 South Peoria Street, Chicago, IL 60608, USA",CULEX RESTUANS,11,S PEORIA ST,T091,"1100 S PEORIA ST, Chicago, IL",41.862292,-87.64886,8,0,1
3,2007-05-29,"1100 West Chicago Avenue, Chicago, IL 60642, USA",CULEX RESTUANS,11,W CHICAGO,T049,"1100 W CHICAGO, Chicago, IL",41.896282,-87.655232,8,0,1
4,2007-05-29,"1500 North Long Avenue, Chicago, IL 60651, USA",CULEX RESTUANS,15,N LONG AVE,T153,"1500 N LONG AVE, Chicago, IL",41.907645,-87.760886,8,0,1


In [284]:
# Parse the Date strings into datetime64 values so the .dt accessor can be used on the column.
train_df["Date"] = pd.to_datetime(train_df["Date"])

In [285]:
train_df["Day"] = train_df["Date"].dt.day

In [286]:
train_df["Month"] = train_df['Date'].dt.month

In [287]:
train_df["Year"] = train_df["Date"].dt.year

In [288]:
train_df.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,WnvPresent,NumMosquitos,Day,Month,Year
0,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX PIPIENS/RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,0,1,29,5,2007
1,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,0,2,29,5,2007
2,2007-05-29,"1100 South Peoria Street, Chicago, IL 60608, USA",CULEX RESTUANS,11,S PEORIA ST,T091,"1100 S PEORIA ST, Chicago, IL",41.862292,-87.64886,8,0,1,29,5,2007
3,2007-05-29,"1100 West Chicago Avenue, Chicago, IL 60642, USA",CULEX RESTUANS,11,W CHICAGO,T049,"1100 W CHICAGO, Chicago, IL",41.896282,-87.655232,8,0,1,29,5,2007
4,2007-05-29,"1500 North Long Avenue, Chicago, IL 60651, USA",CULEX RESTUANS,15,N LONG AVE,T153,"1500 N LONG AVE, Chicago, IL",41.907645,-87.760886,8,0,1,29,5,2007


In [289]:
train_df['Year'].value_counts(normalize=True)

2007    0.329501
2013    0.239024
2009    0.223113
2011    0.208362
Name: Year, dtype: float64

In [290]:
train_df['Month'].value_counts(normalize=True)

8     0.329268
7     0.246225
9     0.229152
6     0.155285
10    0.030662
5     0.009408
Name: Month, dtype: float64

In [291]:
train_df['Day'].value_counts(normalize=True)

12    0.082811
19    0.066086
1     0.065505
15    0.062718
24    0.057724
25    0.057375
17    0.044019
7     0.042509
26    0.040534
5     0.037398
29    0.036934
21    0.032985
8     0.032753
2     0.032636
4     0.031359
11    0.030662
6     0.029501
27    0.028339
22    0.026481
18    0.024274
14    0.023345
30    0.016841
28    0.015912
13    0.015563
10    0.015215
16    0.013937
31    0.013705
23    0.011614
3     0.006736
9     0.004530
Name: Day, dtype: float64

In [292]:
#Create function to calculate distance
def distance(lat1, lon1, lat2, lon2, radius=6373.0):
    """Great-circle (haversine) distance between two latitude/longitude points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6373.0 (approximate Earth radius in kilometres).

    Returns
    -------
    float
        The distance, rounded to 3 decimal places. NaN inputs give NaN.
    """
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Floating-point rounding can leave `a` a hair above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError. A plain comparison
    # is used (not min/max) so that NaN still propagates unchanged.
    if a > 1.0:
        a = 1.0
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    # No local named `distance`: that would shadow the function itself.
    return round(radius * c, 3)

    

In [268]:
def nearest_station(df):
    """Label every row of ``df`` with the nearer of the two weather stations.

    Adds (or overwrites) an integer ``Station`` column on ``df`` in place:
    1 for O'Hare, 2 for Midway. A row is assigned station 2 only when its
    distance to station 1 is strictly greater, so ties (and rows whose
    coordinates are NaN) fall to station 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Latitude`` and ``Longitude`` columns in decimal
        degrees. Any index is accepted; rows are never added or reordered.

    Returns
    -------
    None
        The frame is modified in place.
    """
    weather_station = {
        "station_1": {
            "name": "CHICAGO O'HARE INTERNATIONAL AIRPORT",
            "lat": 41.995,
            "lon": -87.933,
            "elev": 662
        },
        "station_2": {
            "name": "CHICAGO MIDWAY INTL ARPT",
            "lat": 41.786,
            "lon": -87.752,
            "elev": 612
        }
    }

    # Same sphere radius and 3-decimal rounding as the scalar distance()
    # helper in this notebook, applied to whole columns at once.
    earth_radius_km = 6373.0
    lat = np.radians(df["Latitude"].to_numpy(dtype=float))
    lon = np.radians(df["Longitude"].to_numpy(dtype=float))

    def _km_to(station):
        """Haversine distance (km) from every row to one station."""
        stn_lat = np.radians(station["lat"])
        stn_lon = np.radians(station["lon"])
        a = (np.sin((stn_lat - lat) / 2) ** 2
             + np.cos(lat) * np.cos(stn_lat) * np.sin((stn_lon - lon) / 2) ** 2)
        # Guard against rounding pushing `a` just above 1.
        a = np.minimum(a, 1.0)
        return np.round(earth_radius_km * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)), 3)

    dist_stn1 = _km_to(weather_station["station_1"])
    dist_stn2 = _km_to(weather_station["station_2"])

    # Assigning a whole column is positional, so it is correct for any index.
    # The previous iloc-read / loc-write loop appended phantom rows whenever
    # the index was not exactly 0..n-1.
    df["Station"] = np.where(dist_stn1 > dist_stn2, 2, 1)
            
    

In [295]:
nearest_station(train_df)

In [320]:
# Store the station label (1 or 2) as an integer column.
train_df["Station"] = train_df["Station"].astype(int)

In [321]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8610 entries, 0 to 8609
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    8610 non-null   datetime64[ns]
 1   Address                 8610 non-null   object        
 2   Species                 8610 non-null   object        
 3   Block                   8610 non-null   int64         
 4   Street                  8610 non-null   object        
 5   Trap                    8610 non-null   object        
 6   AddressNumberAndStreet  8610 non-null   object        
 7   Latitude                8610 non-null   float64       
 8   Longitude               8610 non-null   float64       
 9   AddressAccuracy         8610 non-null   int64         
 10  WnvPresent              8610 non-null   int64         
 11  NumMosquitos            8610 non-null   int64         
 12  Day                     8610 non-null   int64   

In [322]:
train_df["Station"].value_counts()

2    6090
1    2520
Name: Station, dtype: int64

In [226]:
# value_counts() already orders its result by frequency (descending), so
# pre-sorting the trap IDs did not change the counts and is dropped.
train_df["Trap"].value_counts()

T900     182
T115     158
T135     153
T002     150
T151     144
        ... 
T078       6
T076       5
T094B      5
T237       3
T040       2
Name: Trap, Length: 136, dtype: int64

In [326]:
train_df[train_df["Year"] == 2007]["Trap"].value_counts()

T138    57
T086    48
T115    47
T096    46
T002    45
        ..
T004    12
T072    11
T078     6
T076     5
T040     2
Name: Trap, Length: 116, dtype: int64

In [327]:
train_df[train_df["Year"] == 2009]["Trap"].value_counts()

T900    60
T048    46
T135    44
T031    44
T115    42
        ..
T224     8
T229     8
T081     8
T226     7
T045     1
Name: Trap, Length: 73, dtype: int64

In [328]:
train_df[train_df["Year"] == 2011]["Trap"].value_counts()

T900    65
T903    44
T027    43
T151    42
T031    42
        ..
T148     6
T236     6
T043     5
T077     5
T237     3
Name: Trap, Length: 80, dtype: int64

In [329]:
train_df[train_df["Year"] == 2013]["Trap"].value_counts()

T900     57
T054     38
T002     37
T209     37
T227     36
         ..
T039     18
T903     17
T077     15
T054C     9
T094B     5
Name: Trap, Length: 74, dtype: int64

# Spray dataset

In [142]:
spray_df = pd.read_csv("./datasets/spray.csv")

In [143]:
spray_df.head(20)

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163
2,2011-08-29,6:57:18 PM,42.391022,-88.089157
3,2011-08-29,6:57:28 PM,42.390637,-88.089158
4,2011-08-29,6:57:38 PM,42.39041,-88.088858
5,2011-08-29,6:57:48 PM,42.390395,-88.088315
6,2011-08-29,6:57:58 PM,42.390673,-88.088002
7,2011-08-29,6:58:08 PM,42.391027,-88.088002
8,2011-08-29,6:58:18 PM,42.391403,-88.088003
9,2011-08-29,6:58:28 PM,42.391718,-88.087995


In [144]:
spray_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14835 entries, 0 to 14834
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       14835 non-null  object 
 1   Time       14251 non-null  object 
 2   Latitude   14835 non-null  float64
 3   Longitude  14835 non-null  float64
dtypes: float64(2), object(2)
memory usage: 463.7+ KB


In [145]:
spray_df.isnull().sum()

Date           0
Time         584
Latitude       0
Longitude      0
dtype: int64

In [146]:
#TODO: decide whether to drop the 584 rows where Time is null
spray_df[spray_df["Time"].isnull()]

Unnamed: 0,Date,Time,Latitude,Longitude
1030,2011-09-07,,41.987092,-87.794286
1031,2011-09-07,,41.987620,-87.794382
1032,2011-09-07,,41.988004,-87.794574
1033,2011-09-07,,41.988292,-87.795486
1034,2011-09-07,,41.988100,-87.796014
...,...,...,...,...
1609,2011-09-07,,41.995876,-87.811615
1610,2011-09-07,,41.995972,-87.810271
1611,2011-09-07,,41.995684,-87.810319
1612,2011-09-07,,41.994724,-87.810415


In [147]:
# Parse the Date strings into datetime64 values so the .dt accessor can be used on the column.
spray_df["Date"] = pd.to_datetime(spray_df["Date"])

In [148]:
spray_df["Day"] = spray_df["Date"].dt.day

In [149]:
spray_df["Month"] = spray_df["Date"].dt.month

In [150]:
spray_df["Year"] = spray_df["Date"].dt.year

In [151]:
spray_df["Day"].value_counts(normalize=True)

15    0.179845
29    0.161577
17    0.148433
7     0.142501
25    0.108325
22    0.106977
8     0.080553
5     0.062285
16    0.009505
Name: Day, dtype: float64

In [152]:
spray_df["Month"].value_counts(normalize=True)

8    0.538456
7    0.256758
9    0.204786
Name: Month, dtype: float64

In [153]:
spray_df["Year"].value_counts(normalize=True)

2013    0.851095
2011    0.148905
Name: Year, dtype: float64

# Weather dataset

In [309]:
# Treat the literal string "M" as a missing value (NaN) while parsing the CSV.
weather_df = pd.read_csv("./datasets/weather.csv", na_values=["M"])

In [310]:
# Print each column name followed by the distinct values found in that column.
for column_name, column_values in weather_df.items():
    print(column_name)
    print(column_values.unique())

Station
[1 2]
Date
['2007-05-01' '2007-05-02' '2007-05-03' ... '2014-10-29' '2014-10-30'
 '2014-10-31']
Tmax
[ 83  84  59  60  66  67  78  68  82  80  77  76  70  73  64  65  69  90
  62  61  71  79  87  89  88  75  85  86  81  72  63  91  92  93  74  94
  54  53  56  57  58  55  50  95  52  47  45  51  48  44  49  46  96  99
 100 101  97  98 102 103 104  42  41]
Tmin
[50 52 42 43 46 48 49 51 53 54 47 60 61 63 56 59 44 57 45 55 66 65 70 68
 62 67 64 58 71 69 73 75 72 74 39 41 40 37 34 38 35 36 33 31 32 76 77 29
 78 79 80 81 82 83]
Tavg
[67. 68. 51. 52. 56. 58. nan 60. 59. 65. 70. 69. 71. 61. 55. 57. 73. 72.
 53. 62. 63. 74. 75. 78. 76. 77. 66. 80. 64. 81. 82. 79. 85. 84. 83. 50.
 49. 46. 48. 45. 54. 47. 44. 40. 41. 38. 39. 42. 37. 43. 86. 87. 89. 92.
 88. 91. 93. 94. 90. 36.]
Depart
[ 14.  nan  -3.   2.   4.   5.  10.  12.  13.  -2.  15.  11.  -4.  -6.
   8.  -5.   1.   9.   6.  -9.  -8.   3.   0.  -1.  -7.   7. -14.  18.
  16.  22.  21.  20. -10. -16.  17. -12. -11. -15. -17.  19. -13

In [311]:
weather_df.isnull().sum()

Station           0
Date              0
Tmax              0
Tmin              0
Tavg             11
Depart         1472
DewPoint          0
WetBulb           4
Heat             11
Cool             11
Sunrise           0
Sunset            0
CodeSum           0
Depth          1472
Water1         2944
SnowFall       1472
PrecipTotal       2
StnPressure       4
SeaLevel          9
ResultSpeed       0
ResultDir         0
AvgSpeed          3
dtype: int64

# EDA