train.csv, test.csv - the training and test set of the main dataset. The training set consists of data from 2007, 2009, 2011, and 2013, while in the test set you are requested to predict the test results for 2008, 2010, 2012, and 2014.
* Id: the id of the record
* Date: date that the WNV test is performed
* Address: approximate address of the location of trap. This is used to send to the GeoCoder. 
* Species: the species of mosquitos
* Block: block number of address
* Street: street name
* Trap: Id of the trap
* AddressNumberAndStreet: approximate address returned from GeoCoder
* Latitude, Longitude: Latitude and Longitude returned from GeoCoder
* AddressAccuracy: accuracy returned from GeoCoder
* NumMosquitos: number of mosquitoes caught in this trap
* WnvPresent: whether West Nile Virus was present in these mosquitos. 1 means WNV is present, and 0 means not present

In [223]:
import numpy as np
import pandas as pd
import plotly.express as px

In [224]:
train_df = pd.read_csv('./data/train.csv')

In [225]:
train_df.shape

(10506, 12)

In [226]:
train_df.dtypes

Date                       object
Address                    object
Species                    object
Block                       int64
Street                     object
Trap                       object
AddressNumberAndStreet     object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
dtype: object

In [227]:
train_df.isnull().sum() # no null values

Date                      0
Address                   0
Species                   0
Block                     0
Street                    0
Trap                      0
AddressNumberAndStreet    0
Latitude                  0
Longitude                 0
AddressAccuracy           0
NumMosquitos              0
WnvPresent                0
dtype: int64

In [228]:
train_df['Species'].unique() # no abnormalities observed in Species

array(['CULEX PIPIENS/RESTUANS', 'CULEX RESTUANS', 'CULEX PIPIENS',
       'CULEX SALINARIUS', 'CULEX TERRITANS', 'CULEX TARSALIS',
       'CULEX ERRATICUS'], dtype=object)

In [229]:
train_df['NumMosquitos'].unique() # no abnormalities observed in NumMosquitos

array([ 1,  4,  2,  3,  5,  9,  7, 10,  8,  6, 19, 20, 25, 16, 11, 12, 28,
       18, 50, 35, 14, 22, 21, 37, 27, 13, 39, 29, 15, 17, 34, 26, 32, 47,
       44, 23, 46, 48, 42, 33, 45, 24, 41, 38, 40, 36, 43, 49, 30, 31])

In [230]:
train_df['Address'].unique() # One of the addresses is just 'Chicago, IL, USA', which is very vague.

array(['4100 North Oak Park Avenue, Chicago, IL 60634, USA',
       '6200 North Mandell Avenue, Chicago, IL 60646, USA',
       '7900 West Foster Avenue, Chicago, IL 60656, USA',
       '1500 West Webster Avenue, Chicago, IL 60614, USA',
       '2500 West Grand Avenue, Chicago, IL 60654, USA',
       '1100 Roosevelt Road, Chicago, IL 60608, USA',
       '1100 West Chicago Avenue, Chicago, IL 60642, USA',
       '2100 North Stave Street, Chicago, IL 60647, USA',
       '2200 North Cannon Drive, Chicago, IL 60614, USA',
       '2200 West 113th Street, Chicago, IL 60643, USA',
       '1100 South Peoria Street, Chicago, IL 60608, USA',
       '1700 West 95th Street, Chicago, IL 60643, USA',
       '2200 West 89th Street, Chicago, IL 60643, USA',
       'North Streeter Drive, Chicago, IL 60611, USA',
       '6500 North Oak Park Avenue, Chicago, IL 60631, USA',
       '7500 North Oakley Avenue, Chicago, IL 60645, USA',
       '1500 North Long Avenue, Chicago, IL 60651, USA',
       '8900 Sou

In [231]:
train_df.loc[train_df['Address']=='Chicago, IL, USA',:].describe()
# we observe that for addresses that are 'Chicago, IL, USA', the AddressAccuracy is 3. we will remove these rows

Unnamed: 0,Block,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
count,91.0,91.0,91.0,91.0,91.0,91.0
mean,43.0,41.87811,-87.6298,3.0,4.417582,0.010989
std,0.0,7.144793e-14,4.286876e-14,0.0,6.319399,0.104828
min,43.0,41.87811,-87.6298,3.0,1.0,0.0
25%,43.0,41.87811,-87.6298,3.0,1.0,0.0
50%,43.0,41.87811,-87.6298,3.0,2.0,0.0
75%,43.0,41.87811,-87.6298,3.0,4.0,0.0
max,43.0,41.87811,-87.6298,3.0,30.0,1.0


In [232]:
train_df_1 = train_df.loc[train_df['AddressAccuracy']!=3,:]

In [233]:
train_df_1.shape

(10415, 12)

In [234]:
# drop address, block, street and AddressNumberandStreet as we will be using latitude and longitude
train_df_1.drop(columns=['Address','Block','Street','AddressNumberAndStreet'],inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [235]:
train_df_1.columns

Index(['Date', 'Species', 'Trap', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent'],
      dtype='object')

In [246]:
train_df_2 = train_df.groupby(['Date','Species','Latitude','Longitude','AddressAccuracy'],as_index=False)[['NumMosquitos','WnvPresent']].sum()

# we sum the NumMosquitos and WnvPrsent


In [248]:
train_df_2.reset_index()

Unnamed: 0,index,Date,Species,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,0,2007-05-29,CULEX PIPIENS,41.731922,-87.677512,8,1,0
1,1,2007-05-29,CULEX PIPIENS/RESTUANS,41.688324,-87.676709,8,1,0
2,2,2007-05-29,CULEX PIPIENS/RESTUANS,41.867108,-87.654224,8,1,0
3,3,2007-05-29,CULEX PIPIENS/RESTUANS,41.891126,-87.61156,5,1,0
4,4,2007-05-29,CULEX PIPIENS/RESTUANS,41.919343,-87.694259,8,1,0
5,5,2007-05-29,CULEX PIPIENS/RESTUANS,41.921965,-87.632085,8,2,0
6,6,2007-05-29,CULEX PIPIENS/RESTUANS,41.95469,-87.800991,9,1,0
7,7,2007-05-29,CULEX PIPIENS/RESTUANS,41.974089,-87.824812,8,1,0
8,8,2007-05-29,CULEX PIPIENS/RESTUANS,41.999129,-87.795585,8,1,0
9,9,2007-05-29,CULEX PIPIENS/RESTUANS,42.01743,-87.687769,8,1,0


In [249]:
train_df_2.shape

(8475, 7)

In [250]:
def split_date(df):
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df.Date.dt.year
    df['Month'] = df.Date.dt.month
    df['Day'] = df.Date.dt.day

In [251]:
split_date(train_df_2)

In [252]:
train_df_2['Year'].unique()

array([2007, 2009, 2011, 2013])

In [254]:
Year_virus = train_df_2[['Year', 'WnvPresent']].groupby('Year', as_index = False).sum()

In [255]:
fig = px.bar(Year_virus, x = 'Year', y = 'WnvPresent')
fig.update_layout(
    title="Virus vs Year",
    xaxis_title="Year",
    yaxis_title="Virus",
    width=500,
    height=300,)
fig.show()

In [256]:
train_df_2['Month'].unique()

array([ 5,  6,  7,  8,  9, 10])

In [258]:
month_virus = train_df_2[['Month', 'WnvPresent']].groupby('Month', as_index = False).sum()

In [259]:
fig = px.bar(month_virus, x = 'Month', y = 'WnvPresent')
fig.update_layout(
    title="Virus vs Month",
    xaxis_title="Month",
    yaxis_title="Virus",
    width=500,
    height=300,)
fig.show()

In [260]:
train_df_2['Day'].unique()

array([29,  5, 26,  2, 11, 18, 19, 25, 27,  1,  3,  7,  8,  9, 15, 16, 17,
       21, 22, 24, 28,  4,  6, 12, 10, 13, 31, 14, 30, 23])

In [261]:
day_virus = train_df_2[['Day', 'WnvPresent']].groupby('Day', as_index = False).sum()

In [262]:
fig = px.bar(day_virus, x = 'Day', y = 'WnvPresent')
fig.update_layout(
    title="Virus vs Day",
    xaxis_title="Day",
    yaxis_title="Virus",
    width=500,
    height=300,)
fig.show()

In [263]:
species_virus = train_df_2[['Species', 'WnvPresent']].groupby('Species', as_index = False).sum()

In [264]:
species_virus

Unnamed: 0,Species,WnvPresent
0,CULEX ERRATICUS,0
1,CULEX PIPIENS,240
2,CULEX PIPIENS/RESTUANS,262
3,CULEX RESTUANS,49
4,CULEX SALINARIUS,0
5,CULEX TARSALIS,0
6,CULEX TERRITANS,0


In [265]:
fig = px.bar(species_virus, x = 'Species', y = 'WnvPresent')
fig.update_layout(
    title="Virus vs Species",
    xaxis_title="Species",
    yaxis_title="Virus",
    width=500,
    height=300,)
fig.show()

In [266]:
train_df_2.shape

(8475, 10)

In [268]:
train_df_2.columns

Index(['Date', 'Species', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent', 'Year', 'Month', 'Day'],
      dtype='object')