In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

## 准备工作

In [2]:
# 读取数据
weather = pd.read_csv('../Database/weatherstats_edmonton_hourly.csv')
weather.head()

Unnamed: 0,date_time_local,unixtime,pressure_station,pressure_sea,wind_dir,wind_dir_10s,wind_speed,wind_gust,relative_humidity,dew_point,temperature,windchill,humidex,visibility,health_index,cloud_cover_4,cloud_cover_8,cloud_cover_10,solar_radiation
0,2018-12-18 16:00:00 MST,1545174000,91.73,99.7,SW,23.0,5.0,,64.0,-7.3,-1.3,-3.0,,,,,,,
1,2018-12-18 15:00:00 MST,1545170400,91.67,99.7,S,18.0,5.0,,65.0,-6.3,-0.5,-2.0,,,4.0,,,,
2,2018-12-18 14:00:00 MST,1545166800,91.67,99.6,S,17.0,5.0,,51.0,-6.5,2.7,,,,4.0,,,,
3,2018-12-18 13:00:00 MST,1545163200,91.66,99.6,SE,13.0,2.0,,50.0,-6.4,2.9,,,,3.9,,,,
4,2018-12-18 12:00:00 MST,1545159600,91.75,99.7,,,0.0,,59.0,-6.6,0.5,,,,4.1,,,,


In [3]:
# 读取数据
mosquito = pd.read_csv('../Database/Mosquito_Trap_Data.csv')
mosquito.head()

Unnamed: 0,Trap Date,Genus,Specific Epithet,Gender,IDd,Count,Trap Region,Include,Comparison Group,Latitude,Longitude,Location
0,09/15/2015 12:00:00 AM,Aedes,vexans,Female,,3,Rural-West,,Outer,53.562973,-113.787353,POINT (-113.787353 53.5629733)
1,05/12/1998 12:00:00 AM,Male,UnID,Male,UnID,1,Rural-North West,,Peripheral,53.617221,-113.716182,POINT (-113.716182 53.617221)
2,08/28/2001 12:00:00 AM,Culex,tarsalis,Female,,1,Rural-North West,,Peripheral,53.617221,-113.716182,POINT (-113.716182 53.617221)
3,08/17/2004 12:00:00 AM,Culex,territans,Female,,1,Rural-North West,,Peripheral,53.617221,-113.716182,POINT (-113.716182 53.617221)
4,05/15/2018 12:00:00 AM,Culiseta,impatiens,Female,,2,Rural-West,,Outer,53.55902,-114.00119,POINT (-114.00119 53.559020000000004)


In [4]:
# Dataframe中的每一列都是Series类型
type(weather['date_time_local'])

pandas.core.series.Series

In [5]:
# 将日期中的 MST 删掉的方法
def format_date(series, offsets):
    date_formatted = []

    for date in series:
        date = date[0:-offsets]
        date_formatted.append(date)

    date_formatted = pd.Series(date_formatted)
    return date_formatted

### 处理weather数据集

In [6]:
date_formatted_0=format_date(weather['date_time_local'],4)

In [7]:
# 判断格式化日期后的类型是否正确
type(date_formatted_0)

pandas.core.series.Series

In [8]:
# 改正格式后的日期重新赋值 -> weather dataset
weather['date_time_local'] = date_formatted_0

In [9]:
weather.head()

Unnamed: 0,date_time_local,unixtime,pressure_station,pressure_sea,wind_dir,wind_dir_10s,wind_speed,wind_gust,relative_humidity,dew_point,temperature,windchill,humidex,visibility,health_index,cloud_cover_4,cloud_cover_8,cloud_cover_10,solar_radiation
0,2018-12-18 16:00:00,1545174000,91.73,99.7,SW,23.0,5.0,,64.0,-7.3,-1.3,-3.0,,,,,,,
1,2018-12-18 15:00:00,1545170400,91.67,99.7,S,18.0,5.0,,65.0,-6.3,-0.5,-2.0,,,4.0,,,,
2,2018-12-18 14:00:00,1545166800,91.67,99.6,S,17.0,5.0,,51.0,-6.5,2.7,,,,4.0,,,,
3,2018-12-18 13:00:00,1545163200,91.66,99.6,SE,13.0,2.0,,50.0,-6.4,2.9,,,,3.9,,,,
4,2018-12-18 12:00:00,1545159600,91.75,99.7,,,0.0,,59.0,-6.6,0.5,,,,4.1,,,,


In [10]:
# 获取我们想要的列
weather_with_temp = pd.DataFrame(weather,
                                 columns=['date_time_local', 'temperature'])

In [11]:
weather_with_temp.head()

Unnamed: 0,date_time_local,temperature
0,2018-12-18 16:00:00,-1.3
1,2018-12-18 15:00:00,-0.5
2,2018-12-18 14:00:00,2.7
3,2018-12-18 13:00:00,2.9
4,2018-12-18 12:00:00,0.5


In [12]:
# 日期转DatetimeIndex格式
weather_with_temp['date_time_local']=pd.DatetimeIndex(weather_with_temp['date_time_local'])

In [27]:
# 按时间间隔对数据分组 -> 对这一天不同时段的气温取平均值
weather_with_temp = weather_with_temp.resample('D',on='date_time_local').mean()
weather_with_temp.

Unnamed: 0_level_0,temperature
date_time_local,Unnamed: 1_level_1
2016-12-18,2.271429
2016-12-19,0.891667
2016-12-20,-3.945833
2016-12-21,-0.808333
2016-12-22,-0.941667
...,...
2018-12-14,1.412500
2018-12-15,-0.383333
2018-12-16,-7.712500
2018-12-17,-6.079167


### 处理mosquito数据集

In [15]:
date_formatted_1 = format_date(mosquito['Trap Date'], 2)

In [16]:
type(date_formatted_1)

pandas.core.series.Series

In [17]:
mosquito['Trap Date'] = date_formatted_1

In [18]:
mosquito.head()

Unnamed: 0,Trap Date,Genus,Specific Epithet,Gender,IDd,Count,Trap Region,Include,Comparison Group,Latitude,Longitude,Location
0,09/15/2015 12:00:00,Aedes,vexans,Female,,3,Rural-West,,Outer,53.562973,-113.787353,POINT (-113.787353 53.5629733)
1,05/12/1998 12:00:00,Male,UnID,Male,UnID,1,Rural-North West,,Peripheral,53.617221,-113.716182,POINT (-113.716182 53.617221)
2,08/28/2001 12:00:00,Culex,tarsalis,Female,,1,Rural-North West,,Peripheral,53.617221,-113.716182,POINT (-113.716182 53.617221)
3,08/17/2004 12:00:00,Culex,territans,Female,,1,Rural-North West,,Peripheral,53.617221,-113.716182,POINT (-113.716182 53.617221)
4,05/15/2018 12:00:00,Culiseta,impatiens,Female,,2,Rural-West,,Outer,53.55902,-114.00119,POINT (-114.00119 53.559020000000004)


In [19]:
mosquito['Trap Date'] = pd.DatetimeIndex(mosquito['Trap Date'])

In [20]:
mosquito_with_count = pd.DataFrame(mosquito, columns=['Trap Date', 'Count'])

In [21]:
mosquito_with_count.head()

Unnamed: 0,Trap Date,Count
0,2015-09-15 12:00:00,3
1,1998-05-12 12:00:00,1
2,2001-08-28 12:00:00,1
3,2004-08-17 12:00:00,1
4,2018-05-15 12:00:00,2


In [22]:
mosquito_with_count.resample('D',on='Trap Date').sum()

Unnamed: 0_level_0,Count
Trap Date,Unnamed: 1_level_1
1990-05-22,34
1990-05-23,0
1990-05-24,0
1990-05-25,0
1990-05-26,0
...,...
2020-09-18,0
2020-09-19,0
2020-09-20,0
2020-09-21,0


In [23]:
# 删除包含0的行
mosquito_with_count=mosquito_with_count[~mosquito_with_count['Count'].isin([1])]

In [24]:
mosquito_with_count.sort_values(by='Trap Date')

Unnamed: 0,Trap Date,Count
25423,1990-05-22 12:00:00,3
10962,1990-05-22 12:00:00,3
3074,1990-05-22 12:00:00,2
15710,1990-05-22 12:00:00,15
8573,1990-05-22 12:00:00,7
...,...,...
11489,2020-09-22 12:00:00,31
24633,2020-09-22 12:00:00,26
15505,2020-09-22 12:00:00,10
14280,2020-09-22 12:00:00,16


In [30]:
str(weather_with_temp.index.values[[1]])

"['2016-12-19T00:00:00.000000000']"