In [1]:
import pandas as pd
import numpy as np

print('Done')


#useful functions
def display_info(df, col = True, row = True, nan = True, unique = True, dtypes = True):
    """Print a selectable summary of a DataFrame and return its first rows.

    Every enabled section is printed as a label followed by its value, then a
    separator line made of 100 '#' characters. Sections always appear in the
    same order: columns, row count, NaN count, unique counts, dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise.
    col : bool
        Print ``df.columns``.
    row : bool
        Print the number of rows (``len(df)``).
    nan : bool
        Print the total number of missing values over the whole frame.
    unique : bool
        Print the number of unique values per column (``df.nunique()``).
    dtypes : bool
        Print the dtype of each column.

    Returns
    -------
    pandas.DataFrame
        ``df.head()``, so a notebook cell ending in this call also renders
        the first rows.
    """
    separator = '#' * 100
    # Each entry: (section enabled?, label, value producer). The producers are
    # lambdas so a disabled section is never computed.
    sections = (
        (col, 'Columns:\n', lambda: df.columns),
        (row, 'Number of rows:\n', lambda: len(df)),
        (nan, 'Number of NaN:\n', lambda: df.isnull().sum().sum()),
        (unique, 'Number of unique values:\n', lambda: df.nunique()),
        (dtypes, 'Data types:\n', lambda: df.dtypes),
    )
    for enabled, label, produce in sections:
        if not enabled:
            continue
        print(label, produce())
        print(separator)
    return df.head()



# Load the malaria cases data and print a summary to get an idea of what it looks like.
cases_df = pd.read_csv('../../data/malaria/cases.csv')
display_info(cases_df)

Done
Columns:
 Index(['LATITUDE', 'LONGITUDE', 'VECTOR_SPECIES_COMPLEX', 'STAGE',
       'YEAR_START', 'SAMPLING_METHOD', 'INVASIVE_STATUS'],
      dtype='object')
####################################################################################################
Number of rows:
 106
####################################################################################################
Number of NaN:
 0
####################################################################################################
Number of unique values:
 LATITUDE                  4
LONGITUDE                 5
VECTOR_SPECIES_COMPLEX    1
STAGE                     4
YEAR_START                8
SAMPLING_METHOD           5
INVASIVE_STATUS           1
dtype: int64
####################################################################################################
Data types:
 LATITUDE                  float64
LONGITUDE                 float64
VECTOR_SPECIES_COMPLEX     object
STAGE                      object
YEAR_STAR

Unnamed: 0,LATITUDE,LONGITUDE,VECTOR_SPECIES_COMPLEX,STAGE,YEAR_START,SAMPLING_METHOD,INVASIVE_STATUS
0,11.2962,76.9414,An. stephensi,NR,1990,Not available,Native
1,11.2962,76.9414,An. stephensi,NR,1990,Not available,Native
2,11.2962,76.9414,An. stephensi,NR,1990,Not available,Native
3,11.2962,76.9414,An. stephensi,NR,1990,Not available,Native
4,11.2962,76.9414,An. stephensi,NR,1990,Not available,Native


In [2]:
# Turn the YEAR_START column into a datetime column named 'date'.
# With format='%Y' each year is parsed to a timestamp on 1 January of that year.
cases_df['date'] = pd.to_datetime(cases_df['YEAR_START'], format = '%Y')
display_info(cases_df, col = False, row = False, nan = False, unique = False, dtypes = True)

Data types:
 LATITUDE                         float64
LONGITUDE                        float64
VECTOR_SPECIES_COMPLEX            object
STAGE                             object
YEAR_START                         int64
SAMPLING_METHOD                   object
INVASIVE_STATUS                   object
date                      datetime64[ns]
dtype: object
####################################################################################################


Unnamed: 0,LATITUDE,LONGITUDE,VECTOR_SPECIES_COMPLEX,STAGE,YEAR_START,SAMPLING_METHOD,INVASIVE_STATUS,date
0,11.2962,76.9414,An. stephensi,NR,1990,Not available,Native,1990-01-01
1,11.2962,76.9414,An. stephensi,NR,1990,Not available,Native,1990-01-01
2,11.2962,76.9414,An. stephensi,NR,1990,Not available,Native,1990-01-01
3,11.2962,76.9414,An. stephensi,NR,1990,Not available,Native,1990-01-01
4,11.2962,76.9414,An. stephensi,NR,1990,Not available,Native,1990-01-01


In [3]:
# Encode STAGE as integer category codes in a new 'stage' column, then drop the
# original text columns. Only STAGE is encoded: INVASIVE_STATUS, SAMPLING_METHOD
# and VECTOR_SPECIES_COMPLEX are removed without being encoded.
# NOTE(review): the mapping from code to STAGE label is not kept anywhere.

cases_df['stage'] = cases_df['STAGE'].astype('category').cat.codes

cases_df.drop(['INVASIVE_STATUS', 'SAMPLING_METHOD', 'STAGE', 'VECTOR_SPECIES_COMPLEX'], axis = 1, inplace = True)

# Print only the dtypes section; the head() below is the cell's displayed output.
display_info(cases_df, col = False, row = False, nan = False, unique = False, dtypes = True)
cases_df.head()

Data types:
 LATITUDE             float64
LONGITUDE            float64
YEAR_START             int64
date          datetime64[ns]
stage                   int8
dtype: object
####################################################################################################


Unnamed: 0,LATITUDE,LONGITUDE,YEAR_START,date,stage
0,11.2962,76.9414,1990,1990-01-01,3
1,11.2962,76.9414,1990,1990-01-01,3
2,11.2962,76.9414,1990,1990-01-01,3
3,11.2962,76.9414,1990,1990-01-01,3
4,11.2962,76.9414,1990,1990-01-01,3


In [4]:
# Relabel the remaining columns so the naming is consistent (short, lower-case).
# rename() replaces the copy-then-drop pattern, and errors='ignore' on drop()
# means re-running the cell after YEAR_START is gone does not raise.
cases_df = (
    cases_df
    .rename(columns={'LATITUDE': 'lat', 'LONGITUDE': 'lon'})
    .drop(columns=['YEAR_START'], errors='ignore')
)

# Keep 'lat' and 'lon' as the last two columns, as they were when they were
# appended as new columns.
coordinate_columns = ['lat', 'lon']
cases_df = cases_df.reindex(
    columns=[name for name in cases_df.columns if name not in coordinate_columns] + coordinate_columns
)
cases_df.head()

Unnamed: 0,date,stage,lat,lon
0,1990-01-01,3,11.2962,76.9414
1,1990-01-01,3,11.2962,76.9414
2,1990-01-01,3,11.2962,76.9414
3,1990-01-01,3,11.2962,76.9414
4,1990-01-01,3,11.2962,76.9414


In [5]:
# Print the full summary of cases_df and show its first rows.
display_info(cases_df)

Columns:
 Index(['date', 'stage', 'lat', 'lon'], dtype='object')
####################################################################################################
Number of rows:
 106
####################################################################################################
Number of NaN:
 0
####################################################################################################
Number of unique values:
 date     8
stage    4
lat      4
lon      5
dtype: int64
####################################################################################################
Data types:
 date     datetime64[ns]
stage              int8
lat             float64
lon             float64
dtype: object
####################################################################################################


Unnamed: 0,date,stage,lat,lon
0,1990-01-01,3,11.2962,76.9414
1,1990-01-01,3,11.2962,76.9414
2,1990-01-01,3,11.2962,76.9414
3,1990-01-01,3,11.2962,76.9414
4,1990-01-01,3,11.2962,76.9414


In [6]:
# Report the earliest and latest dates present in the cases data.
case_dates = cases_df['date']
print('Earliest date:', case_dates.min())
print('Latest date:', case_dates.max())


Earliest date: 1984-01-01 00:00:00
Latest date: 1996-01-01 00:00:00


In [7]:
# Load the malaria weather data and print a summary to get an idea of what it looks like.
weather_df = pd.read_csv('../../data/malaria/weather.csv')
display_info(weather_df)

Columns:
 Index(['Unnamed: 0', 'dt', 'city_name', 'lat', 'lon', 'temp', 'dew_point',
       'feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity',
       'wind_speed', 'wind_deg', 'clouds_all', 'weather_id', 'weather_main',
       'weather_description'],
      dtype='object')
####################################################################################################
Number of rows:
 1585632
####################################################################################################
Number of NaN:
 0
####################################################################################################
Number of unique values:
 Unnamed: 0             1585632
dt                      396408
city_name                    4
lat                          4
lon                          4
temp                      4261
dew_point                 3821
feels_like                4925
temp_min                  4263
temp_max                  4236
pressure                    45
hum

Unnamed: 0.1,Unnamed: 0,dt,city_name,lat,lon,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_id,weather_main,weather_description
0,0,1979-01-01 00:00:00+00:00,Northern New Delhi,28.581,77.173,6.44,4.6,4.92,5.66,7.57,1018,88,2.13,312,0,800,Clear,sky is clear
1,1,1979-01-01 01:00:00+00:00,Northern New Delhi,28.581,77.173,6.14,4.14,4.77,5.77,6.61,1018,87,1.93,302,0,800,Clear,sky is clear
2,2,1979-01-01 02:00:00+00:00,Northern New Delhi,28.581,77.173,5.99,4.16,4.73,5.7,6.35,1019,88,1.8,298,0,800,Clear,sky is clear
3,3,1979-01-01 03:00:00+00:00,Northern New Delhi,28.581,77.173,10.37,4.52,9.21,10.17,10.8,1019,67,2.07,297,0,800,Clear,sky is clear
4,4,1979-01-01 04:00:00+00:00,Northern New Delhi,28.581,77.173,18.1,8.38,17.35,17.73,18.56,1020,53,1.38,292,0,800,Clear,sky is clear


In [8]:
# The 'Unnamed: 0' and 'weather_id' columns are not needed, so drop them.
columns_to_drop = ['Unnamed: 0', 'weather_id']
weather_df.drop(columns=columns_to_drop, inplace=True)
weather_df.head()

Unnamed: 0,dt,city_name,lat,lon,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_main,weather_description
0,1979-01-01 00:00:00+00:00,Northern New Delhi,28.581,77.173,6.44,4.6,4.92,5.66,7.57,1018,88,2.13,312,0,Clear,sky is clear
1,1979-01-01 01:00:00+00:00,Northern New Delhi,28.581,77.173,6.14,4.14,4.77,5.77,6.61,1018,87,1.93,302,0,Clear,sky is clear
2,1979-01-01 02:00:00+00:00,Northern New Delhi,28.581,77.173,5.99,4.16,4.73,5.7,6.35,1019,88,1.8,298,0,Clear,sky is clear
3,1979-01-01 03:00:00+00:00,Northern New Delhi,28.581,77.173,10.37,4.52,9.21,10.17,10.8,1019,67,2.07,297,0,Clear,sky is clear
4,1979-01-01 04:00:00+00:00,Northern New Delhi,28.581,77.173,18.1,8.38,17.35,17.73,18.56,1020,53,1.38,292,0,Clear,sky is clear


In [9]:
# Replace the text labels in weather_main and weather_description with
# integer category codes.
for label_column in ('weather_main', 'weather_description'):
    weather_df[label_column] = weather_df[label_column].astype('category').cat.codes

display_info(weather_df)

Columns:
 Index(['dt', 'city_name', 'lat', 'lon', 'temp', 'dew_point', 'feels_like',
       'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed',
       'wind_deg', 'clouds_all', 'weather_main', 'weather_description'],
      dtype='object')
####################################################################################################
Number of rows:
 1585632
####################################################################################################
Number of NaN:
 0
####################################################################################################
Number of unique values:
 dt                     396408
city_name                   4
lat                         4
lon                         4
temp                     4261
dew_point                3821
feels_like               4925
temp_min                 4263
temp_max                 4236
pressure                   45
humidity                   99
wind_speed                989
wind_deg           

Unnamed: 0,dt,city_name,lat,lon,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_main,weather_description
0,1979-01-01 00:00:00+00:00,Northern New Delhi,28.581,77.173,6.44,4.6,4.92,5.66,7.57,1018,88,2.13,312,0,0,24
1,1979-01-01 01:00:00+00:00,Northern New Delhi,28.581,77.173,6.14,4.14,4.77,5.77,6.61,1018,87,1.93,302,0,0,24
2,1979-01-01 02:00:00+00:00,Northern New Delhi,28.581,77.173,5.99,4.16,4.73,5.7,6.35,1019,88,1.8,298,0,0,24
3,1979-01-01 03:00:00+00:00,Northern New Delhi,28.581,77.173,10.37,4.52,9.21,10.17,10.8,1019,67,2.07,297,0,0,24
4,1979-01-01 04:00:00+00:00,Northern New Delhi,28.581,77.173,18.1,8.38,17.35,17.73,18.56,1020,53,1.38,292,0,0,24


In [10]:
# Parse the weather 'dt' values into a datetime column named 'date'.
# pop() removes 'dt' from the frame and hands it to to_datetime in one step,
# so the parsed values end up in a new last column and 'dt' is gone.
weather_df['date'] = pd.to_datetime(weather_df.pop('dt'))
display_info(weather_df)

Columns:
 Index(['city_name', 'lat', 'lon', 'temp', 'dew_point', 'feels_like',
       'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed',
       'wind_deg', 'clouds_all', 'weather_main', 'weather_description',
       'date'],
      dtype='object')
####################################################################################################
Number of rows:
 1585632
####################################################################################################
Number of NaN:
 0
####################################################################################################
Number of unique values:
 city_name                   4
lat                         4
lon                         4
temp                     4261
dew_point                3821
feels_like               4925
temp_min                 4263
temp_max                 4236
pressure                   45
humidity                   99
wind_speed                989
wind_deg                  361
clouds_all

Unnamed: 0,city_name,lat,lon,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_main,weather_description,date
0,Northern New Delhi,28.581,77.173,6.44,4.6,4.92,5.66,7.57,1018,88,2.13,312,0,0,24,1979-01-01 00:00:00+00:00
1,Northern New Delhi,28.581,77.173,6.14,4.14,4.77,5.77,6.61,1018,87,1.93,302,0,0,24,1979-01-01 01:00:00+00:00
2,Northern New Delhi,28.581,77.173,5.99,4.16,4.73,5.7,6.35,1019,88,1.8,298,0,0,24,1979-01-01 02:00:00+00:00
3,Northern New Delhi,28.581,77.173,10.37,4.52,9.21,10.17,10.8,1019,67,2.07,297,0,0,24,1979-01-01 03:00:00+00:00
4,Northern New Delhi,28.581,77.173,18.1,8.38,17.35,17.73,18.56,1020,53,1.38,292,0,0,24,1979-01-01 04:00:00+00:00


In [11]:
# Trim the weather data to the span of years covered by the malaria cases.
# The bounds are derived from cases_df (1984 and 1996 for the current cases
# file) instead of being hard-coded, so they follow the data if it changes.
first_case_year = cases_df['date'].min().year
last_case_year = cases_df['date'].max().year

# between() is inclusive on both ends, i.e. first <= year <= last.
# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
weather_df = weather_df[
    weather_df['date'].dt.year.between(first_case_year, last_case_year)
].copy()
display_info(weather_df)

Columns:
 Index(['city_name', 'lat', 'lon', 'temp', 'dew_point', 'feels_like',
       'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed',
       'wind_deg', 'clouds_all', 'weather_main', 'weather_description',
       'date'],
      dtype='object')
####################################################################################################
Number of rows:
 455904
####################################################################################################
Number of NaN:
 0
####################################################################################################
Number of unique values:
 city_name                   4
lat                         4
lon                         4
temp                     4128
dew_point                3449
feels_like               4670
temp_min                 4126
temp_max                 4115
pressure                   43
humidity                   97
wind_speed                875
wind_deg                  361
clouds_all 

Unnamed: 0,city_name,lat,lon,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_main,weather_description,date
43824,Northern New Delhi,28.581,77.173,7.05,4.7,5.64,6.51,7.72,1018,85,2.13,64,0,0,24,1984-01-01 00:00:00+00:00
43825,Northern New Delhi,28.581,77.173,6.48,4.64,5.09,5.93,7.4,1019,88,2.0,61,0,0,24,1984-01-01 01:00:00+00:00
43826,Northern New Delhi,28.581,77.173,6.09,4.42,4.88,5.89,6.6,1019,89,1.77,68,0,0,24,1984-01-01 02:00:00+00:00
43827,Northern New Delhi,28.581,77.173,7.1,3.35,5.44,6.8,7.45,1019,77,2.44,81,0,0,24,1984-01-01 03:00:00+00:00
43828,Northern New Delhi,28.581,77.173,10.38,4.74,9.25,9.95,10.82,1020,68,1.92,97,0,0,24,1984-01-01 04:00:00+00:00


In [12]:
# Inspect the last rows of weather_df.
weather_df.tail()

Unnamed: 0,city_name,lat,lon,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_main,weather_description,date
1347019,Tamil Nadu,11.2962,76.9414,19.27,15.14,19.26,18.81,19.57,1017,77,1.0,31,2,0,24,1996-12-31 19:00:00+00:00
1347020,Tamil Nadu,11.2962,76.9414,19.22,15.49,19.26,18.71,19.68,1016,79,0.98,38,1,0,24,1996-12-31 20:00:00+00:00
1347021,Tamil Nadu,11.2962,76.9414,18.29,15.91,18.42,17.66,19.29,1016,86,0.95,38,1,0,24,1996-12-31 21:00:00+00:00
1347022,Tamil Nadu,11.2962,76.9414,17.16,14.62,17.15,16.33,17.63,1016,85,0.75,4,12,1,4,1996-12-31 22:00:00+00:00
1347023,Tamil Nadu,11.2962,76.9414,16.9,14.72,16.92,15.83,17.54,1016,87,0.7,1,6,0,24,1996-12-31 23:00:00+00:00


In [13]:
# Compare the sizes of the two frames before they are combined on date, lat and lon.
for label, frame in (('Cases df shape:', cases_df), ('Weather df shape:', weather_df)):
    print(label, frame.shape)


Cases df shape: (106, 4)
Weather df shape: (455904, 16)


In [14]:
# Inspect the first 20 rows of cases_df.
cases_df.head(20)

Unnamed: 0,date,stage,lat,lon
0,1990-01-01,3,11.2962,76.9414
1,1990-01-01,3,11.2962,76.9414
2,1990-01-01,3,11.2962,76.9414
3,1990-01-01,3,11.2962,76.9414
4,1990-01-01,3,11.2962,76.9414
5,1990-01-01,3,11.2962,76.9414
6,1990-01-01,3,11.2962,76.9414
7,1990-01-01,3,11.2962,76.9414
8,1990-01-01,3,11.2962,76.9414
9,1990-01-01,3,11.2962,76.9414


In [15]:
# Inspect the first rows of weather_df.
weather_df.head()

Unnamed: 0,city_name,lat,lon,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_main,weather_description,date
43824,Northern New Delhi,28.581,77.173,7.05,4.7,5.64,6.51,7.72,1018,85,2.13,64,0,0,24,1984-01-01 00:00:00+00:00
43825,Northern New Delhi,28.581,77.173,6.48,4.64,5.09,5.93,7.4,1019,88,2.0,61,0,0,24,1984-01-01 01:00:00+00:00
43826,Northern New Delhi,28.581,77.173,6.09,4.42,4.88,5.89,6.6,1019,89,1.77,68,0,0,24,1984-01-01 02:00:00+00:00
43827,Northern New Delhi,28.581,77.173,7.1,3.35,5.44,6.8,7.45,1019,77,2.44,81,0,0,24,1984-01-01 03:00:00+00:00
43828,Northern New Delhi,28.581,77.173,10.38,4.74,9.25,9.95,10.82,1020,68,1.92,97,0,0,24,1984-01-01 04:00:00+00:00


In [16]:
# For each year and location, count the number of case records.
# Only the 'cases' counter is summed: the grouping keys are kept and the
# 'stage' category codes (whose sum has no meaning) are left out of the result.
# The guard keeps the cell re-runnable: on an already aggregated frame the
# existing counts are summed per group instead of being reset to 1.
if 'cases' not in cases_df.columns:
    cases_df = cases_df.assign(cases=1)
cases_df = cases_df.groupby(['date', 'lat', 'lon'], as_index=False)['cases'].sum()
cases_df.head(20)



Unnamed: 0,date,lat,lon,cases
0,1984-01-01,28.5615,77.1935,1
1,1985-01-01,22.69,72.856,24
2,1985-01-01,28.5615,77.1935,12
3,1986-01-01,28.5615,77.1935,5
4,1989-01-01,28.581,77.219,1
5,1990-01-01,11.2962,76.9414,19
6,1994-01-01,28.581,77.173,14
7,1995-01-01,28.581,77.173,24
8,1996-01-01,28.581,77.173,6


In [19]:
# Match the cases to the weather data based on lon, lat and year.
# Copy first: a plain assignment would only alias weather_df, so writing the
# 'cases' column below would silently add it to weather_df as well.
combined_df = weather_df.copy()

# Loop-invariant: the year of each weather row is the same for every case row.
weather_year = combined_df['date'].dt.year

for case in cases_df.itertuples(index=False):
    # Exact equality on the float coordinates: a case site only matches weather
    # rows whose lat/lon are identical to its own; any other site gets no match.
    mask = (
        (weather_year == case.date.year)
        & (combined_df['lat'] == case.lat)
        & (combined_df['lon'] == case.lon)
    )
    combined_df.loc[mask, 'cases'] = case.cases

In [25]:
# Maximum absolute difference, in degrees of latitude/longitude, for a case
# site to be matched to a weather location.
# NOTE(review): one degree of latitude is roughly 111 km, so this is a wide
# window - confirm it is intended.
tolerance = 1  # adjust the tolerance value as needed

# Work on an independent copy and start from an empty 'cases' column, so that
# values written to weather_df by an earlier matching attempt cannot leak in.
combined_df = weather_df.drop(columns=['cases'], errors='ignore').copy()
combined_df['cases'] = np.nan

# Loop-invariant: the year of each weather row is the same for every case row.
weather_year = combined_df['date'].dt.year

for case in cases_df.itertuples(index=False):
    # Use a tolerance check for latitude and longitude
    mask = (
        (weather_year == case.date.year)
        & ((combined_df['lat'] - case.lat).abs() <= tolerance)
        & ((combined_df['lon'] - case.lon).abs() <= tolerance)
    )

    # Accumulate instead of overwrite: when several case sites fall within the
    # tolerance of the same weather location in the same year, their counts
    # add up rather than the last one replacing the others.
    # Weather rows that match no case site keep NaN.
    combined_df.loc[mask, 'cases'] = combined_df.loc[mask, 'cases'].fillna(0) + case.cases

In [26]:
# Inspect the first 20 rows of the combined weather/cases frame.
combined_df.head(20)

Unnamed: 0,city_name,lat,lon,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_main,weather_description,date,cases
43824,Northern New Delhi,28.581,77.173,7.05,4.7,5.64,6.51,7.72,1018,85,2.13,64,0,0,24,1984-01-01 00:00:00+00:00,1.0
43825,Northern New Delhi,28.581,77.173,6.48,4.64,5.09,5.93,7.4,1019,88,2.0,61,0,0,24,1984-01-01 01:00:00+00:00,1.0
43826,Northern New Delhi,28.581,77.173,6.09,4.42,4.88,5.89,6.6,1019,89,1.77,68,0,0,24,1984-01-01 02:00:00+00:00,1.0
43827,Northern New Delhi,28.581,77.173,7.1,3.35,5.44,6.8,7.45,1019,77,2.44,81,0,0,24,1984-01-01 03:00:00+00:00,1.0
43828,Northern New Delhi,28.581,77.173,10.38,4.74,9.25,9.95,10.82,1020,68,1.92,97,0,0,24,1984-01-01 04:00:00+00:00,1.0
43829,Northern New Delhi,28.581,77.173,15.73,6.19,14.74,15.55,15.95,1020,53,2.07,109,0,0,24,1984-01-01 05:00:00+00:00,1.0
43830,Northern New Delhi,28.581,77.173,17.99,6.83,17.1,17.79,18.23,1020,48,2.17,121,0,0,24,1984-01-01 06:00:00+00:00,1.0
43831,Northern New Delhi,28.581,77.173,17.95,6.49,17.03,17.78,18.18,1019,47,1.64,123,0,0,24,1984-01-01 07:00:00+00:00,1.0
43832,Northern New Delhi,28.581,77.173,18.14,6.03,17.18,18.0,18.34,1018,45,1.26,109,0,0,24,1984-01-01 08:00:00+00:00,1.0
43833,Northern New Delhi,28.581,77.173,19.54,6.64,18.67,19.42,19.73,1017,43,1.12,91,0,0,24,1984-01-01 09:00:00+00:00,1.0


In [27]:
# Write the combined frame to 'test.csv' (relative path), without the index column.
combined_df.to_csv('test.csv', index=False)