In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# Load the three CSV files in one pass:
# training features, training labels, and test features (in that order).
train_data, train_labels, test_data = (
  pd.read_csv(csv_path)
  for csv_path in (
    "./data/dengue_features_train.csv",
    "./data/dengue_labels_train.csv",
    "./data/dengue_features_test.csv",
  )
)

# Print the (rows, columns) of the training features, then render its first rows
print(train_data.shape)
train_data.head()

(1456, 24)


Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,299.8,295.9,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,300.9,296.4,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,295.434286,300.5,297.3,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.31,301.4,297.0,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,295.821429,301.9,297.5,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8


## Handling missing values:

In [3]:
# Count fully duplicated rows in the training features:
# build the boolean "is a repeat of an earlier row" mask, then total it.
duplicate_row_mask = train_data.duplicated()
duplicate_row_mask.sum()

0

In [4]:
# Column-wise view: how many NaNs each feature contains
nan_count_by_feature = train_data.isna().sum()
print("Missing values for each feature:")
print("--------------------------------")
print(nan_count_by_feature)

# Row-wise view: NaNs per row, reported only for rows missing more than 5 columns
nan_count_by_row = train_data.isna().sum(axis=1)
print("\n\nRows with more than 5 missing columns:")
print("--------------------------------------")
print(nan_count_by_row[nan_count_by_row > 5])

Missing values for each feature:
--------------------------------
city                                       0
year                                       0
weekofyear                                 0
week_start_date                            0
ndvi_ne                                  194
ndvi_nw                                   52
ndvi_se                                   22
ndvi_sw                                   22
precipitation_amt_mm                      13
reanalysis_air_temp_k                     10
reanalysis_avg_temp_k                     10
reanalysis_dew_point_temp_k               10
reanalysis_max_air_temp_k                 10
reanalysis_min_air_temp_k                 10
reanalysis_precip_amt_kg_per_m2           10
reanalysis_relative_humidity_percent      10
reanalysis_sat_precip_amt_mm              13
reanalysis_specific_humidity_g_per_kg     10
reanalysis_tdtr_k                         10
station_avg_temp_c                        43
station_diur_temp_rng_c           

## Preprocessing data

In [0]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def preprocess(input_data, method="mean", by_city=True, train_data_path="./data/dengue_features_train.csv"):
  """Impute missing values in `input_data` and standardize its feature columns.

  Both the imputation statistics and the scaler are derived from the training
  features CSV (never from `input_data`), so train and test sets passed through
  this function are transformed consistently.

  Args:
    input_data: DataFrame laid out like the training features CSV. Columns from
      position 4 onward are treated as the features to scale; a `city` column
      is required when `by_city` is True. The frame is not modified.
    method: How NaNs are replaced: "mean" or "median" (statistic of the
      training data) or "zeros" (literal 0).
    by_city: If True, rows with city "sj" and "iq" are imputed separately, each
      with the statistic of the training rows of the same city. The result is
      the "sj" rows followed by the "iq" rows; rows of any other city are not
      part of the output. If False, all rows are imputed with the statistic of
      the whole training set.
    train_data_path: Location of the training features CSV. If it cannot be
      read, the copy hosted on GitHub is downloaded instead.

  Returns:
    A new DataFrame with NaNs filled and the feature columns standardized by a
    StandardScaler fitted on the (un-imputed) training feature columns.

  Raises:
    AssertionError: If `method` is not one of the supported values.
  """
  assert method in ["mean", "median", "zeros"], "Invalid method. Try 'mean', 'median' or 'zeros.'"

  # Position of the first feature column; the columns before it are left unscaled.
  feature_start = 4

  # Read training data, falling back to the hosted copy when the local file is
  # unavailable. `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
  try:
    train_data = pd.read_csv(train_data_path)
  except Exception:
    train_data = pd.read_csv("https://raw.githubusercontent.com/Jose-Paulo-FaD/DengAI/master/data/dengue_features_train.csv")

  def _fill_values(reference):
    """Return the replacement value(s) for NaNs, computed from `reference`."""
    # numeric_only=True: the frame also holds string columns (city, week_start_date),
    # which recent pandas versions refuse to aggregate implicitly.
    if method == "mean":
      return reference.mean(numeric_only=True)
    if method == "median":
      return reference.median(numeric_only=True)
    return 0

  if by_city:
    # Fill each city's rows with statistics from that same city's training rows.
    # fillna() without inplace returns a new frame, so the caller's data is untouched.
    filled_parts = [
      input_data[input_data.city == city].fillna(_fill_values(train_data[train_data.city == city]))
      for city in ("sj", "iq")
    ]
    # Concat data back together
    filled_data = pd.concat(filled_parts)
  else:
    # Fill NaN values as a whole (again on a new frame, not in place)
    filled_data = input_data.fillna(_fill_values(train_data))

  # Fit scaler with the training data
  scaler = StandardScaler()
  scaler.fit(train_data.iloc[:, feature_start:])

  # Scale input data
  output_data = filled_data.copy()
  output_data.iloc[:, feature_start:] = scaler.transform(filled_data.iloc[:, feature_start:])

  return output_data

In [6]:
# Run the training and test features through the same preprocessing pipeline
# (called with its default arguments).
train_data_preprocessed, test_data_preprocessed = map(preprocess, (train_data, test_data))

# Print the preprocessed training shape, then render its first rows
print(train_data_preprocessed.shape)
train_data_preprocessed.head()

(1456, 24)


Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,1990,18,1990-04-30,-0.140192,-0.223645,-0.071781,-0.294358,-0.762931,-0.828956,-1.175569,-1.854321,-1.121735,0.070519,-0.187746,-1.229999,-0.762931,-1.772788,-0.641761,-1.349129,-0.544845,-1.558451,-1.335953,-0.491715
1,sj,1990,19,1990-05-07,0.196522,0.096889,-0.561071,-0.55822,-0.524947,-0.36009,-0.620577,-0.847865,-0.781544,0.26549,-0.511565,-0.67027,-0.524947,-0.890794,-0.714293,-0.364967,-0.793255,-0.384164,0.062186,-0.647706
2,sj,1990,20,1990-05-14,-0.783365,0.35358,-0.630918,-0.37512,-0.256757,0.058428,-0.275123,0.123048,-0.90525,0.616439,-0.32363,-0.015256,-0.256757,0.066243,-0.734441,-0.364967,-0.739545,-0.128884,0.443496,0.043712
3,sj,1990,21,1990-05-21,-0.097243,0.954632,0.321991,0.400368,-0.695655,0.209472,0.002373,0.041671,-0.626912,0.499456,-0.604611,-0.255168,-0.695655,-0.047712,-0.698175,0.221106,-0.605269,0.432731,0.761255,-0.744673
4,sj,1990,22,1990-05-28,0.383743,1.097461,0.642209,0.536934,-0.875058,0.59967,0.347828,0.376533,-0.47228,0.694428,-0.643764,-0.237989,-0.875058,0.300639,-0.532962,1.36008,0.616642,1.300682,1.142566,-0.706729


In [7]:
# Column-wise view after preprocessing: how many NaNs each feature still contains
remaining_nan_by_feature = train_data_preprocessed.isna().sum()
print("Missing values for each feature:")
print("--------------------------------")
print(remaining_nan_by_feature)

# Row-wise view after preprocessing: rows still missing more than 5 columns
remaining_nan_by_row = train_data_preprocessed.isna().sum(axis=1)
print("\n\nRows with more than 5 missing columns:")
print("--------------------------------------")
print(remaining_nan_by_row[remaining_nan_by_row > 5])

Missing values for each feature:
--------------------------------
city                                     0
year                                     0
weekofyear                               0
week_start_date                          0
ndvi_ne                                  0
ndvi_nw                                  0
ndvi_se                                  0
ndvi_sw                                  0
precipitation_amt_mm                     0
reanalysis_air_temp_k                    0
reanalysis_avg_temp_k                    0
reanalysis_dew_point_temp_k              0
reanalysis_max_air_temp_k                0
reanalysis_min_air_temp_k                0
reanalysis_precip_amt_kg_per_m2          0
reanalysis_relative_humidity_percent     0
reanalysis_sat_precip_amt_mm             0
reanalysis_specific_humidity_g_per_kg    0
reanalysis_tdtr_k                        0
station_avg_temp_c                       0
station_diur_temp_rng_c                  0
station_max_temp_c             