In [1]:
import seaborn as sns

import pandas as pd

import numpy as np

import datetime

import miceforest as mf

from sklearn import preprocessing

In [2]:
# Read the raw Singapore weather records from a CSV file into a DataFrame.
# NOTE(review): this is an absolute path on one user's machine; the cell will
# not run elsewhere until the path is pointed at a local copy of the file.
weather_csv_path = "/Users/melissalian/Desktop/untitled folder/weatherdatasingapore.csv"
data = pd.read_csv(weather_csv_path)

In [3]:
# Count the missing values in each column of the raw data.
data.isna().sum()

Year                              0
Month                             0
Daily Rainfall Total (mm)         0
Highest 30 Min Rainfall (mm)      1
Highest 60 Min Rainfall (mm)      1
Highest 120 Min Rainfall (mm)     1
Mean Temperature (°C)            64
Maximum Temperature (°C)         63
Minimum Temperature (°C)         63
Mean Wind Speed (km/h)            3
Max Wind Speed (km/h)            18
Region                            0
dtype: int64

In [4]:
# Cast every object-dtype (string) column of `data` to pandas' categorical
# dtype; each column gets its own set of categories.
text_columns = data.select_dtypes(['object']).columns
data[text_columns] = data[text_columns].astype('category')

# Build the miceforest imputation kernel on the full frame.
# random_state is fixed so the run can be repeated with the same seed.
kernel = mf.ImputationKernel(
    data=data,
    save_all_iterations=True,
    random_state=1991,
)

# Run the MICE algorithm for a fixed number of iterations, logging progress.
n_mice_iterations = 3
kernel.mice(n_mice_iterations, verbose=True)

Initialized logger with name mice 1-3
Dataset 0
1  | Highest 30 Min Rainfall (mm) | Highest 60 Min Rainfall (mm) | Highest 120 Min Rainfall (mm) | Mean Wind Speed (km/h) | Max Wind Speed (km/h) | Maximum Temperature (°C) | Minimum Temperature (°C) | Mean Temperature (°C)
2  | Highest 30 Min Rainfall (mm) | Highest 60 Min Rainfall (mm) | Highest 120 Min Rainfall (mm) | Mean Wind Speed (km/h) | Max Wind Speed (km/h) | Maximum Temperature (°C) | Minimum Temperature (°C) | Mean Temperature (°C)
3  | Highest 30 Min Rainfall (mm) | Highest 60 Min Rainfall (mm) | Highest 120 Min Rainfall (mm) | Mean Wind Speed (km/h) | Max Wind Speed (km/h) | Maximum Temperature (°C) | Minimum Temperature (°C) | Mean Temperature (°C)


In [5]:
# Pull the completed dataset out of the imputation kernel; the frame printed
# by the following info() cell shows no remaining nulls in any column.
data_complete = kernel.complete_data()

In [6]:
# Check column dtypes and non-null counts after imputation.
data_complete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1053 entries, 0 to 1052
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   Year                           1053 non-null   int64   
 1   Month                          1053 non-null   category
 2   Daily Rainfall Total (mm)      1053 non-null   float64 
 3   Highest 30 Min Rainfall (mm)   1053 non-null   float64 
 4   Highest 60 Min Rainfall (mm)   1053 non-null   float64 
 5   Highest 120 Min Rainfall (mm)  1053 non-null   float64 
 6   Mean Temperature (°C)          1053 non-null   float64 
 7   Maximum Temperature (°C)       1053 non-null   float64 
 8   Minimum Temperature (°C)       1053 non-null   float64 
 9   Mean Wind Speed (km/h)         1053 non-null   float64 
 10  Max Wind Speed (km/h)          1053 non-null   float64 
 11  Region                         1053 non-null   category
dtypes: category(2), float64(9), int64(1)

In [5]:
# Preprocess the imputed data: convert month names to numbers, standardise the
# numeric measurements, and export both the unscaled and the scaled frames.

# Change month to numeric. The dtype guard keeps this cell re-runnable: once
# 'Month' holds integers, parsing it again with '%B' would raise ValueError.
# NOTE(review): '%B' matches the current locale's full month names; this
# presumably expects English names such as "January" - confirm against the CSV.
if not pd.api.types.is_numeric_dtype(data_complete['Month']):
    month_names = data_complete['Month'].astype(str)
    # Parse each distinct month name once instead of once per row.
    month_lookup = {
        name: datetime.datetime.strptime(name, '%B').month
        for name in month_names.unique()
    }
    data_complete['Month'] = month_names.map(month_lookup).astype('int64')

# Data before scaling and dummy encoding
data_complete.to_csv("databeforescale.csv", index=False)

# Separate the "categorical" identifier columns from the numeric measurements
id_columns = ["Region", "Year", "Month"]
numeric = data_complete.drop(id_columns, axis=1)
others = data_complete[id_columns]

# Standardization of numeric variables. The original index is passed through
# so the column-wise concat below lines rows up by label even when
# data_complete does not carry a default 0..n-1 index.
scaled_df = pd.DataFrame(
    preprocessing.scale(numeric),
    columns=numeric.columns,
    index=numeric.index,
)

# Concat all data: scaled measurements first, then Region / Year / Month
combined_df = pd.concat([scaled_df, others], axis=1)

# Write final processed data to csv
combined_df.to_csv("preprocessdata.csv", index=False)

In [6]:
# Summary statistics for the numeric columns of the processed frame; the
# scaled columns should show a mean near 0 and a standard deviation near 1.
combined_df.describe()

Unnamed: 0,Daily Rainfall Total (mm),Highest 30 Min Rainfall (mm),Highest 60 Min Rainfall (mm),Highest 120 Min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h),Year,Month
count,1053.0,1053.0,1053.0,1053.0,1053.0,1053.0,1053.0,1053.0,1053.0,1053.0,1053.0
mean,1.754427e-16,-1.079647e-16,2.159294e-16,1.079647e-16,-2.766596e-15,1.427158e-15,1.349559e-15,-2.361728e-16,4.386066e-16,2018.384615,6.384615
std,1.000475,1.000475,1.000475,1.000475,1.000475,1.000475,1.000475,1.000475,1.000475,2.817185,3.41814
min,-1.925105,-2.127636,-2.044237,-2.006745,-3.195655,-3.161934,-2.636773,-1.660039,-2.624854,2014.0,1.0
25%,-0.7258998,-0.7290614,-0.7233214,-0.7468744,-0.7056184,-0.6328212,-0.7330282,-0.7280766,-0.64102,2016.0,3.0
50%,-0.110767,-0.07875942,-0.08851397,-0.08980792,0.09117037,-0.007558976,-0.003373318,-0.1930738,-0.05844583,2018.0,6.0
75%,0.5703784,0.6783337,0.651256,0.6526922,0.7291767,0.7178911,0.7038156,0.5408522,0.5433521,2021.0,9.0
max,8.857844,4.803775,4.72313,4.332214,2.472342,3.20575,2.475628,4.540879,3.677993,2023.0,12.0


In [7]:
# Display the processed frame (the notebook renders a truncated preview).
combined_df

Unnamed: 0,Daily Rainfall Total (mm),Highest 30 Min Rainfall (mm),Highest 60 Min Rainfall (mm),Highest 120 Min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h),Region,Year,Month
0,-1.234920,-1.496323,-1.475633,-1.434619,-2.818187,-2.453041,-2.005773,-0.391966,0.387403,Central,2014,1
1,-1.069239,-1.159290,-1.207874,-1.200211,-1.915184,-1.501677,-1.089553,0.431404,0.124076,Central Business District and Surrounding,2014,1
2,-1.387421,-1.680647,-1.649292,-1.599752,-2.217576,-2.046116,-1.521941,1.342132,0.524621,East,2014,1
3,-1.087096,-1.177888,-1.166177,-1.182858,-2.098704,-1.926003,-1.222395,-0.226288,-0.315021,Islands,2014,1
4,-0.918836,-1.590758,-1.528513,-1.471668,-2.699455,-2.309916,-2.636773,0.870200,0.745476,North,2014,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1048,-0.462914,-0.828705,-0.838529,-0.787384,0.446667,1.013403,-0.129443,-0.488092,-0.129102,North,2023,9
1049,-0.110767,0.133875,0.083069,0.016716,0.653544,0.877687,0.314666,0.014096,0.546096,North-East,2023,9
1050,-0.425154,-0.115314,-0.180408,-0.282148,1.343136,-0.883151,1.877124,1.407566,0.612266,South-East,2023,9
1051,-0.375604,-0.250479,-0.247268,-0.284283,1.183668,-0.107129,1.739854,2.425432,2.526454,South-West,2023,9
