In [4]:
import pandas as pd
import numpy as np
import os

from sklearn.utils import class_weight
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold

### Holdout estratificado

In [None]:
# É feito um holdout estratificado na proporção 1/3 para teste e 2/3 para treinamento

In [None]:
dataset = pd.read_csv('dados_originais')

In [None]:
# calcula proporcionalmente o quanto de cada objeto deve ser retirado (estratificado)
sampling = {weather: int(count*1/3) for weather, count in pd.value_counts(dataset["weather"]).items()}   # amostragem de 2/3 para treinamento e 1/3 para teste
sampling
    

In [None]:
#faz o sampling
testing_dataset = pd.DataFrame()

for weather in pd.unique(dataset["weather"]):
    sampled_dataset = dataset[dataset['weather'] == weather].sample(sampling.get(weather))  # vetorização booleana para filtrar por cada 'weather' e faz a amostragem estratificada (usando os valores calculados em sampling)
    testing_dataset = pd.concat([testing_dataset, sampled_dataset]) 
    
testing_dataset

In [None]:
#remove as amostras de teste das amostras de treinamento
dataset.drop(testing_dataset.index, axis = 0, inplace= True)
dataset.reset_index(inplace=True, drop=True)
dataset

In [None]:
testing_dataset.reset_index(inplace=True, drop=True)
testing_dataset

In [None]:
#salva os datasets 
dataset.to_csv('./training_dataset.csv')
testing_dataset.to_csv('./test_dataset.csv')

### K-fold cross-validaiton

In [19]:
training_dataset = pd.read_csv('dados_originais.csv').drop(['Unnamed: 0'], axis = 1).rename({'month':'season'}, axis = 1)
training_dataset


Unnamed: 0,season,humidity,pressure,temperature,wind,weather
0,spring,58,1012,15,7,few_clouds
1,spring,57,1012,15,7,few_clouds
2,spring,57,1012,15,7,few_clouds
3,spring,57,1012,15,7,few_clouds
4,spring,57,1012,15,6,few_clouds
...,...,...,...,...,...,...
43575,spring,36,1019,16,3,sky_is_clear
43576,spring,38,1019,16,1,sky_is_clear
43577,spring,54,1019,14,2,sky_is_clear
43578,spring,62,1020,12,3,sky_is_clear


In [20]:
x, y = training_dataset[['season','humidity', 'pressure', 'temperature', 'wind']], training_dataset['weather']

In [7]:
# mapeamento do atributo alvo (weather)
key_mapping = {value : i for i, value in enumerate(pd.unique(y))}
reverse_key_mapping = {value : key for key, value in key_mapping.items()}

key_mapping

{'few_clouds': 0,
 'scattered_clouds': 1,
 'broken_clouds': 2,
 'sky_is_clear': 3,
 'overcast_clouds': 4,
 'mist': 5,
 'drizzle': 6,
 'moderate_rain': 7,
 'light_intensity_drizzle': 8,
 'light_rain': 9,
 'fog': 10,
 'haze': 11,
 'heavy_snow': 12,
 'heavy_intensity_drizzle': 13,
 'heavy_intensity_rain': 14,
 'light_rain_and_snow': 15,
 'snow': 16,
 'light_snow': 17,
 'proximity_thunderstorm': 18,
 'thunderstorm': 19,
 'thunderstorm_with_rain': 20,
 'thunderstorm_with_heavy_rain': 21,
 'thunderstorm_with_light_rain': 22,
 'very_heavy_rain': 23,
 'dust': 24}

In [9]:
y = y.map(key_mapping)
y

0        0
1        0
2        0
3        0
4        0
        ..
43575    3
43576    3
43577    3
43578    3
43579    3
Name: weather, Length: 43580, dtype: int64

In [21]:
#Faz o mapeamento do atributo season em X
key_mapping_season = {value : round(i/3, 3) for i, value in enumerate(pd.unique(x['season']))}  
reverse_key_mapping_season = {value : key for key, value in key_mapping_season.items()}
key_mapping_season

{'spring': 0.0, 'summer': 0.333, 'autumn': 0.667, 'winter': 1.0}

In [12]:
x['season'] = x['season'].map(key_mapping_season)
x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['season'] = x['season'].map(key_mapping_season)


Unnamed: 0,season,humidity,pressure,temperature,wind
0,0.0,58,1012,15,7
1,0.0,57,1012,15,7
2,0.0,57,1012,15,7
3,0.0,57,1012,15,7
4,0.0,57,1012,15,6
...,...,...,...,...,...
43575,0.0,36,1019,16,3
43576,0.0,38,1019,16,1
43577,0.0,54,1019,14,2
43578,0.0,62,1020,12,3


In [13]:
k_folds = StratifiedKFold(n_splits=5)

In [14]:
#cria um dicionario com i sendo a indexação da fold e os valores de i é outro dicionario contendo as separações (em formato dataframe)
folds = {i: {"x_train": x.iloc[train_index],
             "y_train" : y.iloc[train_index],
             "x_val" :  x.iloc[val_index],
             "y_val" :y.iloc[val_index]  }
          for i, (train_index, val_index)  in enumerate(k_folds.split(x, y))}

In [15]:
folds

{0: {'x_train':        season  humidity  pressure  temperature  wind
  780       0.0        69      1015            4     3
  809       0.0        51      1022            0     5
  812       0.0        51      1022           -1     4
  813       0.0        55      1022           -1     4
  814       0.0        51      1022           -1     5
  ...       ...       ...       ...          ...   ...
  43575     0.0        36      1019           16     3
  43576     0.0        38      1019           16     1
  43577     0.0        54      1019           14     2
  43578     0.0        62      1020           12     3
  43579     0.0        58      1020           11     2
  
  [34864 rows x 5 columns],
  'y_train': 780      12
  809      12
  812      12
  813      12
  814      12
           ..
  43575     3
  43576     3
  43577     3
  43578     3
  43579     3
  Name: weather, Length: 34864, dtype: int64,
  'x_val':        season  humidity  pressure  temperature  wind
  0       0.000     

In [16]:
#salva os k folds dentro de subpastas e no formato de datasets 
os.mkdir("K Folds Cross Validation")
for folds_index in folds.keys():
  os.mkdir(f"K Folds Cross Validation/fold_{folds_index}")
  for keys in folds.get(folds_index):
    folds.get(folds_index).get(keys).to_csv(f"./K Folds Cross Validation/fold_{folds_index}/{keys}_fold_{folds_index}")


In [17]:
import csv

with open('weather_key_mapping.csv', 'w', newline='') as csv_file:

    csv_writer = csv.writer(csv_file)


    for key, value in key_mapping.items():
        csv_writer.writerow([key, value])


with open('weather_reverse_key_mapping.csv', 'w', newline='') as csv_file:

    csv_writer = csv.writer(csv_file)


    for key, value in reverse_key_mapping.items():
        csv_writer.writerow([key, value])

with open('season_key_mapping.csv', 'w', newline='') as csv_file:

    csv_writer = csv.writer(csv_file)


    for key, value in key_mapping_season.items():
        csv_writer.writerow([key, value])

with open('season_reverse_key_mapping.csv', 'w', newline='') as csv_file:

    csv_writer = csv.writer(csv_file)


    for key, value in reverse_key_mapping_season.items():
        csv_writer.writerow([key, value])