In [1]:
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

In [3]:
# Loading the raw records from disk with pandas' default CSV settings
dataset = pd.read_csv('dados_sp.txt')

In [4]:
def normalize_dataset(dataset):
    """
    Copy the 12:00 readings onto the row directly above them, in place.

    For every row whose 'Hora' column equals 1200, the 'Precipitacao' and
    'TempMinima' values of that row are written into the row immediately
    above it, which does not carry those measurements (NaN). Every other
    cell of the frame is left untouched, and no row is added or removed.

    The "row above" is resolved by position, so the function works with any
    index, not only the default 0..n-1 one. A 1200 row sitting at the very
    top of the frame has no row above it and is skipped.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with at least the columns 'Hora', 'Precipitacao' and
        'TempMinima'. It is modified in place.

    Returns
    -------
    None
    """
    # Positions (not labels) of the rows that hold the values to move
    source_rows = np.flatnonzero((dataset['Hora'] == 1200).values)
    # Position 0 has nothing above it; writing there would target a missing row
    source_rows = source_rows[source_rows > 0]
    if source_rows.size == 0:
        return

    target_rows = source_rows - 1
    for column in ('Precipitacao', 'TempMinima'):
        # Work on a private copy so every value is read before any is written
        values = dataset[column].values.copy()
        values[target_rows] = values[source_rows]
        # Assigning to an existing column updates the caller's frame in place
        dataset[column] = values

In [5]:
# Removing unwanted columns (a new frame is built instead of mutating in place)
dataset = dataset.drop(['Estacao', 'Data'], axis=1)

# Moving the Precipitacao/TempMinima values of each Hora == 1200 row onto the
# row above it; normalize_dataset works in place and returns nothing
normalize_dataset(dataset)

# Removing the rows with Hora == 1200 (their values were just copied upwards),
# then the Hora column itself, then every row with any occurrence of NaN.
# The calls are chained without inplace=True because mutating a boolean-filtered
# slice in place is what makes pandas emit SettingWithCopyWarning.
dataset = (
    dataset[dataset['Hora'] != 1200]
    .drop(['Hora'], axis=1)
    .dropna(how='any')
)

In [6]:
# Taking the Precipitacao column out of the dataset (pop removes it in place)
# and encoding it as a one-column frame of 0/1, where 0 is = 0, and 1 is > 0
precipitation = ((dataset.pop('Precipitacao') > 0) * 1).to_frame()

In [7]:
# Previewing the first rows of the dataset (head() shows five by default)
dataset.head()

Unnamed: 0,TempMaxima,TempMinima,Insolacao,Evaporacao_Piche,Temp_Comp_Media,Umidade_Relativa_Media,Velocidade_Vento_Media
0,24.9,18.5,2.1,1.2,21.22,89.0,5.0
2,22.7,19.7,0.1,2.4,20.84,96.5,3.366667
4,28.5,17.8,9.5,0.7,21.7,79.0,3.1
6,27.2,18.7,3.7,2.7,21.72,88.5,3.6
8,25.8,18.0,3.6,1.3,21.3,86.0,4.8


In [8]:
# Feature matrix: the values of every column left in the dataset
data = dataset.values

# Class labels: the single-column frame flattened into a one-dimensional array
classes = precipitation[['Precipitacao']].values.squeeze()

In [9]:
# Dividing the dataset into train and test: 30% of the samples are held out,
# stratified on the class labels, with a fixed seed so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    data,
    classes,
    test_size=0.3,
    stratify=classes,
    random_state=0,
)

In [10]:
# Univariate selector scored with f_classif; k='all' keeps every feature, so it
# is used here only to obtain a score per variable. k is passed by keyword
# because current scikit-learn releases reject it as a positional argument.
features = SelectKBest(f_classif, k='all')

In [11]:
# Learning the scaling statistics from the training split only, then applying
# them to that same split
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)

In [12]:
# Fitting the selector on the training split to check which variables are
# important to our problem
features.fit(X=x_train, y=y_train)

SelectKBest(k='all', score_func=<function f_classif at 0x7f9fad153ae8>)

In [13]:
# Checking the results: one f_classif score per feature, in the column order of
# the matrix the selector was fitted on
features.scores_

array([ 372.63154036,  358.37627673, 1880.79987893,  870.13924458,
          2.47296491, 2001.22702892,   54.826031  ])