In [1]:
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

In [3]:
dataset = read_csv('dados_sp.txt')

In [4]:
def normalize_dataset (dataset):
    """"
    Function responsible for normalizing the dataset
    
    It receives a pandas dataframe, and, for each row where the column 'Hora' is 1200,
    it takes the 'Precipitacao' and 'TempMinima' columns and puts its values on the
    row above, for it does not have such values (NaN value).
    """
    for index, row in dataset.iterrows():
        if row.loc['Hora'] == 1200:
            precipitacao = row.loc['Precipitacao']
            temp_minima = row.loc['TempMinima']

            dataset.at[index - 1, 'Precipitacao'] = precipitacao
            dataset.at[index - 1, 'TempMinima'] = temp_minima

In [5]:
# Removing unwanted columns
dataset.drop(['Estacao'], axis=1, inplace=True)
dataset.drop(['Data'], axis=1, inplace=True)
dataset.drop(['Velocidade_Vento_Media'], axis=1, inplace=True)
dataset.drop(['Temp_Comp_Media'], axis=1, inplace=True)

normalize_dataset(dataset)

# Removing row with Hora == 0000 and then removing the Hora column
dataset = dataset[dataset.Hora != 1200]
dataset.drop(['Hora'], axis=1, inplace=True)

# Removing row with any occurance of NaN
dataset.dropna(how='any', inplace=True)

In [6]:
# Copying the Precipitacao column and encoding it to 0 or 1, where 0 is = 0, and 1 is > 0
precipitation = dataset[['Precipitacao']].copy()
precipitation = (precipitation[['Precipitacao']] > 0)*1

# Removing the Precipitacao column
dataset.drop(['Precipitacao'], axis=1, inplace=True)

In [7]:
dataset.head()

Unnamed: 0,TempMaxima,TempMinima,Insolacao,Evaporacao_Piche,Umidade_Relativa_Media
0,24.9,18.5,2.1,1.2,89.0
2,22.7,19.7,0.1,2.4,96.5
4,28.5,17.8,9.5,0.7,79.0
6,27.2,18.7,3.7,2.7,88.5
8,25.8,18.0,3.6,1.3,86.0


In [8]:
# Copying the values to a matrix
data = dataset.iloc[:, range(len(dataset.columns))].values

# Transforming the classes in an array
classes = precipitation[['Precipitacao']].values
classes = np.squeeze(np.asarray(classes))

In [9]:
# Dividing the dataset into train e test
x_train, x_test, y_train, y_test = train_test_split(data, classes, test_size=0.3, stratify = classes, random_state=0)

In [10]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)

In [2]:
model = svm.SVC(kernel='rbf', C=10, class_weight='balanced', gamma='auto')
model.fit(x_train, y_train)

In [12]:
# Cross Validation with Train Dataset
res = cross_val_score(model, x_train, y_train, cv = 10, scoring = 'accuracy', n_jobs = -1)
print('Average Accuracy: \t {0:4f}'.format(np.mean(res)))
print('Accuracy SD: \t {0:4f}'.format(np.std(res)))
print('Median: \t {0:4f}'. format(np.median(res)))

Average Accuracy: 	 0.731459
Accuracy SD: 	 0.015717
Median: 	 0.729415


In [13]:
# Prediction for the train dataset
y_train_pred = cross_val_predict(model, x_train, y_train, cv = 3)

In [14]:
# Creating confusion matrix for the train dataset
confusion_matrix(y_train, y_train_pred)

array([[4725, 1998],
       [ 880, 3025]])

In [15]:
print('Precision Score: \t {0:4f}'.format(precision_score(y_train, y_train_pred, average = 'weighted')))
print('Recall Score: \t {0:4f}'.format(recall_score(y_train, y_train_pred, average = 'weighted')))
print('f1 Score: \t {0:4f}'.format(f1_score(y_train, y_train_pred, average = 'weighted')))

Precision Score: 	 0.754533
Recal Score: 	 0.729206
f1 Score: 	 0.733882


In [16]:
# Prediction for the test dataset
y_test_pred = cross_val_predict(model, sc.transform(x_test), y_test, cv=3)

In [17]:
# Creating confusion matrix for the test dataset
confusion_matrix(y_test, y_test_pred)

array([[2056,  826],
       [ 373, 1300]])

In [18]:
print("Precision Score: \t {0:4f}".format(precision_score(y_test, y_test_pred, average = 'weighted')))
print("Recall Score: \t {0:4f}".format(recall_score(y_test, y_test_pred, average = 'weighted')))
print("f1 Score: \t {0:4f}".format(f1_score(y_test, y_test_pred, average = 'weighted')))

Precision Score: 	 0.760140
Recall Score: 	 0.736773
f1 Score: 	 0.741241
