In [1]:
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
dataset = read_csv('dados_sp.txt')

In [3]:
def normalize_dataset (dataset):
    """"
    Function responsible for normalizing the dataset
    
    It receives a pandas dataframe, and, for each row where the column 'Hora' is 1200,
    it takes the 'Precipitacao' and 'TempMinima' columns and puts its values on the
    row above, for it does not have such values (NaN value).
    """
    for index, row in dataset.iterrows():
        if row.loc['Hora'] == 1200:
            precipitacao = row.loc['Precipitacao']
            temp_minima = row.loc['TempMinima']

            dataset.at[index - 1, 'Precipitacao'] = precipitacao
            dataset.at[index - 1, 'TempMinima'] = temp_minima

In [4]:
# Removing unwanted columns
dataset.drop(['Estacao'], axis=1, inplace=True)
dataset.drop(['Data'], axis=1, inplace=True)

normalize_dataset(dataset)

# Removing row with Hora == 0000 and then removing the Hora column
dataset = dataset[dataset.Hora != 1200]
dataset.drop(['Hora'], axis=1, inplace=True)

# Removing row with any occurance of NaN
dataset.dropna(how='any', inplace=True)

In [6]:
# Copying the Precipitacao column and encoding it to 0 or 1, where 0 is = 0, and 1 is > 0
precipitation = dataset[['Precipitacao']].copy()
precipitation = (precipitation[['Precipitacao']] > 0)*1

# Removing the Precipitacao column
# dataset.drop(['Precipitacao'], axis=1, inplace=True)


precipitation.head()

Unnamed: 0,Precipitacao
0,0
2,1
4,1
6,0
8,1


In [15]:
dataset.head()

Unnamed: 0,Precipitacao,TempMaxima,TempMinima,Insolacao,Evaporacao_Piche,Temp_Comp_Media,Umidade_Relativa_Media,Velocidade_Vento_Media
0,0.0,24.9,18.5,2.1,1.2,21.22,89.0,5.0
2,5.0,22.7,19.7,0.1,2.4,20.84,96.5,3.366667
4,44.9,28.5,17.8,9.5,0.7,21.7,79.0,3.1
6,0.0,27.2,18.7,3.7,2.7,21.72,88.5,3.6
8,7.5,25.8,18.0,3.6,1.3,21.3,86.0,4.8


In [14]:
dataset.corr(['Estacao'], axis=1, inplace=True)

Unnamed: 0,Precipitacao,TempMaxima,TempMinima,Insolacao,Evaporacao_Piche,Temp_Comp_Media,Umidade_Relativa_Media,Velocidade_Vento_Media
Precipitacao,1.0,-0.05891,0.163763,-0.245826,-0.179585,0.058292,0.252975,-0.022574
TempMaxima,-0.05891,1.0,0.648886,0.532931,0.375821,0.902376,-0.51633,-0.186741
TempMinima,0.163763,0.648886,1.0,-0.048081,0.232893,0.877516,-0.016904,0.002533
Insolacao,-0.245826,0.532931,-0.048081,1.0,0.206977,0.294775,-0.65325,-0.101266
Evaporacao_Piche,-0.179585,0.375821,0.232893,0.206977,1.0,0.335118,-0.443516,0.003051
Temp_Comp_Media,0.058292,0.902376,0.877516,0.294775,0.335118,1.0,-0.370631,-0.117082
Umidade_Relativa_Media,0.252975,-0.51633,-0.016904,-0.65325,-0.443516,-0.370631,1.0,0.077043
Velocidade_Vento_Media,-0.022574,-0.186741,0.002533,-0.101266,0.003051,-0.117082,0.077043,1.0


In [7]:
dataset.drop(['Precipitacao'], axis=1, inplace=True)

In [8]:
dataset = pd.concat((dataset, precipitation), axis=1)

dataset.head()

Unnamed: 0,TempMaxima,TempMinima,Insolacao,Evaporacao_Piche,Temp_Comp_Media,Umidade_Relativa_Media,Velocidade_Vento_Media,Precipitacao
0,24.9,18.5,2.1,1.2,21.22,89.0,5.0,0
2,22.7,19.7,0.1,2.4,20.84,96.5,3.366667,1
4,28.5,17.8,9.5,0.7,21.7,79.0,3.1,1
6,27.2,18.7,3.7,2.7,21.72,88.5,3.6,0
8,25.8,18.0,3.6,1.3,21.3,86.0,4.8,1


In [12]:
dataset.to_csv(path_or_buf='~/Documents/Unesp/TCC/prototipos_notebook/dados.csv')

In [7]:
# Copying the values to a matrix
data = dataset.iloc[:, range(len(dataset.columns))].values

# Transforming the classes in an array
classes = precipitation[['Precipitacao']].values
classes = np.squeeze(np.asarray(classes))

In [8]:
# Dividing the dataset into train e test
x_train, x_test, y_train, y_test = train_test_split(data, classes, test_size=0.3, stratify = classes, random_state=0)

In [9]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)

In [10]:
model = svm.SVC(kernel='rbf', C=10, class_weight='balanced', gamma='auto')
model.fit(x_train, y_train)

SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [11]:
# Cross Validation with Train Dataset
res = cross_val_score(model, x_train, y_train, cv = 10, scoring = 'accuracy', n_jobs = -1)
print('Average Accuracy: \t {0:4f}'.format(np.mean(res)))
print('Accuracy SD: \t {0:4f}'.format(np.std(res)))
print('Median: \t {0:4f}'. format(np.median(res)))

Average Accuracy: 	 0.744305
Accuracy SD: 	 0.011279
Median: 	 0.745761


In [12]:
# Prediction for the train dataset
y_train_pred = cross_val_predict(model, x_train, y_train, cv = 3)

In [13]:
# Creating confusion matrix for the train dataset
confusion_matrix(y_train, y_train_pred)

array([[4810, 1908],
       [ 797, 3107]])

In [14]:
print('Precision Score: \t {0:4f}'.format(precision_score(y_train, y_train_pred, average = 'weighted')))
print('Recal Score: \t {0:4f}'.format(recall_score(y_train, y_train_pred, average = 'weighted')))
print('f1 Score: \t {0:4f}'.format(f1_score(y_train, y_train_pred, average = 'weighted')))

Precision Score: 	 0.770266
Recal Score: 	 0.745340
f1 Score: 	 0.749723


In [15]:
# Prediction for the test dataset
y_test_pred = cross_val_predict(model, sc.transform(x_test), y_test, cv=3)

In [16]:
# Creating confusion matrix for the test dataset
confusion_matrix(y_test, y_test_pred)

array([[2060,  820],
       [ 401, 1272]])

In [17]:
print("Precision Score: \t {0:4f}".format(precision_score(y_test, y_test_pred, average = 'weighted')))
print("Recall Score: \t {0:4f}".format(recall_score(y_test, y_test_pred, average = 'weighted')))
print("f1 Score: \t {0:4f}".format(f1_score(y_test, y_test_pred, average = 'weighted')))

Precision Score: 	 0.752902
Recall Score: 	 0.731825
f1 Score: 	 0.736228
