## 1. Introdução
<p>Este <i>dataset</i> contém dados meteorológicos do Aeroporto Internacional de Raleigh Durham retirados do serviço da Web do NOAA.</p>

## 2. Lendo os Dados

In [3]:
import pandas as pd

# As colunas estão separadas pelo símbolo ';' #. 
weather_history = pd.read_csv('rdu-weather-history.csv', sep=';')

## 3. Pré-processamento dos dados

In [23]:
# Substituindo as string por números binários -- No = 0 e Yes = 1

weather_history = weather_history.replace('No', 0)
weather_history = weather_history.replace('Yes', 1)
print(weather_history[''])

In [7]:
# Verificando a soma dos valores nulos em cada feature

weather_history.isnull().sum()

date                     0
temperaturemin           1
temperaturemax           1
precipitation            1
snowfall                 2
snowdepth                1
avgwindspeed             3
fastest2minwinddir       2
fastest2minwindspeed     2
fastest5secwinddir      19
fastest5secwindspeed    19
fog                      0
fogheavy                 0
mist                     0
rain                     0
fogground                0
ice                      0
glaze                    0
drizzle                  0
snow                     0
freezingrain             0
smokehaze                0
thunder                  0
highwind                 0
hail                     0
blowingsnow              0
dust                     0
freezingfog              0
dtype: int64

In [21]:
from sklearn.preprocessing import Imputer
import numpy as np

#Substituindo os valores nulos pela média dos valores referentes a feature
imputer = Imputer(missing_values=np.nan, strategy='median', axis=0)
weather_history[['temperaturemin']] = imputer.fit_transform(weather_history[['temperaturemin']])
weather_history[['temperaturemax']] = imputer.fit_transform(weather_history[['temperaturemax']])
weather_history[['precipitation']] = imputer.fit_transform(weather_history[['precipitation']])
weather_history[['snowfall']] = imputer.fit_transform(weather_history[['snowfall']])
weather_history[['snowdepth']] = imputer.fit_transform(weather_history[['snowdepth']])
weather_history[['avgwindspeed']] = imputer.fit_transform(weather_history[['avgwindspeed']])
weather_history[['fastest2minwinddir']] = imputer.fit_transform(weather_history[['fastest2minwinddir']])
weather_history[['fastest2minwindspeed']] = imputer.fit_transform(weather_history[['fastest2minwindspeed']])
weather_history[['fastest5secwinddir']] = imputer.fit_transform(weather_history[['fastest5secwinddir']])
weather_history[['fastest5secwindspeed']] = imputer.fit_transform(weather_history[['fastest5secwindspeed']])

weather_history.isnull().sum()

date                    0
temperaturemin          0
temperaturemax          0
precipitation           0
snowfall                0
snowdepth               0
avgwindspeed            0
fastest2minwinddir      0
fastest2minwindspeed    0
fastest5secwinddir      0
fastest5secwindspeed    0
fog                     0
fogheavy                0
mist                    0
rain                    0
fogground               0
ice                     0
glaze                   0
drizzle                 0
snow                    0
freezingrain            0
smokehaze               0
thunder                 0
highwind                0
hail                    0
blowingsnow             0
dust                    0
freezingfog             0
dtype: int64

In [24]:
weather_history.head()

Unnamed: 0,date,temperaturemin,temperaturemax,precipitation,snowfall,snowdepth,avgwindspeed,fastest2minwinddir,fastest2minwindspeed,fastest5secwinddir,...,drizzle,snow,freezingrain,smokehaze,thunder,highwind,hail,blowingsnow,dust,freezingfog
0,2009-10-03,55.0,82.0,0.0,0.0,0.0,2.91,240.0,16.11,230.0,...,0,0,0,0,0,0,0,0,0,0
1,2009-10-10,59.0,79.0,0.02,0.0,0.0,7.83,220.0,17.0,220.0,...,0,0,0,0,0,0,1,0,0,0
2,2009-10-14,46.9,61.0,0.14,0.0,0.0,8.72,40.0,14.99,50.0,...,1,0,0,0,0,0,1,0,0,0
3,2009-10-17,45.0,57.9,0.0,0.0,0.0,6.26,30.0,14.09,40.0,...,0,0,0,0,0,0,0,0,0,0
4,2009-10-29,48.0,68.0,0.0,0.0,0.0,5.82,80.0,14.99,70.0,...,0,0,0,0,0,0,0,0,0,0


## 3. Visão geral
<p>O <i>dataset</i> contém informação sobre dados meteorológicos do Aeroporto Internacional de Raleigh Durham, desde 2007. Possui dados como temperaturas mínimas e máximas, volume de chuva, velocidade do vento e etc.</p>

In [25]:
# Número de entradas
num_history = weather_history.shape[0]
print("Número de linhas do dataset:", num_history)

print("\n\nResumo estatístico do DataFrame:")
weather_history.describe()

Número de linhas do dataset: 4137


Resumo estatístico do DataFrame:


Unnamed: 0,temperaturemin,temperaturemax,precipitation,snowfall,snowdepth,avgwindspeed,fastest2minwinddir,fastest2minwindspeed,fastest5secwinddir,fastest5secwindspeed,...,drizzle,snow,freezingrain,smokehaze,thunder,highwind,hail,blowingsnow,dust,freezingfog
count,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,...,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0
mean,50.540416,72.017476,0.126599,0.012959,0.01738,5.860259,172.559826,15.956684,177.208122,21.798066,...,0.051487,0.011361,0.001209,0.090887,0.1124,0.000967,0.082185,0.013536,0.016437,0.000483
std,16.227581,16.528543,0.371278,0.195167,0.213927,2.957403,94.583981,5.269087,96.653949,7.079879,...,0.221015,0.105993,0.034748,0.287483,0.315896,0.031083,0.27468,0.11557,0.127164,0.021985
min,4.1,23.2,0.0,0.0,0.0,0.0,10.0,4.92,10.0,6.93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37.0,60.1,0.0,0.0,0.0,3.58,80.0,12.97,90.0,17.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,52.0,73.9,0.0,0.0,0.0,5.37,210.0,14.99,210.0,21.03,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,64.9,86.0,0.04,0.0,0.0,7.61,240.0,18.12,240.0,25.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,80.1,105.1,6.45,6.69,5.91,19.01,360.0,59.95,360.0,86.12,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 3.1 Utilizando a função Seaborn de mapas de calor
<p>Seaborn é uma biblioteca em python para criação de visualizações estatísticas.</p>
<p>O método ```.corr() ```, calcula o <b>coeficiente de correlação de Pearson</b> entre todos os pares de colunas numéricas do DataFrame.</p>
<p>Interpretando o coeficiente:
- 0.9 para mais ou para menos indica uma correlação muito forte.
- 0.7 a 0.9 positivo ou negativo indica uma correlação forte.
- 0.5 a 0.7 positivo ou negativo indica uma correlação moderada.
- 0.3 a 0.5 positivo ou negativo indica uma correlação fraca.
- 0 a 0.3 positivo ou negativo indica uma correlação desprezível.

In [26]:
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

plt.figure(figsize=(18,10))
weather_map = sns.heatmap(weather_history.corr(), annot=True, square=True, cmap="YlGnBu", linewidths=.3)


## 4. Logistic Regression
<p>'Logistic Regression' é um algoritmo de <b>classificação</b>. Ele é usado para prever um resultado binário (1/0, Sim / Não, Verdadeiro / Falso), dado um conjunto de variáveis independentes.</p>

In [27]:
import sklearn.linear_model

#clean_weather_history = weather_history.drop(weather_history.columns[9:], axis=1)
#clean_weather_history = clean_weather_history.drop(['date'], axis=1)
#clean_weather_history = weather_history.dropna()

clean_weather_history = weather_history.replace('No', 0.0)
clean_weather_history = clean_weather_history.replace('Yes', 1.0)

model=sklearn.linear_model.LogisticRegression()

predictors =['drizzle','thunder', 'snow', 'fog', 'mist', 'hail']

x_train = clean_weather_history[predictors].values
y_train = clean_weather_history['rain'].values

model = sklearn.linear_model.LogisticRegression()
model.fit(x_train, y_train)

x_test = clean_weather_history[predictors].values

predicted= model.predict(x_test)

clean_weather_history['rain']=predicted

print("Results Predicted:\n", clean_weather_history['rain'])
print("\nOriginal dataset, 'rain' column:\n", weather_history['rain'])

Results Predicted:
 0       0
1       1
2       1
3       0
4       0
5       0
6       1
7       0
8       0
9       1
10      1
11      1
12      0
13      0
14      1
15      0
16      0
17      0
18      0
19      0
20      0
21      1
22      0
23      0
24      0
25      0
26      0
27      1
28      1
29      0
       ..
4107    0
4108    0
4109    0
4110    0
4111    0
4112    0
4113    0
4114    0
4115    0
4116    0
4117    0
4118    0
4119    0
4120    0
4121    0
4122    0
4123    0
4124    0
4125    0
4126    0
4127    0
4128    0
4129    0
4130    0
4131    0
4132    0
4133    0
4134    0
4135    0
4136    0
Name: rain, Length: 4137, dtype: int64

Original dataset, 'rain' column:
 0       0
1       1
2       1
3       0
4       0
5       0
6       1
7       0
8       0
9       1
10      1
11      1
12      1
13      0
14      1
15      0
16      0
17      0
18      0
19      0
20      0
21      1
22      0
23      0
24      0
25      0
26      0
27      1
28      1
29    

## 5. Referências   

> [Coeficiente de correlação de Pearson](https://pt.wikipedia.org/wiki/Coeficiente_de_correla%C3%A7%C3%A3o_de_Pearson#Refer%C3%AAncias)

> [Why isn't Logistic Regression called Logistic Classification?](https://stats.stackexchange.com/questions/127042/why-isnt-logistic-regression-called-logistic-classification)



