## Interpretando o dataset

In [258]:
import pandas as pd
dataframe = pd.read_csv("weatherAUS.csv")
dataframe.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


## Onde há valores nulos?

In [259]:
print(dataframe.isnull().sum())

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64


## Transformação e tratamento dos dados

In [260]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# https://www.kaggle.com/datasets/jsphyg/weather-dataset-rattle-package
# dataframe = pd.read_csv("weatherAUS.csv")

# Alterando as variáveis No e Yes para inteiro (0 e 1)
dataframe["RainToday"] = dataframe["RainToday"].map({"No": 0, "Yes": 1})

# Tratando as datas
dataframe["Date"] = pd.to_datetime(dataframe["Date"])
dataframe["Year"] = dataframe["Date"].dt.year
dataframe["Month"] = dataframe["Date"].dt.month
dataframe["Day"] = dataframe["Date"].dt.day
dataframe.drop(columns=["Date"], inplace=True)

########## Tratando valores ausentes
dataframe["Evaporation"] = dataframe["Evaporation"].replace(["NA"], np.nan)
dataframe["Sunshine"] = dataframe["Sunshine"].replace(["NA"], np.nan)

# Calculando a media por localização mês
"""
location_mean_evaporation = dataframe.groupby(["Location", "Month"])["Evaporation"].transform("mean")
location_mean_sunshine = dataframe.groupby(["Location", "Month"])["Sunshine"].transform("mean")
location_mean_min_temp = dataframe.groupby(["Location", "Month"])["MinTemp"].transform("mean")
location_mean_max_temp = dataframe.groupby(["Location", "Month"])["MaxTemp"].transform("mean")
location_mean_rainfall = dataframe.groupby(["Location", "Month"])["Rainfall"].transform("mean")
location_mean_windGustSpeed = dataframe.groupby(["Location", "Month"])["WindGustSpeed"].transform("mean")

dataframe['Evaporation'] = dataframe['Evaporation'].fillna(location_mean_evaporation)
dataframe['Sunshine'] = dataframe['Sunshine'].fillna(location_mean_sunshine)
dataframe['MinTemp'] = dataframe['MinTemp'].fillna(location_mean_min_temp)
dataframe['MaxTemp'] = dataframe['MaxTemp'].fillna(location_mean_max_temp)
dataframe['Rainfall'] = dataframe['Rainfall'].fillna(location_mean_rainfall)
dataframe['WindGustSpeed'] = dataframe['WindGustSpeed'].fillna(location_mean_windGustSpeed)
"""

#Estava fazendo tudo um a um até lembrar que existe o FOR :) #####################################
#Capturando somente as variaveis numericas, visto que, as categoricas não fazem sentido tirar a média
numeric_columns = dataframe.select_dtypes(include=[np.number]).columns.tolist()
columns_with_nulls_numeric = dataframe[numeric_columns].columns[dataframe[numeric_columns].isnull().any()].to_list()
for column in columns_with_nulls_numeric:
  location_month_mean_value = dataframe.groupby(["Location", "Month"])[column].transform("mean")
  dataframe[column] = dataframe[column].fillna(location_month_mean_value)


# Quando uma location está completamente sem valores para determinada região, vamos tentar tirar a media do mes na respectiva
numeric_columns = dataframe.select_dtypes(include=[np.number]).columns.tolist()
columns_with_nulls_numeric = dataframe[numeric_columns].columns[dataframe[numeric_columns].isnull().any()].to_list()
for column in columns_with_nulls_numeric:
  mean_value_month = dataframe.groupby(["Month"])[column].transform("mean")
  dataframe[column] = dataframe[column].fillna(mean_value_month)

# Criando uma label (ID) para os valores em string
label_encoder = LabelEncoder()
dataframe["Location"] = label_encoder.fit_transform(dataframe["Location"])
dataframe["WindGustDir"] = label_encoder.fit_transform(dataframe["WindGustDir"])
dataframe["WindDir9am"] = label_encoder.fit_transform(dataframe["WindDir9am"])
dataframe["WindDir3pm"] = label_encoder.fit_transform(dataframe["WindDir3pm"])

# Dataframe depois dos devidos tratamentos
dataframe.head()

# dataframe.to_csv("after_process.csv", index=False)

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
0,2,13.4,22.9,0.6,8.046549,8.970973,13,44.0,13,14,...,1007.1,8.0,4.901099,16.9,21.8,0.0,No,2008,12,1
1,2,7.4,25.1,0.0,8.046549,8.970973,14,44.0,6,15,...,1007.8,5.539683,4.901099,17.2,24.3,0.0,No,2008,12,2
2,2,12.9,25.7,0.0,8.046549,8.970973,15,46.0,13,15,...,1008.7,5.539683,2.0,21.0,23.2,0.0,No,2008,12,3
3,2,9.2,28.0,0.0,8.046549,8.970973,4,24.0,9,0,...,1012.8,5.539683,4.901099,18.1,26.5,0.0,No,2008,12,4
4,2,17.5,32.3,1.0,8.046549,8.970973,13,41.0,1,7,...,1006.0,7.0,8.0,17.8,29.7,0.0,No,2008,12,5


## Verificando se em alguma coluna ainda há valores nulos

In [261]:
print(dataframe.isnull().sum())

Location            0
MinTemp             0
MaxTemp             0
Rainfall            0
Evaporation         0
Sunshine            0
WindGustDir         0
WindGustSpeed       0
WindDir9am          0
WindDir3pm          0
WindSpeed9am        0
WindSpeed3pm        0
Humidity9am         0
Humidity3pm         0
Pressure9am         0
Pressure3pm         0
Cloud9am            0
Cloud3pm            0
Temp9am             0
Temp3pm             0
RainToday           0
RainTomorrow     3267
Year                0
Month               0
Day                 0
dtype: int64


#### A única coluna com valores nulos é a RainTomorrow, que é justamente a que iremos dropar (pois é ela que vamos responder). Não foi removida antes para não impactar nas médias globais.

# Divisão em conjuntos de treinamento e teste

In [262]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Cria um novo dataframe dropando a coluna alvo com valores  nulos
dataframe_cleaned = dataframe.dropna(subset=["RainTomorrow"]).copy()

# Coluna alvo, ou seja, o que quero identificar, convertida para 0 e 1
y = dataframe_cleaned["RainTomorrow"].map({"No": 0, "Yes": 1})

# Dropando a coluna alvo
dataframe_cleaned.drop(columns=["RainTomorrow"], inplace = True)

X = dataframe_cleaned.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)


## KNN

In [263]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
acc_knn = accuracy_score(y_test, y_pred_knn)
print("Acurácia do KNN %4f" %(acc_knn))

Acurácia do KNN 0.837576


## Árvore de decisão

In [264]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)
acc_tree = accuracy_score(y_test, y_pred_tree)
print("Acurácia da Árvore de Dec. %.4f" %(acc_tree))

Acurácia da Árvore de Dec. 0.7839
