In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
##from pandas.plotting import parallel_coordinates
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
from sklearn import tree
# The classic 'seaborn-dark' style name was renamed to 'seaborn-v0_8-dark'
# in recent Matplotlib releases, so the new name is used here.
plt.style.use('seaborn-v0_8-dark')

In [2]:
# Load the raw CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('weatherAUS.csv')

In [3]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


<h3> Cantidad de lugares distintos de la base de datos </h3>

In [4]:
# Number of distinct entries in the Location column.
df['Location'].unique().size

49

<h4> Hacemos una copia de la base de datos para cambiar los valores de 'Yes' y 'No' por valores binarios (1/0) en la copia del DF </h4>

In [5]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched.
data = df.copy(deep=True)

In [6]:
# Encode the rain flags as integers: 1 where the value is 'Yes', 0 otherwise.
# NOTE: every value other than 'Yes' -- including missing values -- becomes 0.
for flag_col in ('RainToday', 'RainTomorrow'):
    data[flag_col] = data[flag_col].eq('Yes').mul(1)

<h4>Cambiamos las abreviaciones de las direcciones del viento en las respectivas columnas </h4>

In [7]:
# List the column labels of the working copy.
data.columns

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

In [8]:
# Show the distinct values found in each wind-direction column.
for wind_col in ('WindGustDir', 'WindDir9am', 'WindDir3pm'):
    print(data[wind_col].unique())

['W' 'WNW' 'WSW' 'NE' 'NNW' 'N' 'NNE' 'SW' nan 'ENE' 'SSE' 'S' 'NW' 'SE'
 'ESE' 'E' 'SSW']
['W' 'NNW' 'SE' 'ENE' 'SW' 'SSE' 'S' 'NE' nan 'SSW' 'N' 'WSW' 'ESE' 'E'
 'NW' 'WNW' 'NNE']
['WNW' 'WSW' 'E' 'NW' 'W' 'SSE' 'ESE' 'ENE' 'NNW' 'SSW' 'SW' 'SE' 'N' 'S'
 'NNE' nan 'NE']


In [9]:
# 16-point compass abbreviations, clockwise starting at north.
Ab_WD = 'N NNE NE ENE E ESE SE SSE S SSW SW WSW W WNW NW NNW'.split()

# Bearing in degrees for each abbreviation above (same order, 22.5 degree steps).
WD = [
    0, 22.5, 45, 67.5,
    90, 112.5, 135, 157.5,
    180, 202.5, 225, 247.5,
    270, 292.5, 315, 337.5,
]

# Columns of the dataset that store a wind direction.
Col_WindDir = [
    'WindGustDir',
    'WindDir9am',
    'WindDir3pm',
]

In [10]:
# Lookup table pairing each compass abbreviation with its bearing in degrees
Wind_Dir = pd.DataFrame({'Ab_WD': Ab_WD, 'WD': WD})

# Flatten the table into an {abbreviation: degrees} dictionary
wind_dir_map = {
    abbr: degrees
    for abbr, degrees in zip(Wind_Dir['Ab_WD'], Wind_Dir['WD'])
}
wind_dir_map

{'N': 0.0,
 'NNE': 22.5,
 'NE': 45.0,
 'ENE': 67.5,
 'E': 90.0,
 'ESE': 112.5,
 'SE': 135.0,
 'SSE': 157.5,
 'S': 180.0,
 'SSW': 202.5,
 'SW': 225.0,
 'WSW': 247.5,
 'W': 270.0,
 'WNW': 292.5,
 'NW': 315.0,
 'NNW': 337.5}

In [11]:
# Replace the compass abbreviations with their bearings in every wind column.
# Values without an entry in wind_dir_map (e.g. missing ones) come out as NaN,
# so this cell must not be run twice on the same frame.
data[Col_WindDir] = data[Col_WindDir].apply(lambda col: col.map(wind_dir_map))

<h4> Ahora veamos la cantidad de valores nulos que hay en la base de datos</h4>

<h4>Cantidad de valores nulos </h4>

In [12]:
# Summary of dtypes and non-null counts per column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  float64
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  float64
 10  WindDir3pm     141232 non-null  float64
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [13]:
# Count of missing values in each column.
data.isna().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday            0
RainTomorrow         0
dtype: int64

<h4>Calculemos los coeficientes de correlación de la base de datos para darnos una idea de qué columnas pueden ser de utilidad y cuáles no. </h4>

In [14]:
# Pairwise correlation matrix computed over the numeric columns only.
numeric_cols = data.select_dtypes(include=[float, int])
corr_data = numeric_cols.corr()
corr_data

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
MinTemp,1.0,0.736555,0.103938,0.466993,0.072586,-0.176323,0.177415,-0.101278,-0.180212,0.175064,...,-0.232899,0.006089,-0.45097,-0.461292,0.078754,0.021605,0.901821,0.708906,0.054702,0.082455
MaxTemp,0.736555,1.0,-0.074992,0.587932,0.470156,-0.189961,0.067615,-0.241169,-0.151406,0.01445,...,-0.50411,-0.508855,-0.332061,-0.427167,-0.28937,-0.277921,0.88721,0.984503,-0.226001,-0.157141
Rainfall,0.103938,-0.074992,1.0,-0.064351,-0.227549,0.035316,0.133659,0.068652,0.03406,0.087338,...,0.224405,0.255755,-0.168154,-0.126534,0.198528,0.172403,0.011192,-0.079657,0.501516,0.236461
Evaporation,0.466993,0.587932,-0.064351,1.0,0.365602,-0.097017,0.203021,-0.124446,-0.067127,0.193084,...,-0.504092,-0.390243,-0.270362,-0.293581,-0.183793,-0.182618,0.545115,0.572893,-0.185197,-0.117719
Sunshine,0.072586,0.470156,-0.227549,0.365602,1.0,-0.070306,-0.03475,-0.082111,-0.039864,0.005499,...,-0.490819,-0.62913,0.04197,-0.019719,-0.675323,-0.70393,0.291188,0.490501,-0.324613,-0.443561
WindGustDir,-0.176323,-0.189961,0.035316,-0.097017,-0.070306,1.0,0.14444,0.355927,0.564141,-0.00075,...,0.055518,0.014615,-0.164097,-0.097898,0.072232,0.074368,-0.195585,-0.19975,0.109109,0.059817
WindGustSpeed,0.177415,0.067615,0.133659,0.203021,-0.03475,0.14444,1.0,0.044181,0.138491,0.605303,...,-0.21507,-0.026327,-0.458744,-0.413749,0.071736,0.109168,0.15015,0.032748,0.151605,0.229195
WindDir9am,-0.101278,-0.241169,0.068652,-0.124446,-0.082111,0.355927,0.044181,1.0,0.261895,-0.007367,...,0.136257,0.156166,-0.064724,0.013762,0.077716,0.051256,-0.17281,-0.251392,0.146068,0.037027
WindDir3pm,-0.180212,-0.151406,0.03406,-0.067127,-0.039864,0.564141,0.138491,0.261895,1.0,0.01641,...,0.025388,-0.048623,-0.166488,-0.10158,0.041971,0.058699,-0.183184,-0.156264,0.095618,0.045575
WindSpeed9am,0.175064,0.01445,0.087338,0.193084,0.005499,-0.00075,0.605303,-0.007367,0.01641,1.0,...,-0.270858,-0.031614,-0.228743,-0.175817,0.025112,0.054639,0.128545,0.004569,0.099084,0.087145


<h4> Visualicemos esto con un gráfico </h4>

In [None]:
# Annotated heatmap of the global correlation matrix.
# The matrix computed earlier is named `corr_data`.
plt.figure(figsize=(20,12))
sns.heatmap(corr_data, annot=True)


<h4>Cantidad de valores nulos en porcentaje </h4>

In [None]:
# Percentage of missing values in each column.
data.isna().sum().mul(100).div(len(data))

<h4>Dado que queremos un modelo que prediga si lloverá o no en una o varias de las ciudades de la base de datos, optaremos por predecir con un modelo de árbol de decisiones (clasificador) </h4>

<h4>Puesto que queremos predecir específicamente en cada ciudad, como primer acercamiento separaremos los datos de cada ciudad en la base de datos y los limpiaremos para proceder a crear el modelo. </h4>

Para esto creamos un diccionario que contenga las bases de datos de cada lugar


In [None]:
# Split the working frame into one DataFrame per location, keyed by name.
# Each entry is an explicit .copy(), so the column assignments made while
# cleaning do not raise SettingWithCopyWarning and cannot affect `data`.
Loc = data['Location'].unique()
df_dict_Loc = {loc: data.loc[data['Location'] == loc].copy() for loc in Loc}
df_dict_Loc

In [None]:
# Inspect the per-location frames (displays every DataFrame in the dict).
df_dict_Loc

Ahora procedemos a limpiar cada base de datos. 

Lo primero será ver qué porcentaje de valores nulos hay en cada base de datos.

In [None]:
# Percentage of missing values per column, computed separately for each location.
null_val_perc = {
    loc: frame.isnull().sum() * 100 / len(frame.index)
    for loc, frame in df_dict_Loc.items()
}

In [None]:
# Inspect the per-location missing-value percentages.
null_val_perc

Veamos las columnas con más valores nulos y borremos las columnas que tengan todos sus valores nulos

In [None]:
# For every location, remove the columns that are 100% missing, both from the
# data frame and from the matching percentage series.
for key, perc in null_val_perc.items():
    empty_cols = [col for col in df_dict_Loc[key].columns if perc[col] == 100.0]
    if empty_cols:
        df_dict_Loc[key] = df_dict_Loc[key].drop(columns=empty_cols)
        null_val_perc[key] = perc.drop(labels=empty_cols)

In [None]:
# Inspect the frames again after dropping the fully-missing columns.
df_dict_Loc

In [None]:
# Remaining missing-value percentages after the drop.
null_val_perc

Dado que todavía hay columnas con valores nulos, en esta ocasión optaremos por rellenar los valores con la media o la moda, dependiendo del tipo de datos.

Columnas que se rellenan con la moda.

In [None]:
# Columns whose gaps are filled with the most frequent value of the location.
Col_Mode = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'Cloud9am', 'Cloud3pm']

for frame in df_dict_Loc.values():
    # Some locations no longer have every column (fully-missing ones were dropped).
    present = [col for col in Col_Mode if col in frame.columns]
    for col in present:
        most_common = frame[col].mode()[0]
        frame[col] = frame[col].fillna(most_common)

Columnas que se rellenan con la media.

In [None]:
# Fill the remaining gaps of every numeric column with that column's mean for
# the location. numeric_only=True is required because the frames still hold
# string columns (Date, Location), for which a mean cannot be computed.
# The frame is reassigned instead of mutated with inplace=True.
for key in df_dict_Loc.keys():
    col_means = df_dict_Loc[key].mean(numeric_only=True)
    df_dict_Loc[key] = df_dict_Loc[key].fillna(col_means)

Ahora no hay base de datos con valores nulos. Corroboremos esto.

In [None]:
# Recompute the per-location missing-value percentages after the fill steps.
null_val_perc = {
    loc: frame.isnull().sum() * 100 / len(frame.index)
    for loc, frame in df_dict_Loc.items()
}

In [None]:
# Inspect the recomputed percentages to verify the fill.
null_val_perc

Ahora veamos los coeficientes de correlación y decidamos con cuáles características quedarnos
para el modelo.

In [None]:
# Correlation matrix of each location, computed over numeric columns only.
# The frames still contain string columns (Date, Location), which must be
# excluded before calling .corr().
coef_corr = {elem : pd.DataFrame() for elem in Loc}
for key in df_dict_Loc.keys():
    coef_corr[key] = df_dict_Loc[key].select_dtypes(include='number').corr()

In [None]:
# Inspect the per-location correlation matrices.
coef_corr

In [None]:
# One annotated heatmap per location. The loop variable must be the same name
# used to index coef_corr, otherwise every figure shows the same matrix.
for key in df_dict_Loc.keys():
    plt.figure(figsize=(20,12))
    sns.heatmap(coef_corr[key], annot=True)
    plt.title(key)  # label each figure with its location

Para el modelo elegiremos las columnas con coeficientes cuyos valores absolutos son
mayores o iguales a .1 

Coeficientes positivos

In [None]:
# Rows of each correlation matrix whose correlation with RainTomorrow is >= 0.1,
# excluding the RainTomorrow row itself.
coef_corr_p = {}
for key in df_dict_Loc.keys():
    corr_k = coef_corr[key]
    strong_pos = corr_k.loc[corr_k['RainTomorrow'] >= .1]
    coef_corr_p[key] = strong_pos.drop(index='RainTomorrow')

In [None]:
# Inspect the positively correlated rows per location.
coef_corr_p

Coeficientes negativos

In [None]:
# Rows of each correlation matrix whose correlation with RainTomorrow is <= -0.1.
coef_corr_n = {}
for key in df_dict_Loc.keys():
    corr_k = coef_corr[key]
    coef_corr_n[key] = corr_k.loc[corr_k['RainTomorrow'] <= -.1]

In [None]:
# Inspect the negatively correlated rows per location.
coef_corr_n

Ahora tomemos esas características

In [None]:
# Number of selected rows (positive + negative) per location, stored as a
# one-element list under each location key.
total_len = {
    key: [len(coef_corr_p[key].index) + len(coef_corr_n[key].index)]
    for key in df_dict_Loc.keys()
}

In [None]:
# Spot check: number of selected rows for Albury.
total_len['Albury'][0]

In [None]:
# Feature names per location: the positively correlated columns followed by
# the negatively correlated ones. Plain concatenation replaces the manual
# index arithmetic, whose "+1" offset only covered every negative column by
# wrapping around through index -1 (which also put the last one first).
features = {
    key: list(coef_corr_p[key].index) + list(coef_corr_n[key].index)
    for key in df_dict_Loc.keys()
}

In [None]:
# Inspect the selected feature names per location.
features

In [None]:
# Next, build the models.

In [None]:
# Feature matrix per location: an independent copy of the selected columns.
X = {
    key: frame.loc[:, features[key]].copy()
    for key, frame in df_dict_Loc.items()
}

In [None]:
# Inspect the per-location feature matrices.
X

In [None]:
# Target per location: a one-column DataFrame holding RainTomorrow.
y = {
    key: frame.loc[:, ['RainTomorrow']].copy()
    for key, frame in df_dict_Loc.items()
}

In [None]:
# Inspect the per-location targets.
y

Ya tenemos todo para hacer el modelo; podemos elegir cualquier "llave" del
diccionario y ver qué tan bueno es el modelo

In [None]:
# Location to model, and a reproducible 67/33 train/test split of its data.
key = 'Canberra'
X_loc = X[key]
y_loc = y[key]
X_train, X_test, y_train, y_test = train_test_split(
    X_loc,
    y_loc,
    test_size=0.33,
    random_state=324,
)

In [None]:
# Sweep the tree size (max_leaf_nodes from 3 to 24) and record the test
# accuracy of each fitted tree in val_met (one row per size).
val_met=pd.DataFrame(columns=['node','accuracy'],index=range(22))
for i in range(3,25):
    Rain_Tomorrow = DecisionTreeClassifier(max_leaf_nodes=i, random_state=0)
    Rain_Tomorrow.fit(X_train, y_train)
    predictions = Rain_Tomorrow.predict(X_test)
    # Compute the score once and reuse it for both the table and the log line.
    acc = accuracy_score(y_true = y_test, y_pred = predictions)
    j=i-3  # row position: sizes start at 3, rows start at 0
    # Single-step positional assignment: chained `iloc[j][0] = ...` may write
    # to a temporary row instead of val_met.
    val_met.iloc[j, 0] = i
    val_met.iloc[j, 1] = acc
    print(i,' ',acc)