In [107]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Load the pre-cleaned weather observations; the path is resolved relative to
# the notebook's working directory.
weather = pd.read_csv('./dataset/clean-dataset.csv')

# Report the frame's dimensions as a quick sanity check on the load.
rows = len(weather.index)
columns = len(weather.columns)
print(f"Rows: {rows}, Columns: {columns}")

Rows: 142146, Columns: 25


### Conversión de variables

In [108]:
# Preview the first five rows of the loaded frame.
weather.head(5)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,DailyTempRange,YearMonth
0,2008-07-02,Adelaide,12.7,15.8,0.8,1.4,7.8,SW,35.0,SSW,...,1022.4,1022.6,,,13.7,15.5,0.0,0.0,3.1,2008-07
1,2008-07-03,Adelaide,6.2,15.1,0.0,1.8,2.1,W,20.0,NNE,...,1027.8,1026.5,,,9.3,13.9,0.0,0.0,8.9,2008-07
2,2008-07-04,Adelaide,5.3,15.9,0.0,1.4,8.0,NNE,30.0,NNE,...,1028.7,1025.6,,,10.2,15.3,0.0,0.0,10.6,2008-07
3,2008-07-06,Adelaide,11.3,15.7,0.0,1.4,1.5,NNW,52.0,NNE,...,1019.5,1016.2,,,13.0,14.4,0.0,1.0,4.4,2008-07
4,2008-07-07,Adelaide,7.6,11.2,16.2,4.6,1.1,WSW,46.0,WNW,...,1015.9,1017.9,,,9.8,9.3,1.0,1.0,3.6,2008-07


In [109]:
# Summarize each column's dtype and non-null count for the loaded frame.
weather.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142146 entries, 0 to 142145
Data columns (total 25 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Date            142146 non-null  object 
 1   Location        142146 non-null  object 
 2   MinTemp         142146 non-null  float64
 3   MaxTemp         142146 non-null  float64
 4   Rainfall        142146 non-null  float64
 5   Evaporation     142146 non-null  float64
 6   Sunshine        142146 non-null  float64
 7   WindGustDir     132820 non-null  object 
 8   WindGustSpeed   142146 non-null  float64
 9   WindDir9am      132135 non-null  object 
 10  WindDir3pm      138370 non-null  object 
 11  WindSpeed9am    142146 non-null  float64
 12  WindSpeed3pm    142146 non-null  float64
 13  Humidity9am     142146 non-null  float64
 14  Humidity3pm     142146 non-null  float64
 15  Pressure9am     142146 non-null  float64
 16  Pressure3pm     142146 non-null  float64
 17  Cloud9am  

#### Conversión de Date

In [110]:
# NOTE(review): the datetime conversion below is left disabled. "Date" is
# dropped before the train/test split further down, so the parse looks
# unnecessary for this pipeline — confirm, then either delete this cell or
# re-enable it if date-derived features are wanted.
# weather["Date"] = pd.to_datetime(weather["Date"])

#### Conversión de RainToday y RainTomorrow a Boolean

In [111]:
# Recode the rain indicators as booleans with a vectorized comparison instead
# of a per-row Python lambda. Only the value 1.0 maps to True; every other
# value maps to False, including a missing one, because NaN never compares
# equal to anything.
# NOTE(review): a missing RainTomorrow would therefore silently become a
# "no rain" label — confirm the cleaned dataset has no nulls in these columns.
weather["RainToday"] = weather["RainToday"].eq(1.0)
weather["RainTomorrow"] = weather["RainTomorrow"].eq(1.0)


#### Conversión de variables categóricas

In [112]:
# Categorical columns to expand into one indicator column per category.
variables_categoricas = [
    "Location",
    "WindGustDir",
    "WindDir3pm",
    "WindDir9am",
]

# One-hot encode those columns; drop_first=True omits the first category of
# each variable so the remaining indicators are not perfectly collinear.
# NOTE(review): the info() summary shows nulls in the three wind-direction
# columns. With get_dummies' default NaN handling those rows get all-False
# indicators, the same encoding as the dropped first category — confirm this
# is acceptable or pass dummy_na=True.
weather_with_dummies = pd.get_dummies(
    data=weather,
    columns=variables_categoricas,
    drop_first=True,
)

### Resultados posteriores a la conversión de variables

In [113]:
# Preview the first five rows after one-hot encoding.
weather_with_dummies.head(5)

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,...,WindDir9am_NNW,WindDir9am_NW,WindDir9am_S,WindDir9am_SE,WindDir9am_SSE,WindDir9am_SSW,WindDir9am_SW,WindDir9am_W,WindDir9am_WNW,WindDir9am_WSW
0,2008-07-02,12.7,15.8,0.8,1.4,7.8,35.0,13.0,15.0,75.0,...,False,False,False,False,False,True,False,False,False,False
1,2008-07-03,6.2,15.1,0.0,1.8,2.1,20.0,2.0,11.0,81.0,...,False,False,False,False,False,False,False,False,False,False
2,2008-07-04,5.3,15.9,0.0,1.4,8.0,30.0,6.0,13.0,71.0,...,False,False,False,False,False,False,False,False,False,False
3,2008-07-06,11.3,15.7,0.0,1.4,1.5,52.0,15.0,22.0,62.0,...,False,False,False,False,False,False,False,False,False,False
4,2008-07-07,7.6,11.2,16.2,4.6,1.1,46.0,17.0,13.0,83.0,...,False,False,False,False,False,False,False,False,True,False


In [114]:
# Shape of the original frame, for comparison with the encoded frame.
weather.shape

(142146, 25)

In [115]:
# Shape after one-hot encoding the categorical columns.
weather_with_dummies.shape

(142146, 114)

In [116]:
# Summarize the dtypes and memory usage of the encoded frame.
weather_with_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142146 entries, 0 to 142145
Columns: 114 entries, Date to WindDir9am_WSW
dtypes: bool(95), float64(17), object(2)
memory usage: 33.5+ MB


### Separación del dataset

In [117]:
# Remove the columns that are not used as model inputs. Re-binding the name
# (instead of dropping in place) together with errors="ignore" keeps this cell
# safe to re-run: a second execution no longer raises KeyError for columns
# that were already removed. The trade-off is that a misspelled column name
# would also be skipped silently.
weather_with_dummies = weather_with_dummies.drop(
    columns=["Date", "YearMonth"], errors="ignore"
)

# Features are every remaining column except the target.
X = weather_with_dummies.drop(columns="RainTomorrow")
y = weather_with_dummies["RainTomorrow"]

# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# rain/no-rain classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### Escalado de variables numéricas

In [118]:
# Learn the scaling statistics from the training split only, then apply them
# to that same split.
# NOTE(review): the scaler is fitted on every column of X_train, which includes
# the boolean indicator columns as well as the numeric ones — confirm that is
# intended. The head() preview also shows empty Cloud9am/Cloud3pm values;
# if those are NaN they will still be present after scaling — verify they are
# imputed before modeling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Apply the same, already-fitted transformation to the test split.
X_test_scaled = scaler.transform(X_test)