In [31]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from geopy.geocoders import Nominatim
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer

In [5]:
rain = pd.read_csv('data/rain_data_aus.csv')

In [6]:
wind01 = pd.read_csv('data/wind_table_01.csv')
wind02 = pd.read_csv('data/wind_table_02.csv')
wind03 = pd.read_csv('data/wind_table_03.csv')
wind04 = pd.read_csv('data/wind_table_04.csv')
wind05 = pd.read_csv('data/wind_table_05.csv')
wind06 = pd.read_csv('data/wind_table_06.csv')
wind07 = pd.read_csv('data/wind_table_07.csv')
wind08 = pd.read_csv('data/wind_table_08.csv')

In [7]:
rain.head(3)

Unnamed: 0,date,location,mintemp,maxtemp,rainfall,evaporation,sunshine,humidity9am,humidity3pm,pressure9am,...,temp9am,temp3pm,raintoday,amountOfRain,raintomorrow,temp,humidity,precipitation3pm,precipitation9am,modelo_vigente
0,2008-12-01,Albury,13.4,22.9,0.6,,,71.0,22.0,1007.7,...,16.9,21.8,No,0.0,No,29.48,28.4,12,5.11536,0.089825
1,2008-12-02,Albury,7.4,25.1,0.0,,,44.0,25.0,1010.6,...,17.2,24.3,No,0.0,No,32.12,2.208569,10,21.4971,0.023477
2,2008-12-03,Albury,12.9,25.7,0.0,,,38.0,30.0,1007.6,...,21.0,23.2,No,0.0,No,32.84,38.0,17,20.782859,0.02758


In [8]:
print(f"wind01 - {wind01.columns}")
print(f"wind02 - {wind02.columns}")
print(f"wind03 - {wind03.columns}")
print(f"wind04 - {wind04.columns}")
print(f"wind05 - {wind05.columns}")
print(f"wind06 - {wind06.columns}")
print(f"wind07 - {wind07.columns}")
print(f"wind08 - {wind08.columns}")

wind01 - Index(['date', 'location', 'wind_gustdir', 'wind_gustspeed', 'wind_dir9am',
       'wind_dir3pm', 'wind_speed9am', 'wind_speed3pm'],
      dtype='object')
wind02 - Index(['date', 'location', 'wind_gustdir', 'wind_gustspeed', 'wind_dir9am',
       'wind_dir3pm', 'wind_speed9am', 'wind_speed3pm'],
      dtype='object')
wind03 - Index(['date', 'location', 'windgustdir', 'windgustspeed', 'winddir9am',
       'winddir3pm', 'windspeed9am', 'windspeed3pm'],
      dtype='object')
wind04 - Index(['date', 'location', 'windgustdir', 'windgustspeed', 'winddir9am',
       'winddir3pm', 'windspeed9am', 'windspeed3pm'],
      dtype='object')
wind05 - Index(['date', 'location', 'windgustdir', 'windgustspeed', 'winddir9am',
       'winddir3pm', 'windspeed9am', 'windspeed3pm'],
      dtype='object')
wind06 - Index(['date', 'location', 'windgustdir', 'windgustspeed', 'winddir9am',
       'winddir3pm', 'windspeed9am', 'windspeed3pm'],
      dtype='object')
wind07 - Index(['date', 'location', 'win

In [9]:
datasets = [wind01, wind02, wind03, wind04, wind05, wind06, wind07, wind08]

for data in datasets:
    complete = pd.concat([rain, data.rename(columns={'windgustdir':'wind_gustdir', 
                                        'windgustspeed':'wind_gustspeed', 
                                        'winddir9am':'wind_dir9am',
                                       'winddir3pm':'wind_dir3pm',
                                       'windspeed9am':'wind_speed9am',
                                       'windspeed3pm':'wind_speed3pm'})])

In [10]:
complete.head(3)

Unnamed: 0,date,location,mintemp,maxtemp,rainfall,evaporation,sunshine,humidity9am,humidity3pm,pressure9am,...,humidity,precipitation3pm,precipitation9am,modelo_vigente,wind_gustdir,wind_gustspeed,wind_dir9am,wind_dir3pm,wind_speed9am,wind_speed3pm
0,2008-12-01,Albury,13.4,22.9,0.6,,,71.0,22.0,1007.7,...,28.4,12.0,5.11536,0.089825,,,,,,
1,2008-12-02,Albury,7.4,25.1,0.0,,,44.0,25.0,1010.6,...,2.208569,10.0,21.4971,0.023477,,,,,,
2,2008-12-03,Albury,12.9,25.7,0.0,,,38.0,30.0,1007.6,...,38.0,17.0,20.782859,0.02758,,,,,,


In [11]:
complete.shape

(164386, 29)

In [12]:
complete.isna().sum()

date                     0
location                 0
mintemp              22830
maxtemp              22515
rainfall             23599
evaporation          83036
sunshine             90009
humidity9am          23967
humidity3pm          25803
pressure9am          36207
pressure3pm          36174
cloud9am             75850
cloud3pm             79287
temp9am              23097
temp3pm              24919
raintoday            23599
amountOfRain         22193
raintomorrow         22193
temp                 22515
humidity             25803
precipitation3pm     22193
precipitation9am     22193
modelo_vigente       22193
wind_gustdir        143441
wind_gustspeed      143441
wind_dir9am         143608
wind_dir3pm         143250
wind_speed9am       142298
wind_speed3pm       143146
dtype: int64

In [13]:
def location_to_coord(data, country):
    """
    Function to convert city name to Latitude and Longitude
    Takes a DataFrame with a 'location' column, and a Country name as string
    Returns a final DataFrame with two new columns 'latitude' and 'longitude'

    """
    # importing library and instantiating it, to retrieve location information
    from geopy.geocoders import Nominatim
    geolocator = Nominatim(user_agent="my-application")

    # Inserting space between city names when needed
    data['location'] = data['location'].str.replace(
        r"([A-Z]+)", r" \1").str.strip()

    # Retrieving the unique cities names from the DataFrame, and storing into a list
    cities = [x for x in data['location'].unique()]
    lat = []
    long = []

    # iterating every location name retrieving the required information
    for city in tqdm(cities):
        location = geolocator.geocode(city, country)
        lat.append(location.latitude)
        long.append(location.longitude)
        res = pd.DataFrame(list(zip(cities, lat, long)), columns=[
                           'location', 'latitude', 'longitude'])
        final_data = pd.merge(complete, res, on='location').sort_values(
            'date').reset_index(drop=True)
    return final_data

In [14]:
data_final = location_to_coord(complete, 'Australia')

100%|██████████████████████████████████████████████████████████████████████████████████| 49/49 [00:59<00:00,  1.21s/it]


In [15]:
data_final.head(3)

Unnamed: 0,date,location,mintemp,maxtemp,rainfall,evaporation,sunshine,humidity9am,humidity3pm,pressure9am,...,precipitation9am,modelo_vigente,wind_gustdir,wind_gustspeed,wind_dir9am,wind_dir3pm,wind_speed9am,wind_speed3pm,latitude,longitude
0,2007-11-01,Canberra,8.0,24.3,0.0,3.4,6.3,68.0,29.0,1019.7,...,9.613089,0.538687,,,,,,,-35.297591,149.101268
1,2007-11-02,Canberra,14.0,26.9,3.6,4.4,9.7,80.0,36.0,1012.4,...,17.115142,0.44592,,,,,,,-35.297591,149.101268
2,2007-11-03,Canberra,13.7,23.4,3.6,5.8,3.3,82.0,69.0,1009.5,...,17.391294,0.861744,,,,,,,-35.297591,149.101268


In [16]:
def date_to_season(data):
    """
    Function to create season column, based on date
    Receives a DataFrame
    
    """
    ## Converting date column to Datetime type
    data['date'] = pd.to_datetime(data['date'])
    seasons = {12:1, 1:1, 2:1, 
                 3:2, 4:2, 5:2,
                 6:3, 7:3, 8:3,
                 9:4, 10:4, 11:4}
    
    ## Creating a 'month' column to determine season by month
    data['month'] = data.loc[:,'date'].apply(lambda x: x.month)
    
    ## Creating 'season' column by month
    data['season'] = data.loc[:,'month'].apply(lambda x: seasons[x])
    
    ## Droping the 'month column'
    data.drop(columns=['month'], inplace=True, axis=1)

In [17]:
date_to_season(data_final)

In [18]:
data_final.head(3)

Unnamed: 0,date,location,mintemp,maxtemp,rainfall,evaporation,sunshine,humidity9am,humidity3pm,pressure9am,...,modelo_vigente,wind_gustdir,wind_gustspeed,wind_dir9am,wind_dir3pm,wind_speed9am,wind_speed3pm,latitude,longitude,season
0,2007-11-01,Canberra,8.0,24.3,0.0,3.4,6.3,68.0,29.0,1019.7,...,0.538687,,,,,,,-35.297591,149.101268,4
1,2007-11-02,Canberra,14.0,26.9,3.6,4.4,9.7,80.0,36.0,1012.4,...,0.44592,,,,,,,-35.297591,149.101268,4
2,2007-11-03,Canberra,13.7,23.4,3.6,5.8,3.3,82.0,69.0,1009.5,...,0.861744,,,,,,,-35.297591,149.101268,4


In [123]:
city = 'sao paulo'
country = 'brazil'
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(city, country)
res = [location.latitude, location.longitude]
teste = str(location.address).split(',')

print(f"{teste[0]}")
print(f"Latitude - {location.latitude}")
print(f"Longitude - {location.longitude}")
print(f"Altitude - {location.altitude}")

São Paulo
Latitude - -23.5506507
Longitude - -46.6333824
Altitude - 0.0


In [124]:
location.address

'São Paulo, Região Imediata de São Paulo, Região Metropolitana de São Paulo, Região Intermediária de São Paulo, São Paulo, Região Sudeste, Brasil'

In [19]:
X = data_final.drop('modelo_vigente', axis=1)
y = data_final['modelo_vigente']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
model = LogisticRegression()

In [22]:
cat_columns = list(X_train.select_dtypes('object').columns)
cat_columns

['location',
 'raintoday',
 'raintomorrow',
 'wind_gustdir',
 'wind_dir9am',
 'wind_dir3pm']

In [23]:
categorical_imputer = SimpleImputer(strategy='most_frequent')

In [24]:
num_columns = list(X_train.select_dtypes(exclude='object').columns)
num_columns

['date',
 'mintemp',
 'maxtemp',
 'rainfall',
 'evaporation',
 'sunshine',
 'humidity9am',
 'humidity3pm',
 'pressure9am',
 'pressure3pm',
 'cloud9am',
 'cloud3pm',
 'temp9am',
 'temp3pm',
 'amountOfRain',
 'temp',
 'humidity',
 'precipitation3pm',
 'precipitation9am',
 'wind_gustspeed',
 'wind_speed9am',
 'wind_speed3pm',
 'latitude',
 'longitude',
 'season']

In [25]:
numerical_imputer = SimpleImputer(strategy='median')

In [29]:
ordinal_encoder = OrdinalEncoder()
scaler = StandardScaler()

In [28]:
categorical_pipeline = Pipeline(steps=[('missing', categorical_imputer),
                                       ('encode', ordinal_encoder)])

In [30]:
numerical_pipeline = Pipeline(steps=[('missing', numerical_imputer),
                                     ('standardize', scaler)])

In [32]:
dataprep = ColumnTransformer(transformers=[('numerical_transform', numerical_pipeline, num_columns),
                                           ('categ_transform', categorical_pipeline, cat_columns)])

In [33]:
pipeline = Pipeline(steps=[('preprocessing', dataprep),
                           ('modelling', model)])

In [34]:
pipeline

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical_transform',
                                                  Pipeline(memory=None,
                                                           steps=[('missing',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                            

In [38]:
pipeline.fit(X_train, y_train)

TypeError: invalid type promotion