Generate a model to predict rain or no rain for each station

In [19]:
# Mount Google Drive when running on Google Colab. In a plain Jupyter notebook
# the google.colab package is not installed, so the mount is skipped
# automatically and these lines no longer have to be commented out by hand.
try:
    from google.colab import drive
except ImportError:
    print('google.colab not available: skipping Google Drive mount')
else:
    drive.mount('/content/drive')


%load_ext autoreload
%autoreload 2
import os
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline 
import joblib

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Root folder that contains the notebooks and the data; change it to match your setup.
# If this notebook is run from inside the `notebooks` folder, root_path = '../' should work.
# Keep the trailing '/': the other cells build paths by string concatenation (root_path + 'data/...').
root_path  = '/content/drive/MyDrive/Colab Notebooks/skipass-master/' 


In [3]:
# Read the cleaned weather data from the data folder under root_path
# and preview the first rows.
csv_path = f'{root_path}data/data_weather_cleaned.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,date,numer_sta,Latitude,Longitude,Altitude,pmer,dd,ff,t,u,ssfrai,rr3,pres,dd_sin,dd_cos
0,2010-01-01 00:00:00,7510.0,44.830667,-0.691333,47.0,99050.0,230.0,9.8,9.6,81.0,0.0,0,98410.0,-0.766044,-0.642788
1,2010-01-01 03:00:00,7510.0,44.830667,-0.691333,47.0,99160.0,250.0,11.8,8.7,87.0,0.0,0,98520.0,-0.939693,-0.34202
2,2010-01-01 06:00:00,7510.0,44.830667,-0.691333,47.0,99570.0,290.0,5.1,7.6,91.0,0.0,0,98920.0,-0.939693,0.34202
3,2010-01-01 09:00:00,7510.0,44.830667,-0.691333,47.0,99990.0,310.0,5.7,6.8,92.0,0.0,0,99340.0,-0.766044,0.642788
4,2010-01-01 12:00:00,7510.0,44.830667,-0.691333,47.0,100350.0,310.0,6.2,6.6,82.0,0.0,0,99690.0,-0.766044,0.642788


In [4]:
# (rows, columns) of the loaded data
df.shape

(505857, 15)

In [5]:
# Generate a list with the stations
stations = df['numer_sta'].unique()

# Eliminate station 7591 (original note: pmer = inf). It is removed by value
# rather than by position, so the right station is dropped even if the order
# of the rows in the CSV changes.
stations = stations[stations != 7591]
stations

array([7510., 7434., 7643., 7690., 7481., 7630., 7255., 7240., 7460.,
       7222., 7577., 7280., 7299., 7650.])

In [6]:
# Count the missing values in each column
df.isna().sum()

date          0
numer_sta     0
Latitude      0
Longitude     0
Altitude      0
pmer          0
dd            0
ff            0
t             0
u             0
ssfrai        0
rr3           0
pres         66
dd_sin        0
dd_cos        0
dtype: int64

In [7]:
# Instantiate the model: the scaler and the classifier are chained in a single
# Pipeline object, so both steps are fitted (and later saved) together.
model = Pipeline(
    steps=[
        ('scaler', MinMaxScaler()),
        ('logistic', LogisticRegression()),
    ]
)


## Training

In [24]:
# Create directory to save models.
# exist_ok=True makes the cell safe to re-run and removes the separate
# exists-then-create check.
directory = root_path+'models'
os.makedirs(directory, exist_ok=True)

In [25]:
# TRAINING: fit, evaluate and save one rain / no-rain classifier per station.
# For each station the cell prints, in this order: train accuracy, test accuracy,
# macro-averaged precision and macro-averaged recall on the test split.

for station in stations:
  # Select the data that corresponds to the station
  df_station = df[df['numer_sta'] == station]
  # Separate the rainy (rr3 == 1) and no-rainy (rr3 == 0) segments (called "days")
  df_rain    = df_station[df_station['rr3'] == 1]
  df_no_rain = df_station[df_station['rr3'] == 0]
  days_of_rain    = len(df_rain)
  days_of_no_rain = len(df_no_rain)
  print('=====================================================================')
  print(f'Working on station: {station}')
  print(f'days_of_no_rain: {days_of_no_rain}, days_of_rain: {days_of_rain}')
  # Eliminating no-rainy days at random to balance the number of instances of each class.
  # pd.concat replaces DataFrame.append, which was removed in pandas 2.0, and the
  # fixed random_state makes the undersampling reproducible between runs.
  df_station = pd.concat([df_rain, df_no_rain.sample(days_of_rain, random_state=51)])
  X = df_station[['t', 'u', 'pmer']]
  y = df_station['rr3']
  # Split data
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=51)
  # Pipeline.fit refits every step, so reusing the same `model` object for each
  # station does not carry anything over from the previous station.
  model.fit(X_train, y_train)
  y_test_pred = model.predict(X_test)
  print(model.score(X_train,y_train))
  print(model.score(X_test,y_test))
  print(precision_score(y_test, y_test_pred, average='macro'))
  print(recall_score(y_test, y_test_pred, average='macro'))
  # Save the fitted pipeline; the file name carries the integer station number
  model_path  = root_path + f'models/model{int(station)}.pkl'
  joblib.dump(model, model_path)
  print(f'Saved in {model_path}')

Working on station: 7510.0
days_of_no_rain: 33079, days_of_rain: 697
0.8370846730975349
0.8481561822125814
0.849673264334819
0.8485147955726225
Saved in /content/drive/MyDrive/Colab Notebooks/skipass-master/models/model7510.pkl
Working on station: 7434.0
days_of_no_rain: 32990, days_of_rain: 787
0.8444022770398482
0.8461538461538461
0.8563814527940088
0.849790305131967
Saved in /content/drive/MyDrive/Colab Notebooks/skipass-master/models/model7434.pkl
Working on station: 7643.0
days_of_no_rain: 33017, days_of_rain: 479
0.8861154446177847
0.8706624605678234
0.8738522954091816
0.875110149803733
Saved in /content/drive/MyDrive/Colab Notebooks/skipass-master/models/model7643.pkl
Working on station: 7690.0
days_of_no_rain: 33053, days_of_rain: 733
0.869653767820774
0.8863636363636364
0.8870460565666922
0.886960087479497
Saved in /content/drive/MyDrive/Colab Notebooks/skipass-master/models/model7690.pkl
Working on station: 7481.0
days_of_no_rain: 33063, days_of_rain: 732
0.863265306122449
0.

## TEST

In [9]:
# Reload one saved model from disk with joblib.
# NOTE(review): model_path is not defined in this cell; it appears to be the value
# left by a previously executed cell (e.g. the last iteration of the training loop).
# Confirm the intended station and that the cells are run in order.
loaded_model = joblib.load(model_path) # loading a model

In [11]:
# Evaluate the reloaded model on the train/test split that is currently in
# memory (X_train, X_test, y_train, y_test come from the training cell).
# The extra score() call whose result was discarded has been removed.
y_test_pred = loaded_model.predict(X_test)
print(loaded_model.score(X_train,y_train))
print(loaded_model.score(X_test,y_test))


0.8763066202090593
0.8661971830985915


## Load all models in memory
`locals()` is used so that each loaded model is bound to a variable named after its station (e.g. `model7510`).

In [12]:
# Load all models.
# Each model is stored in the namespace returned by locals() under the name
# 'model<station>' (e.g. model7510); get_rain looks the models up through lcl.
# The path is built from root_path, like in the training cell, instead of
# repeating the absolute Google Drive path.
# Note: joblib.load unpickles the file, so only load model files you created yourself.
lcl = locals()
for station in stations:
  model_path = root_path + f'models/model{int(station)}.pkl'
  loaded_model = joblib.load(model_path)
  variable_name = f'model{int(station)}'
  lcl[variable_name] = loaded_model
  print(variable_name)



model7510
model7434
model7643
model7690
model7481
model7630
model7255
model7240
model7460
model7222
model7577
model7280
model7299
model7650


Once all the models are loaded, you can use the `get_rain` function to predict rain for a given station.

In [13]:
def get_rain(numer_sta, t, u, pmer): # numer_sta is the station number integer
  """Predict rain / no rain for one station from a single observation.

  Parameters
  ----------
  numer_sta : station number, e.g. 7240. It is converted with int(), so a
      float such as 7240.0 selects the same model.
  t, u, pmer : feature values for the observation. They are passed to the
      model under the column names 't', 'u' and 'pmer', in that order.

  Returns
  -------
  Whatever the station model's predict() returns for the single-row input.

  Raises
  ------
  KeyError
      If no model named 'model<numer_sta>' is present in lcl.
  """
  variable_name = f'model{int(numer_sta)}'
  try:
    station_model = lcl[variable_name]
  except KeyError:
    raise KeyError(f'No model loaded for station {numer_sta} (expected {variable_name})') from None
  # A DataFrame with named columns is used instead of a bare array so that a
  # model fitted on a DataFrame receives the feature names it was fitted with.
  X_obs = pd.DataFrame([[t, u, pmer]], columns=['t', 'u', 'pmer'])
  return station_model.predict(X_obs)


In [14]:
get_rain(7240, 2.8, 4, 100190)

array([0])