In [1]:
import pandas as pd

df = pd.read_csv("climat_data.csv", index_col=0)

In [2]:
df

Unnamed: 0,time,temperature_2m,relative_humidity_2m,pressure_msl,wind_speed_10m,precipitation
0,2024-11-10T00:00,10.9,97,1025.9,4.7,0.0
1,2024-11-10T01:00,11.2,98,1025.9,4.2,0.0
2,2024-11-10T02:00,11.3,98,1025.8,3.9,0.0
3,2024-11-10T03:00,10.8,99,1025.7,3.5,0.0
4,2024-11-10T04:00,10.2,99,1025.9,5.8,0.0
...,...,...,...,...,...,...
8779,2025-11-10T19:00,11.0,81,1011.2,12.4,0.0
8780,2025-11-10T20:00,10.7,82,1011.4,12.0,0.0
8781,2025-11-10T21:00,10.5,85,1011.6,12.1,0.0
8782,2025-11-10T22:00,10.0,88,1011.8,12.2,0.0


Préparation & Exploration

In [3]:
df_original = df
df = df.copy()
df.rename(columns={
    "time": "datetime",
    "temperature_2m": "temperature",
    "relative_humidity_2m": "humidite",
    "pressure_msl": "pression",
    "wind_speed_10m": "vent",
    "precipitation": "precipitation"
}, inplace=True)

df["datetime"] = pd.to_datetime(df["datetime"])
df.sort_values("datetime", inplace=True)
df.reset_index(drop=True, inplace=True)

assert df["datetime"].is_monotonic_increasing

df["pluie_oui_non"] = (df["precipitation"] > 0.1).astype(int)

print(df.head())

             datetime  temperature  humidite  pression  vent  precipitation  \
0 2024-11-10 00:00:00         10.9        97    1025.9   4.7            0.0   
1 2024-11-10 01:00:00         11.2        98    1025.9   4.2            0.0   
2 2024-11-10 02:00:00         11.3        98    1025.8   3.9            0.0   
3 2024-11-10 03:00:00         10.8        99    1025.7   3.5            0.0   
4 2024-11-10 04:00:00         10.2        99    1025.9   5.8            0.0   

   pluie_oui_non  
0              0  
1              0  
2              0  
3              0  
4              0  


In [4]:
import sqlite3

conn = sqlite3.connect("meteo_bretagne.db")

df.to_sql("donnees_meteo", conn, if_exists="replace", index=False)

check = pd.read_sql("SELECT * FROM donnees_meteo LIMIT 5;", conn)
print(check)

conn.close()

              datetime  temperature  humidite  pression  vent  precipitation  \
0  2024-11-10 00:00:00         10.9        97    1025.9   4.7            0.0   
1  2024-11-10 01:00:00         11.2        98    1025.9   4.2            0.0   
2  2024-11-10 02:00:00         11.3        98    1025.8   3.9            0.0   
3  2024-11-10 03:00:00         10.8        99    1025.7   3.5            0.0   
4  2024-11-10 04:00:00         10.2        99    1025.9   5.8            0.0   

   pluie_oui_non  
0              0  
1              0  
2              0  
3              0  
4              0  


In [5]:
conn = sqlite3.connect("meteo_bretagne.db")

df_check = pd.read_sql("SELECT * FROM donnees_meteo LIMIT 5;", conn)
print(df_check.info())

na_ratio = pd.read_sql("SELECT COUNT(*) - COUNT(temperature) AS missing_temp FROM donnees_meteo;", conn)
print(na_ratio)

target_count = pd.read_sql("SELECT pluie_oui_non, COUNT(*) as n FROM donnees_meteo GROUP BY pluie_oui_non;", conn)
print(target_count)

conn.close()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   datetime       5 non-null      object 
 1   temperature    5 non-null      float64
 2   humidite       5 non-null      int64  
 3   pression       5 non-null      float64
 4   vent           5 non-null      float64
 5   precipitation  5 non-null      float64
 6   pluie_oui_non  5 non-null      int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 412.0+ bytes
None
   missing_temp
0             0
   pluie_oui_non     n
0              0  7940
1              1   844


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df[["temperature", "humidite", "pression", "vent"]]
y = df["pluie_oui_non"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Modélisation (Entraînement & Évaluation)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

model = LogisticRegression(max_iter=1000, class_weight="balanced")
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1249  339]
 [  38  131]]
              precision    recall  f1-score   support

           0       0.97      0.79      0.87      1588
           1       0.28      0.78      0.41       169

    accuracy                           0.79      1757
   macro avg       0.62      0.78      0.64      1757
weighted avg       0.90      0.79      0.82      1757



In [8]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight="balanced",
    random_state=42
)
model_rf.fit(X_train_scaled, y_train)
y_pred_rf = model_rf.predict(X_test_scaled)

print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

[[1484  104]
 [  71   98]]
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      1588
           1       0.49      0.58      0.53       169

    accuracy                           0.90      1757
   macro avg       0.72      0.76      0.74      1757
weighted avg       0.91      0.90      0.90      1757



In [9]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)

model_smote = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42
)
model_smote.fit(X_train_res, y_train_res)

y_pred_sm = model_smote.predict(X_test_scaled)
print(confusion_matrix(y_test, y_pred_sm))
print(classification_report(y_test, y_pred_sm))

[[1417  171]
 [  51  118]]
              precision    recall  f1-score   support

           0       0.97      0.89      0.93      1588
           1       0.41      0.70      0.52       169

    accuracy                           0.87      1757
   macro avg       0.69      0.80      0.72      1757
weighted avg       0.91      0.87      0.89      1757



In [10]:
y_proba = model_smote.predict_proba(X_test_scaled)[:,1]
threshold = 0.47
y_pred_adj = (y_proba >= threshold).astype(int)

print(confusion_matrix(y_test, y_pred_adj))
print(classification_report(y_test, y_pred_adj))

[[1387  201]
 [  43  126]]
              precision    recall  f1-score   support

           0       0.97      0.87      0.92      1588
           1       0.39      0.75      0.51       169

    accuracy                           0.86      1757
   macro avg       0.68      0.81      0.71      1757
weighted avg       0.91      0.86      0.88      1757



In [11]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.ensemble import HistGradientBoostingClassifier

param_dist = {
    "max_depth": randint(3, 10),
    "learning_rate": uniform(0.01, 0.2),
    "max_iter": randint(200, 800),
    "min_samples_leaf": randint(5, 20)
}

search = RandomizedSearchCV(
    HistGradientBoostingClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=20,
    scoring="f1",
    cv=3,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train_res, y_train_res)
print("Best params:", search.best_params_)

Best params: {'learning_rate': np.float64(0.19437484700462337), 'max_depth': 8, 'max_iter': 451, 'min_samples_leaf': 13}


In [12]:
model_boost = HistGradientBoostingClassifier(max_depth = 9, learning_rate = 0.1468466053024314, max_iter=654, min_samples_leaf = 16)
model_boost.fit(X_train_res, y_train_res)

y_pred_boost = model_boost.predict(X_test_scaled)
print(confusion_matrix(y_test, y_pred_boost))
print(classification_report(y_test, y_pred_boost))

[[1491   97]
 [  71   98]]
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      1588
           1       0.50      0.58      0.54       169

    accuracy                           0.90      1757
   macro avg       0.73      0.76      0.74      1757
weighted avg       0.91      0.90      0.91      1757



In [13]:
from sklearn.ensemble import StackingClassifier

stack_model = StackingClassifier(
    estimators=[
        ("rf", RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),
        ("boost", HistGradientBoostingClassifier(max_depth=6, learning_rate=0.1, max_iter=300))
    ],
    final_estimator=LogisticRegression(max_iter=1000),
    n_jobs=-1
)

stack_model.fit(X_train_res, y_train_res)
y_pred_stack = stack_model.predict(X_test_scaled)

print(confusion_matrix(y_test, y_pred_stack))
print(classification_report(y_test, y_pred_stack))

[[1473  115]
 [  72   97]]
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      1588
           1       0.46      0.57      0.51       169

    accuracy                           0.89      1757
   macro avg       0.71      0.75      0.72      1757
weighted avg       0.91      0.89      0.90      1757



In [None]:
from sklearn.ensemble import VotingClassifier

voting_model = VotingClassifier(
    estimators=[
        ("rf", RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),
        ("boost", HistGradientBoostingClassifier(max_depth=6, learning_rate=0.1, max_iter=300)),
        ("logreg", LogisticRegression(max_iter=1000))
    ],
    voting="soft",  
    n_jobs=-1
)
    
voting_model.fit(X_train_res, y_train_res)
y_pred_vote = voting_model.predict(X_test_scaled)

print(confusion_matrix(y_test, y_pred_vote))
print(classification_report(y_test, y_pred_vote))

[[1432  156]
 [  52  117]]
              precision    recall  f1-score   support

           0       0.96      0.90      0.93      1588
           1       0.43      0.69      0.53       169

    accuracy                           0.88      1757
   macro avg       0.70      0.80      0.73      1757
weighted avg       0.91      0.88      0.89      1757



Sauvegarde du modèle & scaler (PKL)

In [15]:
import joblib
joblib.dump(scaler, "scaler.pkl")
joblib.dump(voting_model, "voting_model.pkl")

['voting_model.pkl']