In [2]:
!pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp311-cp311-win_amd64.whl.metadata (4.3 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.11.0-cp311-cp311-win_amd64.whl.metadata (681 bytes)
Downloading fastparquet-2024.11.0-cp311-cp311-win_amd64.whl (671 kB)
   ---------------------------------------- 0.0/671.0 kB ? eta -:--:--
   ---------------------------------------- 0.0/671.0 kB ? eta -:--:--
   ---------------------------------------- 0.0/671.0 kB ? eta -:--:--
   --------------- ------------------------ 262.1/671.0 kB ? eta -:--:--
   ---------------------------------------- 671.0/671.0 kB 1.9 MB/s  0:00:00
Downloading cramjam-2.11.0-cp311-cp311-win_amd64.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------------------ --------------------- 0.8/1.7 MB 3.4 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 4.6 MB/s  0:00:00
Installing collected packages: cramjam, fastparquet

   ----------

In [4]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-22.0.0-cp311-cp311-win_amd64.whl.metadata (3.3 kB)
Downloading pyarrow-22.0.0-cp311-cp311-win_amd64.whl (28.1 MB)
   ---------------------------------------- 0.0/28.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/28.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/28.1 MB ? eta -:--:--
   ---------------------------------------- 0.3/28.1 MB ? eta -:--:--
   - -------------------------------------- 0.8/28.1 MB 2.2 MB/s eta 0:00:13
   -- ------------------------------------- 2.1/28.1 MB 3.7 MB/s eta 0:00:08
   ----- ---------------------------------- 3.7/28.1 MB 5.1 MB/s eta 0:00:05
   -------- ------------------------------- 5.8/28.1 MB 6.3 MB/s eta 0:00:04
   ---------- ----------------------------- 7.3/28.1 MB 6.2 MB/s eta 0:00:04
   ------------- -------------------------- 9.7/28.1 MB 7.0 MB/s eta 0:00:03
   ----------------- ---------------------- 12.3/28.1 MB 7.7 MB/s eta 0:00:03
   --------------------- 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pathlib import Path
import sys
from fastparquet import ParquetFile
import pyarrow
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [3]:
project_root = Path.cwd().parent.parent
sys.path.append(str(project_root))
from src.ram import low_ram

In [4]:
def add_features(df):
    features = [col for col in df.columns if col not in ['id', 'FloodProbability']]
    df['sum_risk'] = df[features].sum(axis=1)
    df['mean_risk'] = df[features].mean(axis=1)
    df['std_risk'] = df[features].std(axis=1)
    
    return df

In [5]:
raw_data_dir = Path("../../data/raw")
train_df = pd.read_csv(raw_data_dir / "train.csv")
test_df = pd.read_csv(raw_data_dir / "test.csv")

In [6]:
train_df = add_features(train_df)
test_df = add_features(test_df)

In [7]:
train_df = low_ram(train_df, target_format='parquet')
test_df = low_ram(test_df, target_format='parquet')
print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1117957 entries, 0 to 1117956
Data columns (total 25 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   id                               1117957 non-null  int16  
 1   MonsoonIntensity                 1117957 non-null  int16  
 2   TopographyDrainage               1117957 non-null  int16  
 3   RiverManagement                  1117957 non-null  int16  
 4   Deforestation                    1117957 non-null  int16  
 5   Urbanization                     1117957 non-null  int16  
 6   ClimateChange                    1117957 non-null  int16  
 7   DamsQuality                      1117957 non-null  int16  
 8   Siltation                        1117957 non-null  int16  
 9   AgriculturalPractices            1117957 non-null  int16  
 10  Encroachments                    1117957 non-null  int16  
 11  IneffectiveDisasterPreparedness  1117957 non-null 

In [8]:
X = train_df.drop(['id', 'FloodProbability'], axis=1)
y = train_df['FloodProbability']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.2,
    random_state=69,
    shuffle=True
)

In [9]:
X_test = test_df.drop('id', axis=1)
test_ids = test_df['id']
y_class_train = ((y_train > 0.54) & (X_train['sum_risk'] > 104)).astype(int)
print(f"Экстремальных случаев для LogReg в X_train: {y_class_train.sum()} из {len(y_class_train)}")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
log_reg = LogisticRegression(random_state=42, solver='saga', max_iter=1000, n_jobs=-1)
log_reg.fit(X_train_scaled, y_class_train)
train_meta = log_reg.predict_proba(X_train_scaled)[:, 1]
val_meta = log_reg.predict_proba(X_val_scaled)[:, 1]
test_meta = log_reg.predict_proba(X_test_scaled)[:, 1]
X_train['extreme_risk_feature'] = train_meta
X_val['extreme_risk_feature'] = val_meta
X_test['extreme_risk_feature'] = test_meta


Экстремальных случаев для LogReg в X_train: 177555 из 894365


In [10]:
output_dir = Path("../../data/processed") 
output_dir.mkdir(parents=True, exist_ok=True)

X_train.to_parquet(output_dir / "X_train.parquet", index=False)
X_val.to_parquet(output_dir / "X_val.parquet", index=False)
y_train.to_frame().to_parquet(output_dir / "y_train.parquet", index=False)
y_val.to_frame().to_parquet(output_dir / "y_val.parquet", index=False)
X_test.to_parquet(output_dir / "X_test.parquet", index=False)
pd.DataFrame({'id': test_ids}).to_parquet(output_dir / "test_ids.parquet", index=False)

print("ОК")

ОК
