In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE

In [2]:
df_train=pd.read_csv('../data/output/development_sample_cleaned_1.csv')
df_test=pd.read_csv('../data/output/testing_sample_cleaned.csv')

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35700 entries, 0 to 35699
Data columns (total 46 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   working_months  35700 non-null  float64
 1   ID              35700 non-null  int64  
 2   customer_id     35700 non-null  int64  
 3   _r_             35700 non-null  float64
 4   Var1            35700 non-null  int64  
 5   Var4            35700 non-null  int64  
 6   Var5            35700 non-null  int64  
 7   Var6            35700 non-null  int64  
 8   Var7            35700 non-null  float64
 9   Var15           35700 non-null  int64  
 10  Var16           35700 non-null  int64  
 11  Var17           35700 non-null  float64
 12  Var20           35700 non-null  int64  
 13  Var21           35700 non-null  int64  
 14  Var22           35700 non-null  int64  
 15  Var23           35700 non-null  int64  
 16  Var24           35700 non-null  int64  
 17  Var25           35700 non-null 

In [4]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3546 entries, 0 to 3545
Data columns (total 46 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   working_months  3546 non-null   float64
 1   ID              3546 non-null   int64  
 2   customer_id     3546 non-null   int64  
 3   _r_             3546 non-null   float64
 4   Var1            3546 non-null   int64  
 5   Var4            3546 non-null   int64  
 6   Var5            3546 non-null   int64  
 7   Var6            3546 non-null   int64  
 8   Var7            3546 non-null   float64
 9   Var15           3546 non-null   int64  
 10  Var16           3546 non-null   int64  
 11  Var17           3546 non-null   float64
 12  Var20           3546 non-null   int64  
 13  Var21           3546 non-null   int64  
 14  Var22           3546 non-null   int64  
 15  Var23           3546 non-null   int64  
 16  Var24           3546 non-null   int64  
 17  Var25           3546 non-null   f

In [5]:
# Istnieje teoretycznie 2^45 mozliwych modeli z roznymi parametrami ktore mozemy uzyskac z tego zestawu danych
# A więc istnieje 35184372088832
# Jest to 35 trylionów możliwych modeli 
2**45
# Przy zalozeniu ze wykorzystalbym wszystkie moce swojego kompa 6-rdzeniowemu komputerowi z maksymalnym taktowaniem 3.6 GHz i 16 GB RAM o szybkości 2667 MHz 
# 2^45sekund=35,184,372,088,832 sekund
# to jest około 185 lat XDDDDD


35184372088832

In [6]:
def prepare_balanced_data(df, ratio):
    df_majority = df[df.target == 0]
    df_minority = df[df.target == 1]
    n_samples = int(len(df_minority) * ratio) if ratio <= len(df_majority) / len(df_minority) else len(df_majority)
    df_majority_downsampled = resample(df_majority, replace=False, n_samples=n_samples, random_state=123)
    return pd.concat([df_majority_downsampled, df_minority])

# Funkcja do trenowania modelu i obliczania metryk
def train_evaluate_model(X_train, y_train, X_test, y_test):
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred)
    }

In [7]:
'''''''''


scalers = {"standard": StandardScaler(), "min_max": MinMaxScaler(), "robust": RobustScaler(), "none": None}
best_models = {}

# Iterowanie przez wszystkie skalery i różne proporcje downsamplingu i podziału zbioru treningowego
for scaler_name, scaler in scalers.items():
    best_score = 0
    for downsample_ratio in [0.5, 1, 2, 3]:
        df_train_balanced = prepare_balanced_data(df_train, downsample_ratio)
        for split_ratio in [0.2, 0.3, 0.4, 0.5, 0.6, 0.7]:
            X = df_train_balanced.drop('target', axis=1)
            y = df_train_balanced['target']
            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=split_ratio, random_state=42)

            # Skalowanie danych, jeśli scaler jest zdefiniowany
            if scaler:
                X_train_scaled = scaler.fit_transform(X_train)
                X_val_scaled = scaler.transform(X_val)
                X_test_scaled = scaler.transform(df_test.drop('target', axis=1))
            else:
                X_train_scaled = X_train
                X_val_scaled = X_val
                X_test_scaled = df_test.drop('target', axis=1)

            # Trenowanie i ewaluacja modelu na zbiorze walidacyjnym
            metrics = train_evaluate_model(X_train_scaled, y_train, X_val_scaled, y_val)
            if metrics['f1_score'] > best_score:
                best_score = metrics['f1_score']
                best_models[scaler_name] = (LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train), scaler, downsample_ratio, split_ratio)

# Dla każdego skalera, trenowanie najlepszego modelu na pełnym zbiorze treningowym i ewaluacja na zbiorze testowym
final_results = {}
for scaler_name, (model, scaler, downsample_ratio, split_ratio) in best_models.items():
    if scaler:
        X_full_train_scaled = scaler.transform(df_train.drop('target', axis=1))
    else:
        X_full_train_scaled = df_train.drop('target', axis=1)
    y_full_train = df_train['target']
    X_test_scaled = scaler.transform(df_test.drop('target', axis=1)) if scaler else df_test.drop('target', axis=1)
    y_test = df_test['target']

    final_metrics = train_evaluate_model(X_full_train_scaled, y_full_train, X_test_scaled, y_test)
    final_results[scaler_name] = final_metrics

final_results


SyntaxError: unterminated string literal (detected at line 48) (1690481538.py, line 48)

In [11]:
scalers = {"standard": StandardScaler(), "min_max": MinMaxScaler(), "robust": RobustScaler(), "none": None}
best_models = {}

# Iterowanie przez wszystkie skalery i różne proporcje downsamplingu i podziału zbioru treningowego
for scaler_name, scaler in scalers.items():
    best_score = 0
    for downsample_ratio in [0.5, 1, 2, 3]:
        df_train_balanced = prepare_balanced_data(df_train, downsample_ratio)
        for split_ratio in [0.2, 0.3, 0.4, 0.5, 0.6, 0.7]:
            X = df_train_balanced.drop('target', axis=1)
            y = df_train_balanced['target']
            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=split_ratio, random_state=42)
            
            # Zdefiniowanie y_test tutaj
            y_test = df_test['target']

            # Skalowanie danych, jeśli scaler jest zdefiniowany
            if scaler:
                X_train_scaled = scaler.fit_transform(X_train)
                X_val_scaled = scaler.transform(X_val)
                X_test_scaled = scaler.transform(df_test.drop('target', axis=1))
            else:
                X_train_scaled = X_train
                X_val_scaled = X_val
                X_test_scaled = df_test.drop('target', axis=1)

            # Wybór najlepszej liczby cech przy użyciu RFE
            model = LogisticRegression(max_iter=1000)
            best_n_features = 0
            best_rfe_score = 0
            for n_features_to_select in range(1, X_train_scaled.shape[1] + 1):
                rfe = RFE(estimator=model, n_features_to_select=n_features_to_select)
                rfe.fit(X_train_scaled, y_train)
                X_train_rfe = rfe.transform(X_train_scaled)
                X_val_rfe = rfe.transform(X_val_scaled)

                # Trenowanie i ewaluacja modelu na danych po RFE
                model.fit(X_train_rfe, y_train)
                y_pred = model.predict(X_val_rfe)
                rfe_score = f1_score(y_val, y_pred)  # Wybierz odpowiednią metrykę
                if rfe_score > best_rfe_score:
                    best_rfe_score = rfe_score
                    best_n_features = n_features_to_select

            # Trenowanie modelu na pełnym zbiorze treningowym z wybraną liczbą cech
            rfe = RFE(estimator=model, n_features_to_select=best_n_features)
            rfe.fit(X_train_scaled, y_train)
            X_train_rfe = rfe.transform(X_train_scaled)
            X_test_rfe = rfe.transform(X_test_scaled)
            model.fit(X_train_rfe, y_train)

            # Ewaluacja modelu na zbiorze testowym
            y_test_pred = model.predict(X_test_rfe)
            test_score = f1_score(y_test, y_test_pred)  # Możesz użyć innej metryki, jeśli chcesz

            # Zapisanie najlepszego modelu dla danego skalera
            if test_score > best_score:
                best_score = test_score
                best_models[scaler_name] = (model, scaler, downsample_ratio, split_ratio)

# Trenowanie najlepszych modeli na pełnych danych treningowych i ocena na zbiorze testowym
final_results = {}
for scaler_name, (model, scaler, downsample_ratio, split_ratio) in best_models.items():
    if scaler:
        X_full_train_scaled = scaler.transform(df_train.drop('target', axis=1))
        X_test_scaled = scaler.transform(df_test.drop('target', axis=1))
    else:
        X_full_train_scaled = df_train.drop('target', axis=1)
        X_test_scaled = df_test.drop('target', axis=1)

    y_full_train = df_train['target']
    y_test = df_test['target']

    model.fit(X_full_train_scaled, y_full_train)
    y_test_pred = model.predict(X_test_scaled)

    final_metrics = {
        'accuracy': accuracy_score(y_test, y_test_pred),
        'precision': precision_score(y_test, y_test_pred, zero_division=0),
        'recall': recall_score(y_test, y_test_pred),
        'f1_score': f1_score(y_test, y_test_pred)
    }

    final_results[scaler_name] = final_metrics

final_results



{'standard': {'accuracy': 0.9664410603496898,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0},
 'min_max': {'accuracy': 0.9664410603496898,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0},
 'robust': {'accuracy': 0.03355893965031021,
  'precision': 0.03355893965031021,
  'recall': 1.0,
  'f1_score': 0.06493860845839018},
 'none': {'accuracy': 0.03355893965031021,
  'precision': 0.03355893965031021,
  'recall': 1.0,
  'f1_score': 0.06493860845839018}}