In [1]:
import pandas as pd
import numpy as np
np.random.seed(42)
import matplotlib.pyplot as plt
from typing import Callable

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [62]:
df_test = pd.read_csv('data/raw/triathlon_watch_test_data_final.csv')
df_training = pd.read_csv('data/raw/triathlon_watch_training_data_final.csv')

In [63]:
df_training_baseline_model = df_training.dropna(subset=['User of latest model'])

In [64]:
df_training_baseline_model = df_training_baseline_model.drop_duplicates(subset='ID')
df_training_baseline_model.set_index("ID", inplace=True)

In [65]:
df_training_baseline_model['Most current software update'] = pd.to_datetime(df_training_baseline_model['Most current software update'])

# Berechne die Differenz in Tagen zum aktuellen Datum
df_training_baseline_model['Days since Update'] = (pd.Timestamp.today() - df_training_baseline_model['Most current software update']).dt.days

# Entferne die ursprüngliche Datums-Spalte
df_training_baseline_model.drop(columns=['Most current software update'], inplace=True)

# Überprüfe die Umwandlung
print(df_training_baseline_model[['Days since Update']].head())

       Days since Update
ID                      
C0001              197.0
C0003              269.0
C0004              172.0
C0005              177.0
C0006               78.0


In [66]:
df_training_baseline_model

Unnamed: 0_level_0,Age of customer,Sex,Ctry,Town,Swimming Hours per Week,Biking Hours per Week,Running Hours per Week,Total Training Hours per Week,VO2 Max,10k Running Time Prediction,Calories Burned per Week,Support Cases of Customer,Customer Years,Goal of Training,Preferred Training Daytime,Subscription Type,Color of Watch,Synchronisation,User of latest model,Days since Update
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
C0001,61.0,Other,UK,Birmingham,2.52,0.10,1.78,4.40,23.08,60.72,2329.95,2.0,1.0,Fitness,Evening,Free,White,Yes,1.0,197.0
C0003,57.0,Other,Australia,Sydney,1.55,10.01,4.57,16.13,39.04,54.37,7904.93,2.0,1.0,Fitness,Evening,Premium,Black,Yes,0.0,269.0
C0004,30.0,Other,India,Bangalore,1.19,12.04,8.64,21.87,71.59,33.92,10839.81,2.0,0.0,Competition,Afternoon,Free,White,Yes,1.0,172.0
C0005,21.0,Male,Germany,Munich,2.25,4.67,0.92,7.84,49.09,44.97,,3.0,0.0,Recreation,Evening,Premium,Black,No,1.0,177.0
C0006,63.0,Male,India,Mumbai,0.80,5.88,0.67,7.35,13.91,62.46,3575.96,3.0,3.0,Recreation,Morning,Basic,Black,Yes,0.0,78.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0996,28.0,Female,Australia,Brisbane,2.38,0.28,2.79,5.46,61.05,40.38,2589.77,0.0,3.0,Recreation,Morning,Premium,Black,Yes,0.0,392.0
C0997,54.0,Female,Australia,Melbourne,1.80,4.45,1.35,7.61,34.50,56.38,4104.15,0.0,3.0,Recreation,Afternoon,Free,Black,Yes,0.0,87.0
C0998,35.0,Other,USA,New York,3.91,0.57,5.57,10.05,58.63,37.27,5078.45,4.0,6.0,Fitness,Afternoon,Premium,Black,Yes,1.0,161.0
C0999,31.0,Other,Germayn,Berlin,0.36,4.63,5.16,10.15,66.46,37.98,5184.27,3.0,3.0,Fitness,Evening,Free,Black,,1.0,230.0


In [61]:
import numpy as np
import pandas as pd

# Ersetze "Germayn" durch "Germany" in der Spalte `Ctry`
df_training_baseline_model['Ctry'] = df_training_baseline_model['Ctry'].replace('Germayn', 'Germany')

# Ersetze "UnknownLand" durch np.nan (fehlender Wert)
df_training_baseline_model['Ctry'] = df_training_baseline_model['Ctry'].replace('UnknownLand', np.nan)

# Ersetze -1 in "Age of customer" durch np.nan
df_training_baseline_model['Age of customer'] = df_training_baseline_model['Age of customer'].replace(-1, np.nan)

df_training_baseline_model['Most current software update'] = pd.to_datetime(df_training_baseline_model['Most current software update'])
df_training_baseline_model['date_numeric'] = (df_training_baseline_model['Most current software update'] - pd.Timestamp("2000-01-01")).dt.days
df_training_baseline_model.drop(columns=['Most current software update'], inplace=True)  # Original-Datumsspalte entfernen


# One-Hot-Encoding für kategoriale Variablen
df_training_baseline_model = pd.get_dummies(
    df_training_baseline_model, 
    columns=['Sex', 'Ctry', 'Town', 'Subscription Type', "Goal of Training", "Preferred Training Daytime", "Color of Watch", 'Synchronisation'], 
    drop_first=True
)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Definiere die Features (X) und die Zielvariable (y)
X = df_training_baseline_model.drop('User of latest model', axis=1)
y = df_training_baseline_model['User of latest model']

# Teile die Daten in Trainings- und Testdaten auf
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialisiere das Modell
model = LogisticRegression()


# Trainiere das Modell
model.fit(X_train.fillna(0), y_train)

# Mache Vorhersagen auf den Testdaten
y_pred = model.predict(X_test.fillna(0))

# Evaluierung des Modells
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7538461538461538
Confusion Matrix:
 [[93 17]
 [31 54]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.75      0.85      0.79       110
         1.0       0.76      0.64      0.69        85

    accuracy                           0.75       195
   macro avg       0.76      0.74      0.74       195
weighted avg       0.75      0.75      0.75       195



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
# Ersetze "Germayn" durch "Germany" in der Spalte `Ctry`
df_training_baseline_model['Ctry'] = df_training_baseline_model['Ctry'].replace('Germayn', 'Germany')
# Ersetze "UnknownLand" durch NaN (fehlender Wert)
df_training_baseline_model['Ctry'] = df_training_baseline_model['Ctry'].replace('UnknownLand', pd.NA)
# Ersetze -1 in "Age of customer" durch NaN
df_training_baseline_model['Age of customer'] = df_training_baseline_model['Age of customer'].replace(-1, pd.NA)

df_training_baseline_model = pd.get_dummies(df_training_baseline_model, columns=['Ctry', 'Sex', 'Town', 'Preferred Training Daytime', 'Subscription Type', 'Color of Watch', 'Synchronisation'], drop_first=True)

from sklearn.model_selection import train_test_split

# Definiere die Features (X) und die Zielvariable (y)
X = df_training_baseline_model.drop('User of latest model', axis=1)
y = df_training_baseline_model['User of latest model']

# Teile die Daten in Trainings- und Testdaten auf
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Initialisiere das Modell
model = LogisticRegression()

# Trainiere das Modell
model.fit(X_train, y_train)

# Mache Vorhersagen auf den Testdaten
y_pred = model.predict(X_test)

# Evaluierung des Modells
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

TypeError: float() argument must be a string or a real number, not 'NAType'

In [None]:
def impute_synchronisation(df):
    """
    Füllt fehlende Werte in 'Synchronisation' mit Regression Imputation (Logistische Regression).
    
    Verwendet:
    - 'VO2 Max' als Prädiktor für 'Synchronisation'
    
    Args:
        df (pd.DataFrame): DataFrame mit fehlenden Werten in 'Synchronisation'.
    
    Returns:
        pd.DataFrame: DataFrame mit imputierten Werten für 'Synchronisation'.
    """
    df = df.copy()  # Verhindert inplace-Änderungen
    
    # Überprüfen, ob 'Synchronisation' eine kategoriale Variable ist
    if df['Synchronisation'].dtype != 'object':
        df['Synchronisation'] = df['Synchronisation'].astype(str)

    # Label-Encoding für 'Synchronisation' (kategoriale Variable)
    le = LabelEncoder()
    df['Synchronisation Encoded'] = le.fit_transform(df['Synchronisation'])

    # Aufteilung: Trainingsdaten (ohne NaN) und fehlende Werte
    train_data = df.dropna(subset=['Synchronisation Encoded'])  # Nur Zeilen ohne NaN
    missing_data = df[df['Synchronisation Encoded'].isna()]  # Zeilen mit NaN

    if missing_data.empty:
        return df  # Falls keine fehlenden Werte, nichts tun

    # Logistische Regression für kategoriale Zielvariable
    imputer = IterativeImputer(estimator=LogisticRegression(max_iter=1000, random_state=42), max_iter=10, random_state=42)

    # Nur relevante Spalten für Imputation
    imputed_values = imputer.fit_transform(train_data[['Synchronisation Encoded', 'VO2 Max']])

    # Setze die imputierten Werte zurück in das ursprüngliche DataFrame
    df.loc[missing_data.index, 'Synchronisation Encoded'] = imputed_values[:, 0].round().astype(int)

    # Rückumwandlung in ursprüngliche Kategorien
    df['Synchronisation'] = le.inverse_transform(df['Synchronisation Encoded'].astype(int))

    # Entferne die temporäre numerische Spalte
    df = df.drop(columns=['Synchronisation Encoded'])

    return df