In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

In [47]:
# 1. Loading CSV files
diabetes_data = pd.read_csv('diabetes.csv')
activity_data = pd.read_csv('diabetes_dataset00.csv')

In [48]:
# 2. Selecting the appropriate columns
diabetes_features = ['Glucose', 'Insulin']
activity_features = ['BMI', 'Physical Activity', 'Genetic Markers', 'Insulin Levels', 'Age', 'Dietary Habits']


In [49]:
# List of columns to remove
columns_to_drop = [
    'Target', 'Autoantibodies', 'Family History', 'Environmental Factors', 
    'Ethnicity', 'Socioeconomic Factors', 'Smoking Status', 
    'Alcohol Consumption', 'History of PCOS', 'Previous Gestational Diabetes', 
    'Pregnancy History', 'Cystic Fibrosis Diagnosis', 'Steroid Use History', 
    'Genetic Testing', 'Liver Function Tests', 'Urine Test', 'Birth Weight', 'Early Onset Symptoms'
]

# Removing the listed columns
activity_data = activity_data.drop(columns=columns_to_drop)

In [50]:
# Checking if there are any missing values
print(activity_data.isnull().sum())
print(diabetes_data.isnull().sum())

# Optional: removing rows with missing values
activity_data = activity_data.dropna()
diabetes_data = diabetes_data.dropna()

Genetic Markers                 0
Insulin Levels                  0
Age                             0
BMI                             0
Physical Activity               0
Dietary Habits                  0
Blood Pressure                  0
Cholesterol Levels              0
Waist Circumference             0
Blood Glucose Levels            0
Glucose Tolerance Test          0
Weight Gain During Pregnancy    0
Pancreatic Health               0
Pulmonary Function              0
Neurological Assessments        0
Digestive Enzyme Levels         0
dtype: int64
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [54]:
pd.set_option('future.no_silent_downcasting', True)

# Changing data to numeric in selected columns:
activity_data[columns_to_replace] = activity_data[columns_to_replace].replace(
    {'Positive': 1, 'Negative': 0, 'High': 2, 'Moderate': 1, 'Low': 0, 'Healthy': 1, 'Unhealthy': 0, 'Normal': 1, 'Abnormal': 0}
)


In [55]:
# 3. Preparing data
sample_size = min(len(diabetes_data), len(activity_data), 1000)
diabetes_sample = diabetes_data[diabetes_features].sample(n=sample_size, random_state=42)
activity_sample = activity_data[activity_features].sample(n=sample_size, random_state=42)

# Merging data
merged_data = pd.concat([diabetes_sample.reset_index(drop=True), 
                         activity_sample.reset_index(drop=True)], axis=1)


In [57]:
X = merged_data[features]
y = merged_data[target]

# Data diagnostics
print("Typy danych w zbiorze X:")
print(X.dtypes)
print("\nPróbka danych X:")
print(X.head())
print("\nLiczba unikalnych wartości w każdej kolumnie:")
print(X.nunique())
print("\nOpis statystyczny danych:")
print(X.describe())
print("\nSprawdzenie brakujących wartości:")
print(X.isnull().sum())


KeyError: "['PhysicalActivity'] not in index"

In [58]:
# 5. Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


NameError: name 'X' is not defined

In [70]:
# 6. Data processing preparation
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), features)
    ])

In [71]:
# 7. Creating pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])


In [72]:
# 8. Model training
try:
    pipeline.fit(X_train, y_train)
    print("Model został pomyślnie wytrenowany.")
except Exception as e:
    print(f"Wystąpił błąd podczas trenowania modelu: {e}")


Wystąpił błąd podczas trenowania modelu: could not convert string to float: 'Yes'


In [73]:
# 9. Testing and Evaluating the Model
if hasattr(pipeline, 'predict'):
    try:
        y_pred = pipeline.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f"\nMean Squared Error: {mse}")
        print(f"R2 Score: {r2}")
    except Exception as e:
        print(f"An error occurred while testing the model: {e}")

# Insulin dose recommendation function
def recommend_insulin_dose(glucose, physical_activity, bmi):
    input_data = pd.DataFrame([[glucose, physical_activity, bmi]], 
                              columns=['Glucose', 'PhysicalActivity', 'BMI'])
    input_data = input_data.astype(float)  # Conversion to float
    try:
        recommended_dose = pipeline.predict(input_data)[0]
        return recommended_dose
    except Exception as e:
        print(f"An error occurred while recommending the insulin dose: {e}")
        return None

Wystąpił błąd podczas testowania modelu: 'ColumnTransformer' object has no attribute 'transformers_'


In [49]:
# Usage example (if model has been trained)
if 'pipeline' in locals() and hasattr(pipeline, 'predict'):
    sample_glucose = 120
    sample_physical_activity = 1
    sample_bmi = 25
    recommended_dose = recommend_insulin_dose(sample_glucose, sample_physical_activity, sample_bmi)
    if recommended_dose is not None:
        print(f"\nRecommended insulin dose: {recommended_dose:.2f} units.")

Wystąpił błąd podczas rekomendacji dawki insuliny: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''


Zapytać chatgpt czy nie lepiej zamienić dane na numeryczne 