In [1]:
import pandas as pd
import numpy as np
import io
from google.colab import files
from sklearn.preprocessing import MinMaxScaler

# Step 1: Upload the file to Google Colab
print("Please upload your 'Diabetes Risk Data Collection - Form Responses 1.csv' file.")
uploaded = files.upload()

# Check if a file was uploaded
if not uploaded:
    print("\nNo file uploaded. Please run the cell again and select your file.")
else:
    # Only the first uploaded file is processed; any additional files in the
    # same upload are ignored.
    file_name = next(iter(uploaded))
    print(f"\nUploaded file '{file_name}' successfully.")

    # Step 2: Load the data into a pandas DataFrame
    # The uploaded content is held in memory, so wrap it in a file-like object.
    df = pd.read_csv(io.BytesIO(uploaded[file_name]))

    # --- Step 3: Preprocess the Data ---

    # Standardize column names for easier access.
    # The rename is positional: the CSV must contain exactly these columns,
    # in this order, or the labels below would be attached to the wrong data.
    expected_columns = [
        'Timestamp', 'Age', 'Gender', 'Height', 'Weight',
        'Physical_Activity', 'Smokes', 'Alcohol',
        'Family_History_Diabetes', 'Previously_Diagnosed_Diabetes'
    ]
    if len(df.columns) != len(expected_columns):
        # Fail early with an actionable message instead of pandas' generic
        # length-mismatch error (same exception type: ValueError).
        raise ValueError(
            f"Expected {len(expected_columns)} columns in '{file_name}' but found "
            f"{len(df.columns)}: {list(df.columns)}. "
            "Please upload the original form-responses export."
        )
    df.columns = expected_columns

    # --- Data Cleaning and Type Conversion ---

    # Function to clean and convert height to meters
    def clean_height(h):
        """Parse a free-text height answer and return it in meters.

        Recognised formats:
          * feet/inches written with straight or curly quotes (5'7", 5’7”, 5')
          * centimeters with a unit (170 cm)
          * a bare number: values up to 2.5 are read as meters, anything
            larger as centimeters

        Returns np.nan when the text cannot be parsed or when the resulting
        height is outside the plausible 0.5-2.5 m range, so that the caller
        can impute it instead of computing a nonsensical BMI.
        """
        try:
            h_str = str(h).lower().strip()
            # Normalise typographic quotes so 5’7” parses exactly like 5'7"
            for curly, straight in (("’", "'"), ("‘", "'"), ("”", '"'), ("“", '"')):
                h_str = h_str.replace(curly, straight)

            if "'" in h_str:
                parts = h_str.split("'")
                feet = float(parts[0])
                inches_str = parts[1].replace('"', '').replace(' ', '') if len(parts) > 1 else ''
                inches = float(inches_str) if inches_str else 0
                meters = (feet * 30.48 + inches * 2.54) / 100
            elif 'cm' in h_str:
                meters = float(h_str.replace('cm', '').strip()) / 100
            else:
                value = float(h_str)
                # No human is <= 2.5 cm tall, so such small values can only be meters
                meters = value if value <= 2.5 else value / 100
        except (ValueError, TypeError):
            return np.nan

        # Single plausibility gate for every branch (also rejects NaN, since
        # comparisons with NaN are False). The 2.5 m ceiling matches the
        # previous 250 cm limit.
        if not (0.5 <= meters <= 2.5):
            return np.nan
        return meters

    # Function to clean and convert weight to kg
    def clean_weight(w):
        """Parse a free-text weight answer and return it in kilograms.

        Exactly one number must be present in the text. Answers holding
        several numbers (ranges or alternatives such as '70-75' or '5 or 6')
        are ambiguous and yield np.nan rather than having their digits glued
        together into a single bogus value. Answers marked as pounds
        ('lb', 'lbs', 'pound') are converted to kilograms.

        Returns np.nan for unparseable text, non-positive values, and values
        above 300 kg.
        """
        import re  # local import keeps this helper self-contained

        try:
            w_str = str(w).lower().strip()
            numbers = re.findall(r'[0-9]+(?:\.[0-9]+)?', w_str)
            if len(numbers) != 1:
                return np.nan
            w_val = float(numbers[0])
            if 'lb' in w_str or 'pound' in w_str:
                w_val *= 0.45359237  # kilograms per avoirdupois pound
            if w_val <= 0 or w_val > 300:
                return np.nan
            return w_val
        except (ValueError, TypeError):
            return np.nan

    # Apply the cleaning functions
    df['Height_m'] = df['Height'].apply(clean_height)
    df['Weight_kg'] = df['Weight'].apply(clean_weight)

    # A non-positive height would make the BMI division below yield inf/NaN
    # (which the scaler later rejects), so treat it as missing and let the
    # median imputation fill it in.
    df.loc[df['Height_m'] <= 0, 'Height_m'] = np.nan

    # Clean Age column: non-numeric answers become NaN, as do impossible ages
    # (zero, negative, or above 120).
    df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
    df.loc[(df['Age'] <= 0) | (df['Age'] > 120), 'Age'] = np.nan

    # Impute missing values with the median
    df['Height_m'] = df['Height_m'].fillna(df['Height_m'].median())
    df['Weight_kg'] = df['Weight_kg'].fillna(df['Weight_kg'].median())
    df['Age'] = df['Age'].fillna(df['Age'].median())
    # Safe only after imputation: int dtype cannot hold NaN
    df['Age'] = df['Age'].astype(int)

    # Calculate BMI = weight (kg) / height (m) squared
    df['BMI'] = df['Weight_kg'] / (df['Height_m'] ** 2)
    df['BMI'] = df['BMI'].round(2)

    # --- Convert categorical columns to numeric ---
    # Normalise case and surrounding whitespace before mapping so answers
    # such as "male" or " Yes " are still encoded. Any other answer
    # (including a missing one) maps to NaN, exactly as before.
    df['Gender'] = (
        df['Gender'].astype(str).str.strip().str.lower()
        .map({'male': 1, 'female': 0})
    )
    yes_no_cols = [
        'Physical_Activity', 'Smokes', 'Alcohol',
        'Family_History_Diabetes', 'Previously_Diagnosed_Diabetes'
    ]
    for col in yes_no_cols:
        df[col] = (
            df[col].astype(str).str.strip().str.lower()
            .map({'yes': 1, 'no': 0})
        )

    # --- Finalize the DataFrame ---

    # Keep only the model-ready columns; the raw height and weight columns
    # are left behind in df.
    final_columns = [
        'Age', 'Gender', 'BMI', 'Physical_Activity', 'Smokes', 'Alcohol',
        'Family_History_Diabetes', 'Previously_Diagnosed_Diabetes'
    ]
    # Independent copy, so the scaling below never writes back into df
    processed_df = df.loc[:, final_columns].copy()

    # --- Step 4: Normalize Age and BMI ---
    # Min-max scaling rescales each of the two columns to the [0, 1] range.
    scaled_columns = ['Age', 'BMI']
    scaler = MinMaxScaler()
    processed_df[scaled_columns] = scaler.fit_transform(processed_df[scaled_columns])

    # Step 5: Show the first 10 rows of the preprocessed data
    print(
        "\n--- Preprocessed Data (First 10 Rows) ---",
        "(Note: Age and BMI are now normalized)",
        sep="\n",
    )
    preview = processed_df.head(10)
    print(preview)

    # --- Step 6: Download the processed file ---
    # Write the CSV to the Colab working directory, then hand it to the browser.
    output_filename = 'preprocessed_normalized_diabetes_data.csv'
    processed_df.to_csv(output_filename, index=False)
    print(f"\nPreparing '{output_filename}' for download...")
    files.download(output_filename)
    print(f"Download of '{output_filename}' has started.")

Please upload your 'Diabetes Risk Data Collection - Form Responses 1.csv' file.


Saving DataSet1.csv to DataSet1.csv

Uploaded file 'DataSet1.csv' successfully.

--- Preprocessed Data (First 10 Rows) ---
(Note: Age and BMI are now normalized)
        Age  Gender       BMI  Physical_Activity  Smokes  Alcohol  \
0  0.174603       1  0.362670                  1       1        0   
1  0.142857       1  0.261063                  1       0        0   
2  0.126984       1  0.288257                  0       0        0   
3  0.142857       0  0.373548                  0       0        0   
4  0.158730       1  0.180964                  1       0        0   
5  0.126984       0  0.128554                  0       0        0   
6  0.111111       0  0.214586                  0       0        0   
7  0.111111       0  0.193325                  0       0        0   
8  0.111111       0  0.224722                  0       0        0   
9  0.111111       0  0.185414                  0       0        0   

   Family_History_Diabetes  Previously_Diagnosed_Diabetes  
0                 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download of 'preprocessed_normalized_diabetes_data.csv' has started.
