In [1]:
## read txt file
from google.colab import drive
drive.mount('/content/drive/')
data_path = "/content/drive/MyDrive/Colab Notebooks/"  # this is your drive

Mounted at /content/drive/


In [2]:
import pandas as pd

df = pd.read_csv(data_path + "diabetes.csv")

# View basic information of data
print(df.info())
print(df.head())
print(df["Outcome"].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8     

In [3]:
# Check for missing values
print(df.isnull().sum())

# Fill in missing numerical values with median
df.fillna(df.median(), inplace=True)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [4]:
print(df.describe())

# Using Z-score to detect outliers
from scipy.stats import zscore
z_scores = zscore(df.select_dtypes(include=["float64", "int64"]))
abs_z_scores = abs(z_scores)
df = df[(abs_z_scores < 3).all(axis=1)]  # Remove outliers with Z-score greater than 3

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [5]:
print(df.describe())

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   688.000000  688.000000     688.000000     688.000000  688.000000   
mean      3.845930  120.293605      72.345930      20.787791   72.507267   
std       3.279256   30.005790      12.312859      15.405391   90.106939   
min       0.000000   44.000000      24.000000       0.000000    0.000000   
25%       1.000000   99.000000      64.000000       0.000000    0.000000   
50%       3.000000  115.000000      72.000000      23.000000   43.500000   
75%       6.000000  139.000000      80.000000      32.000000  126.000000   
max      13.000000  199.000000     122.000000      60.000000  415.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  688.000000                688.000000  688.000000  688.000000  
mean    32.209157                  0.454555   33.094477    0.329942  
std      6.618891                  0.284527   11.436990    0.470534  
min     18.200000                  

In [6]:
from sklearn.preprocessing import StandardScaler


# Separate features and target columns
X = df.drop(["Outcome"], axis=1)
y = df["Outcome"]

X = X[['Glucose','BMI','Age','Pregnancies','DiabetesPedigreeFunction','BloodPressure','Insulin','SkinThickness']]

# Standardized numerical features
scaler = StandardScaler()
numerical_features = X.select_dtypes(include=["float64", "int64"]).columns  # 仅选择数值型特征
X[numerical_features] = scaler.fit_transform(X[numerical_features])

print(X.head())

    Glucose       BMI       Age  Pregnancies  DiabetesPedigreeFunction  \
0  0.924040  0.210285  1.479220     0.657355                  0.606516   
1 -1.177082 -0.848063 -0.183265    -0.868490                 -0.364220   
2  2.091330 -1.346999 -0.095766     1.267694                  0.764788   
3 -1.043678 -0.621274 -1.058257    -0.868490                 -1.011378   
5 -0.143197 -0.999256 -0.270764     0.352186                 -0.891795   

   BloodPressure   Insulin  SkinThickness  
0      -0.028115 -0.805266       0.923219  
1      -0.515765 -0.805266       0.533462  
2      -0.678315 -0.805266      -1.350366  
3      -0.515765  0.238698       0.143704  
5       0.134435 -0.805266      -1.350366  


In [7]:
# Merge features and target columns
df_p = pd.concat([X, y], axis=1)

print(df_p.head())

    Glucose       BMI       Age  Pregnancies  DiabetesPedigreeFunction  \
0  0.924040  0.210285  1.479220     0.657355                  0.606516   
1 -1.177082 -0.848063 -0.183265    -0.868490                 -0.364220   
2  2.091330 -1.346999 -0.095766     1.267694                  0.764788   
3 -1.043678 -0.621274 -1.058257    -0.868490                 -1.011378   
5 -0.143197 -0.999256 -0.270764     0.352186                 -0.891795   

   BloodPressure   Insulin  SkinThickness  Outcome  
0      -0.028115 -0.805266       0.923219        1  
1      -0.515765 -0.805266       0.533462        0  
2      -0.678315 -0.805266      -1.350366        1  
3      -0.515765  0.238698       0.143704        0  
5       0.134435 -0.805266      -1.350366        0  


In [8]:
# Calculate the correlation between features and target columns
correlation = df_p.corr()["Outcome"].abs().sort_values(ascending=False)
print(correlation)

# Select features with high correlation
selected_features = correlation[correlation > 0.1].index
df_p = df_p[selected_features]

Outcome                     1.000000
Glucose                     0.478614
BMI                         0.298056
Age                         0.246290
Pregnancies                 0.226382
DiabetesPedigreeFunction    0.213872
BloodPressure               0.182521
Insulin                     0.110062
SkinThickness               0.056662
Name: Outcome, dtype: float64


In [9]:
correlation = df_p.corr()["Outcome"].abs().sort_values(ascending=False)
print(correlation)

df_p.head()

Outcome                     1.000000
Glucose                     0.478614
BMI                         0.298056
Age                         0.246290
Pregnancies                 0.226382
DiabetesPedigreeFunction    0.213872
BloodPressure               0.182521
Insulin                     0.110062
Name: Outcome, dtype: float64


Unnamed: 0,Outcome,Glucose,BMI,Age,Pregnancies,DiabetesPedigreeFunction,BloodPressure,Insulin
0,1,0.92404,0.210285,1.47922,0.657355,0.606516,-0.028115,-0.805266
1,0,-1.177082,-0.848063,-0.183265,-0.86849,-0.36422,-0.515765,-0.805266
2,1,2.09133,-1.346999,-0.095766,1.267694,0.764788,-0.678315,-0.805266
3,0,-1.043678,-0.621274,-1.058257,-0.86849,-1.011378,-0.515765,0.238698
5,0,-0.143197,-0.999256,-0.270764,0.352186,-0.891795,0.134435,-0.805266


In [10]:
from google.colab import files

df_p.to_csv('data1_p.csv', index=False)
files.download('data1_p.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
from imblearn.over_sampling import SMOTE

X = df_p.drop("Outcome", axis=1)
y = df_p["Outcome"]

# Use SMOTE to oversample minority classes
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

print(y_res.value_counts())

Outcome
1    461
0    461
Name: count, dtype: int64


In [12]:
from sklearn.model_selection import train_test_split

# partitioned data set
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42, stratify=y_res)

print(y_train.value_counts())
print(y_test.value_counts())

Outcome
1    369
0    368
Name: count, dtype: int64
Outcome
0    93
1    92
Name: count, dtype: int64
