In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Location of the diabetes CSV.
# The default is the original author's absolute path, which only exists on that
# machine; set the DIABETES_CSV environment variable to point at a local copy
# instead of editing the notebook.
file_path = os.environ.get(
    'DIABETES_CSV',
    'D:/K1_nam4/인공지능개론/artificial-intelligence/AI_class/artificial-intelligence/week5/dataFrame/diabetes.csv',
)

In [None]:
# Read the CSV at `file_path` into a DataFrame, then preview its first five rows
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

In [None]:
# Count the missing (NaN) entries in each column
df.isna().sum()

In [None]:
# Show the dtype pandas assigned to every column of df
df.dtypes

In [None]:
# Show the size of the loaded frame as (number of rows, number of columns)
df.shape

In [None]:
# Tally how many rows fall into each value of the 'Outcome' target column
outcome_counts = df['Outcome'].value_counts()
outcome_counts

In [None]:
# Bar chart of the number of rows per 'Outcome' value
fig, ax = plt.subplots()
sns.countplot(x='Outcome', data=df, ax=ax)
ax.set(title='Distribution of Diabetes Outcome', xlabel='Outcome', ylabel='Count')
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Number of rows that are exact duplicates of an earlier row
duplicate_mask = df.duplicated()
duplicate_mask.sum()

In [None]:
# Per-column count of null entries
missing_per_column = df.isnull().sum()
missing_per_column

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the 'Age' column
age_column = df['Age']
age_column.describe()

In [None]:
# 30-bin histogram of the 'Age' column with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Age'], bins=30, kde=True, ax=ax)
ax.set(title='Distribution of Age', xlabel='Age', ylabel='Frequency')
plt.show()

In [None]:
# 30-bin histogram of the 'Pregnancies' column with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Pregnancies'], bins=30, kde=True, ax=ax)
ax.set(title='Distribution of Pregnancies', xlabel='Pregnancies', ylabel='Frequency')
plt.show()

In [None]:
# Features: every column except the target.
X = df.drop('Outcome', axis=1)

# Target: keep 'Outcome' as a single 1-D label vector.
# The network defined below ends in ONE sigmoid unit, so the labels must have
# shape (n,). One-hot encoding them (pd.get_dummies) yields shape (n, 2), which
# either raises a shape error or is silently broadcast against the (n, 1)
# predictions, so the loss is computed against both complementary columns.
# NOTE(review): assumes 'Outcome' is already coded 0/1 — confirm with the
# value_counts() cell above.
y = df['Outcome'].to_numpy()

# Hold out 20% of the rows for testing. `stratify=y` keeps the class ratio the
# same in both splits; `random_state` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Shapes of the train/test feature and label splits, in that order
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# View the feature frame as a plain NumPy array
X.to_numpy()

In [None]:
# Display the target array `y` that was passed to train_test_split above
y

In [None]:
import tensorflow as tf

# Fix the Python, NumPy and TensorFlow random seeds before any layer is built so
# that weight initialisation and fit-time shuffling are repeatable across
# Restart & Run All. 42 is the same seed the train/test split uses.
tf.keras.utils.set_random_seed(42)

# Small fully connected binary classifier:
# input (one value per feature column) -> Dense(16, relu) -> Dense(8, relu)
# -> Dense(1, sigmoid).
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  # binary classification, so a single sigmoid output unit
])


In [None]:
# Configure training: Adam optimizer, with mean squared error used both as the
# loss and as the tracked metric (reported under the 'mse' / 'val_mse' keys).
model.compile(
    loss='mse',
    optimizer='adam',
    metrics=['mse'],
)

In [None]:
# Fit for 30 epochs in mini-batches of 32; the held-out split is evaluated at
# the end of every epoch and recorded in `history` under the 'val_*' keys.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=30,
    validation_data=(X_test, y_test),
)

In [None]:
# Training vs. validation loss per epoch
fig, ax = plt.subplots()
for history_key, curve_label in (('loss', 'train_loss'), ('val_loss', 'val_loss')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set(title='Loss', xlabel='Epochs', ylabel='Loss')
ax.legend()
plt.show()

In [None]:
# Training vs. validation MSE metric per epoch
fig, ax = plt.subplots()
for history_key, curve_label in (('mse', 'train_mse'), ('val_mse', 'val_mse')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set(title='MSE', xlabel='Epochs', ylabel='MSE')
ax.legend()
plt.show()

In [None]:
# Score the trained model on the held-out split; the cell output is the list
# [loss, metric values...] returned by Keras.
model.evaluate(x=X_test, y=y_test)

In [None]:
# model.evaluate returns the loss followed by the compiled metrics. The model is
# compiled with metrics=['mse'], so the second value is MSE — not accuracy.
loss, mse = model.evaluate(X_test, y_test)

# Derive the real classification accuracy: threshold the sigmoid probabilities
# at 0.5 and compare against the true class labels.
y_prob = model.predict(X_test, verbose=0).ravel()
y_pred = (y_prob > 0.5).astype(int)

# Accept either a 1-D label vector or a two-column one-hot target
# (column index == class) so this cell works with both label layouts.
y_true = pd.DataFrame(y_test).to_numpy()
y_true = y_true.argmax(axis=1) if y_true.shape[1] > 1 else y_true.ravel().astype(int)

accuracy = accuracy_score(y_true, y_pred)
print(f"Loss: {loss:.4f}, MSE: {mse:.4f}, Accuracy: {accuracy:.4f}")
