# BACS3013 Data Science Assignment - Diabetes

In [None]:
# Import the libraries used throughout the notebook
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
from scipy.stats import ttest_ind
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import GaussianNB 

## Import the data

In [None]:
# Load the diabetes dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='diabetes_data_upload.csv')

## Data Preprocessing

In [None]:
# Separate the features from the target and encode them numerically before
# splitting.
#
# At this point df still holds the raw text values that the mapping cell
# further down converts (Male/Female, Yes/No, Negative/Positive). The
# estimators and mean_squared_error used in the modelling cells need numbers,
# so the same encoding is applied here to X and y only; df itself is left
# untouched for the exploration and mapping cells that follow.
feature_encoding = {'No': 0, 'Yes': 1, 'Male': 0, 'Female': 1}
target_encoding = {'Negative': 0, 'Positive': 1}

X = df.drop(columns=['class'])  # Features (excluding the target variable)
# Only non-numeric columns are mapped, so re-running this cell after df has
# already been encoded is a no-op instead of turning every value into NaN.
for feature in X.select_dtypes(exclude='number').columns:
    X[feature] = X[feature].map(feature_encoding)

y = df['class']  # Target variable
if not pd.api.types.is_numeric_dtype(y):
    y = y.map(target_encoding)

# Split the data into training (80%) and testing (20%) sets; the fixed seed
# keeps the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# True if at least one cell anywhere in the DataFrame is missing
df.isna().any().any()

In [None]:
# Preview the first five rows of the data
df.head(n=5)

In [None]:
# Preview the last five rows of the data
df.tail(n=5)

In [None]:
# List the column labels of the DataFrame
df.columns

In [None]:
# Read both dimensions of the DataFrame straight from its shape tuple
rows, cols = df.shape

# Report the number of rows and columns
print(f"Number of Rows: {rows}")
print(f"Number of Columns: {cols}")

In [None]:
# Per-column flag: True where the column holds at least one missing value
df.isnull().any(axis=0)

In [None]:
# Print pandas' summary of the DataFrame
df.info()

In [None]:

# Encode the text columns as 0/1 codes. Series.map turns any value that is
# missing from its mapping into NaN.

# Gender: Male -> 0, Female -> 1
gender_codes = {'Male': 0, 'Female': 1}
df['Gender'] = df['Gender'].map(gender_codes)

# Columns recorded as Yes/No: No -> 0, Yes -> 1
binary_columns = [
    'Polyuria', 'Polydipsia', 'sudden weight loss', 'weakness', 'Polyphagia',
    'Genital thrush', 'visual blurring', 'Itching', 'Irritability',
    'delayed healing', 'partial paresis', 'muscle stiffness', 'Alopecia',
    'Obesity',
]
yes_no_codes = {'No': 0, 'Yes': 1}
df[binary_columns] = df[binary_columns].apply(lambda answers: answers.map(yes_no_codes))

# Target: Negative -> 0, Positive -> 1
class_codes = {'Negative': 0, 'Positive': 1}
df['class'] = df['class'].map(class_codes)

# Show the first rows so the encoding can be checked by eye
df.head()

In [None]:
# Rank every column by the absolute value of its correlation with the target
# (df.corr() computes Pearson correlation by default)
class_correlations = df.corr()['class']
correlation_with_class = class_correlations.abs().sort_values(ascending=False)

print(correlation_with_class)

In [None]:
# Draw the pairwise correlation matrix as an annotated heatmap on a 20x15 figure
corr_fig, corr_ax = plt.subplots(figsize=(20, 15))
sns.heatmap(df.corr(), cmap="YlGnBu", annot=True, ax=corr_ax)

In [None]:
# Show pandas' descriptive statistics for the DataFrame
df.describe()

In [None]:
# Univariate feature selection: keep only the features whose values differ
# significantly between the positive (class == 1) and negative (class == 0)
# groups according to an independent two-sample t-test.
SIGNIFICANCE_LEVEL = 0.05

# The two class subsets do not change per feature, so filter them once
positive_rows = df[df['class'] == 1]
negative_rows = df[df['class'] == 0]

p_values = []
for column in df.columns.drop('class'):
    _, p_value = ttest_ind(positive_rows[column], negative_rows[column])
    p_values.append((column, p_value))

# Select columns with p-value less than the significance level.
# A NaN p-value compares False here, so such a column is dropped as well.
significant_columns = [col for col, p_value in p_values if p_value < SIGNIFICANCE_LEVEL]

# Keep only the target and the significant columns
df = df[['class'] + significant_columns]

print(df)

# Rebuild the train/test split from the reduced frame. The earlier split was
# taken before this selection, so without this step the modelling cells would
# keep training on the old feature set and the selection would have no effect.
# NOTE(review): the t-tests above use every row, including rows that land in
# the test set, which leaks test information into the selection; consider
# selecting on the training rows only.
X = df[significant_columns]
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Data Preparation


### Data Cleaning and Outlier Removal


### Feature Engineering

### Split Data Into Training and Testing Sets, Normalize Data


## Modeling

### K-Nearest Neighbours (KNN)

In [None]:
# Train a 5-nearest-neighbours classifier and evaluate it on the test set.

# Each prediction is decided by the labels of the 5 closest training samples
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict labels for the test samples
knn_pred = knn.predict(X_test)

# Accuracy as a fraction, then as a percentage rounded to 3 decimals
accuracyScore = accuracy_score(y_test, knn_pred)
print(f"Accuracy Score: {accuracyScore}")

roundScore = round(accuracyScore * 100, 3)
print(f"Accuracy Score:  {roundScore}")

# Root of the mean squared error between true and predicted labels
mse = mean_squared_error(y_test, knn_pred)
rmse = math.sqrt(mse)
print(f"RMSE: {rmse:f}")

# Confusion matrix: counts of true class versus predicted class
conf_matrix = confusion_matrix(y_test, knn_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Text report produced by sklearn's classification_report
class_report = classification_report(y_test, knn_pred)
print("Classification Report:")
print(class_report)

In [None]:
# Plot the KNN confusion matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 10))

# confusion_matrix puts the true labels on the rows and the predictions on the
# columns, so it is plotted untransposed: y axis = true label, x axis =
# predicted label. This keeps the tick labels and axis labels in agreement and
# matches the orientation of the logistic regression heatmap.
mat = confusion_matrix(y_test, knn_pred)  # target variable and prediction
sns.heatmap(mat, square=True, annot=True, fmt='d', cbar=False, cmap="YlGnBu",
            xticklabels=["Predicted 0", "Predicted 1"],
            yticklabels=["True 0", "True 1"],
            ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel('Predicted Label')
# The trailing semicolon suppresses the notebook's text output for the last call
ax.set_ylabel('True Label');

### Support Vector Machine (SVM / SVC)

### Random Forest

### Logistic Regression

In [None]:
# Train a logistic regression classifier and report its test-set accuracy.

# Build the classifier with scikit-learn's default settings and fit it on the
# training data (fit returns the fitted estimator itself)
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the test samples
y_pred = model.predict(X_test)

# Fraction of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Plot the logistic regression confusion matrix
# (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
lr_ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["Predicted 0", "Predicted 1"],
    yticklabels=["True 0", "True 1"],
)
lr_ax.set(xlabel="Predicted", ylabel="True", title="Confusion Matrix")
plt.show()

## Evaluation

## Deployment