In [None]:
'''

Data Preprocessing:

a. Handling Missing Values:
We identify if there are any missing values in our dataset.
Then decide on an appropriate strategy for handling missing values.
Common approaches include:
Removing rows with missing values, inputing missing values using mean, median, or other statistical methods.

b. Normalizing/Standardizing Numerical Features:
We check the distribution of numerical features.
Then apply normalization or standardization to ensure that all numerical features are on a similar scale.
Common methods include Min-Max scaling or Standard Scaling (z-score normalization).

c. Encoding Categorical Variables:
If there are categorical variables in our dataset, we'll need to encode them for the machine learning model.
Common encoding methods include:
One-Hot Encoding: For nominal categorical variables.
Label Encoding: For ordinal categorical variables.

d. Splitting the Dataset:
We Split our dataset into training and testing sets to evaluate the model's performance. A typical split might be
80% for training and 20% for testing.
Then use tools like scikit-learn's train_test_split function.

'''

In [1]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load dataset
df = pd.read_csv('diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
# Handling Missing Values
df = df.dropna() 

In [4]:
# Extract features and target variable
X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [5]:
# Normalizing Numerical Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
# Check for categorical variables
categorical_columns = df.select_dtypes(include=['object']).columns

if len(categorical_columns) > 0:
    print("Categorical variables present in the dataset:")
    print(categorical_columns)
else:
    print("No categorical variables found in the dataset.")

No categorical variables found in the dataset.


In [7]:
# Splitting the Dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Check the shapes of the resulting datasets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (614, 8) (614,)
Testing set shape: (154, 8) (154,)


In [None]:
# Model Selection and Training (Using Logistic Regression):
# We choose a machine learning algorithm for classification.
# Common algorithms include Logistic Regression, Decision Trees, Random Forests, Support Vector Machines (SVM), or Neural
# Networks.

In [8]:
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

In [9]:
# Choose and initialize your model
model = LogisticRegression()

# Train the model on the training set
model.fit(X_train, y_train)

In [None]:
# Model Evaluation:
# Evaluating the performance of the model on the testing set. We use metrics such as accuracy, precision, recall,
# F1 score and the confusion matrix.

In [11]:
# Import libraries for model evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [12]:
# Predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Accuracy: 0.7532467532467533
Confusion Matrix:
 [[79 20]
 [18 37]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154



In [None]:
# Hyperparameter Tuning:
# Experiment with different hyperparameter values to optimize the model's performance. We can use techniques like
# Grid Search or Random Search for hyperparameter tuning.

In [13]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

In [14]:
# Define hyperparameters and their values
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Perform Grid Search
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'C': 10}


In [None]:
# Feature Importance and Explainability:
# If applicable to our chosen model, we can explore feature importance to understand which features contribute
# most to predictions.

In [15]:
# For logistic regression, we can get feature coefficients
feature_importance = model.coef_[0]

# Print feature importance
for feature, importance in zip(X.columns, feature_importance):
    print(f"{feature}: {importance}")

Pregnancies: 0.2162191146191342
Glucose: 1.0693043509966906
BloodPressure: -0.25871749559858703
SkinThickness: 0.04726609969275953
Insulin: -0.19919621066370874
BMI: 0.7920861979489214
DiabetesPedigreeFunction: 0.2269584099400954
Age: 0.43005814134198533


In [None]:
# Further Improvements (Using Random Forest):
# Consider further refining your model or exploring advanced techniques like ensemble methods, cross-validation
# or feature engineering to enhance predictive performance.
# Below is an example of using an ensemble method (Random Forest)

In [16]:
# Import Random Forest
from sklearn.ensemble import RandomForestClassifier

In [17]:
# Train the model on training set
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

In [18]:
# Import libraries for model evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [19]:
# Predictions on the testing set using Random Forest model
y_pred_rf = rf_model.predict(X_test)

# Evaluate the Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
class_report_rf = classification_report(y_test, y_pred_rf)

print("Random Forest Model Evaluation:")
print("Accuracy:", accuracy_rf)
print("Confusion Matrix:\n", conf_matrix_rf)
print("Classification Report:\n", class_report_rf)

Random Forest Model Evaluation:
Accuracy: 0.7532467532467533
Confusion Matrix:
 [[78 21]
 [17 38]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.79      0.80        99
           1       0.64      0.69      0.67        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.74       154
weighted avg       0.76      0.75      0.76       154

