In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf

# Functions

## Data Cleaning

In [None]:
def display_countplot(df, x, title=None, filename=None, x_ticks=None):
    """Draw a seaborn countplot for one column of ``df`` and optionally save it.

    Args:
        df: dataframe that contains the data to visualise.
        x: name of the column of ``df`` on which the countplot is built.
        title: title of the plot. No title is set when None.
        filename: path of the output file without extension; ``.png`` is
            appended. Missing parent directories are created. Nothing is
            saved when None or empty.
        x_ticks: tick positions for the x axis (list, tuple or array). The
            default ticks are kept when None or empty.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(data=df, x=x, ax=ax)

    if title is not None:
        ax.set_title(title)

    # `if x_ticks:` raises for numpy arrays (ambiguous truth value), so test
    # for None and emptiness explicitly; an empty sequence still means
    # "keep the default ticks", as before.
    if x_ticks is not None and len(x_ticks) > 0:
        ax.set_xticks(x_ticks)

    if filename:
        out_path = Path(f'{filename}.png')
        # savefig does not create directories; make sure the target exists.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out_path, bbox_inches='tight')

    plt.show()

# Data Cleaning and Preprocessing

In [None]:
# Column labels applied to the CSV through `names=`
# (presumably the file has no header row of its own - confirm).
columns = [
    'pregnancies',
    'glucose_concentration',
    'blood_pressure',
    'triceps_skin_thickness',
    'serum_insulin',
    'bmi',
    'diabetes_pedigree_function',
    'age',
    'has_diabetes',
]
df = pd.read_csv('data/pima-indians-diabetes.csv', names=columns)

# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print a summary of the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Number of rows where 'pregnancies' is below zero.
int((df['pregnancies'] < 0).sum())

### Explore unique values of each column

In [None]:
# Print the distinct values of every column, one line per column.
# The header is a plain string: it has no placeholders, so no f-prefix is needed.
print('Unique values of each column of dataset')
for column in df.columns:
    print(f"{column}: {df[column].unique()}")

#### Explore the 'pregnancies' column

In [None]:
# Distinct values of 'pregnancies', in order of first appearance.
pd.unique(df['pregnancies'])

In [None]:
# Countplot of the 'pregnancies' column, also written to graphs/pregnancies_countplot.png.
display_countplot(
    df=df,
    x='pregnancies',
    title='Number of times pregnant',
    filename='graphs/pregnancies_countplot',
)

#### Explore the 'glucose_concentration' column

In [None]:
# Distinct glucose readings, sorted ascending like the other per-column
# explorations in this notebook so the smallest values (e.g. 0) are easy to spot.
glucose_arr = np.sort(df['glucose_concentration'].unique())
glucose_arr

In [None]:
# Number of rows whose 'glucose_concentration' is recorded as 0.
int((df['glucose_concentration'] == 0).sum())

#### Explore the 'blood_pressure' column

In [None]:
# Distinct 'blood_pressure' values in ascending order.
blood_pressure_arr = np.sort(df['blood_pressure'].unique())
blood_pressure_arr

In [None]:
# Number of rows whose 'blood_pressure' is recorded as 0.
int((df['blood_pressure'] == 0).sum())

#### Explore the 'triceps_skin_thickness' column

In [None]:
# Distinct 'triceps_skin_thickness' values in ascending order.
triceps_skin_thickness = np.sort(df['triceps_skin_thickness'].unique())
triceps_skin_thickness

In [None]:
# Number of rows whose 'triceps_skin_thickness' is recorded as 0.
int((df['triceps_skin_thickness'] == 0).sum())

##### Solution: Imputation

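Imputation: if a feature is important enough to keep, its missing values can be filled in rather than dropped. Here the zeros in `triceps_skin_thickness` are presumably placeholders for missing measurements (a skin-fold thickness of 0 is not physiologically plausible), so they are candidates for imputation. Simple options are mean or median imputation; more advanced methods such as k-nearest neighbors imputation are also possible. Imputation retains the valuable information in the affected rows while filling in the gaps.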

#### Explore the 'serum_insulin' column

In [None]:
# Distinct 'serum_insulin' values in ascending order.
serum_insulin = np.sort(df['serum_insulin'].unique())
serum_insulin

In [None]:
# Number of rows whose 'serum_insulin' is recorded as 0.
int((df['serum_insulin'] == 0).sum())

#### Explore the 'bmi' column

In [None]:
# Distinct 'bmi' values in ascending order.
bmi = np.sort(df['bmi'].unique())
bmi

In [None]:
# Number of rows whose 'bmi' is recorded as 0.
int((df['bmi'] == 0).sum())

#### Explore the 'diabetes_pedigree_function' column

In [None]:
# Distinct 'diabetes_pedigree_function' values in ascending order.
diabetes_pedigree_func_arr = np.sort(df['diabetes_pedigree_function'].unique())
diabetes_pedigree_func_arr

#### Explore the 'age' column

In [None]:
# Distinct 'age' values in ascending order.
age_arr = np.sort(df['age'].unique())
age_arr

#### Explore the 'has_diabetes' column

In [None]:
# Distinct values of the 'has_diabetes' column, in order of first appearance.
pd.unique(df['has_diabetes'])