# Exploratory Analysis

## Importing all used libraries

In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [60]:
# Read the dataset from the CSV file and preview its first five rows:
df = pd.read_csv(filepath_or_buffer="dataset.csv")
df.head(n=5)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


## Exploring the dataset

In [61]:
# Print the dataset shape as (rows, columns), followed by the summary
# statistics (count, mean, std, min, quartiles, max) that describe()
# computes for the numeric columns -- here, the age of the people in the dataset:
print(df.shape, df.describe(), sep="\n")

(520, 17)
              Age
count  520.000000
mean    48.028846
std     12.151466
min     16.000000
25%     39.000000
50%     47.500000
75%     57.000000
max     90.000000


In [62]:
# Histogram of the "Age" column:
fig = px.histogram(
    data_frame=df,
    x="Age",
    title="Distribution of ages",
)
fig.show()

In [63]:
# Age histogram with bars colored by "Gender"; the marginal violin plot
# above it summarizes the age distribution of each gender:
fig = px.histogram(
    data_frame=df,
    x="Age",
    color="Gender",
    marginal="violin",
    title="Distribution of gender in ages",
)
fig.show()

In [64]:
# Age histogram with bars colored by the diabetes result ("class"); the
# marginal violin plot above it summarizes the age distribution of each result:
fig = px.histogram(
    data_frame=df,
    x="Age",
    color="class",
    marginal="violin",
    title="Distribution of class result in ages",
)
fig.show()

In [65]:
# relationship between some features and diabetes result:

# Work on a real copy of the dataset. A plain `df2 = df` only binds a second
# name to the same DataFrame, so relabelling "class" would also overwrite
# df["class"]; running this cell a second time would then find no "Positive"
# value and turn every row into "No".
df2 = df.copy()

# changing the class text to "Yes" or "No", just to simplify the graphics below:
df2["class"] = np.where(df["class"].str.contains("Positive"), "Yes", "No")

# creating lists of some characteristics, to suit the "graph_objects":
x0 = list(df2["Obesity"])       # list with obesity data
x1 = list(df2["class"])         # list with class data (drawn in every subplot)
x2 = list(df2["Polyuria"])      # list with polyuria data
x3 = list(df2["Polydipsia"])    # list with polydipsia data
x4 = list(df2["Alopecia"])      # list with alopecia data

# reserving space for a 2x2 grid of subplots:
fig = make_subplots(rows=2, cols=2)

# one entry per subplot: (feature values, feature name, legend group, row, col)
panels = [
    (x0, "Obesity", "group", 1, 1),        # upper left corner
    (x2, "Polyuria", "group2", 1, 2),      # upper right corner
    (x3, "Polydipsia", "group3", 2, 1),    # bottom left corner
    (x4, "Alopecia", "group4", 2, 2),      # bottom right corner
]

# adding two traces to each subplot, both listed in the legend and sharing the
# subplot's legend group: first the feature itself, then the diabetes outcome class
for values, feature, group, row, col in panels:
    fig.add_trace(
        go.Histogram(x=values, showlegend=True, legendgroup=group, text=feature, name=feature),
        row=row,
        col=col,
    )
    fig.add_trace(
        go.Histogram(x=x1, showlegend=True, legendgroup=group, text="class", name="class"),
        row=row,
        col=col,
    )

fig.show()  # plotting

In [67]:
# Parallel categories diagram (also known as parallel sets or alluvial diagram)
# built from the dataset with plotly express' default settings:
fig = px.parallel_categories(data_frame=df)
fig.show()  # plotting