In [None]:
#imports
import sys
sys.path.append("../")
from utils import *

# Explicit imports for the names the cells below use directly, so they do not
# depend on whatever `from utils import *` happens to re-export.
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:

# Load the two raw data sets used by the rest of the notebook.
students_df = pd.read_csv("../datasets/Students_Performance.csv")
temperature_df = pd.read_csv("../datasets/City_Temperature.csv")

# Rows whose "Temp" equals this value are dropped. Presumably -99 is the data
# set's placeholder for a missing reading -- TODO confirm against the source.
MISSING_TEMP_SENTINEL = -99
temperature_df = temperature_df.loc[temperature_df["Temp"] != MISSING_TEMP_SENTINEL]

# A cell renders only its last expression, so a single preview is kept here;
# `.head()` calls in the middle of a cell compute a value that is never shown.
temperature_df.head()

In [None]:
# Bar charts: number of students per ethnicity group, first overall and then
# with each bar coloured by gender.
ethnicity_col = "race.ethnicity"

ethnicity_counts = students_df.groupby([ethnicity_col]).size().reset_index(name='Count')
px.bar(ethnicity_counts, x=ethnicity_col, y="Count", height=200).show()

df_count_ethnicities = (
    students_df
    .groupby([ethnicity_col, 'gender'])
    .size()
    .reset_index(name='Count')
)
px.bar(df_count_ethnicities, x=ethnicity_col, y="Count", color="gender", height=200).show()

# Pie charts: one figure per value of `split_by`, each showing how the rows
# with that value are distributed over the values of `colored_by`.
colored_by = "test.preparation.course"
split_by = 'parental.level.of.education'

for level in students_df[split_by].unique():
    in_level = students_df[split_by] == level
    level_counts = (
        students_df.loc[in_level]
        .groupby([colored_by])
        .size()
        .reset_index(name='Count')
    )
    px.pie(level_counts, values='Count', names=colored_by, title=level, height=150).show()

In [None]:

# Histograms of the three exam-score columns, drawn side by side in one row.
# The order of `score_columns` must match the order of `subplot_titles`.
score_columns = ["math.score", "writing.score", "reading.score"]

fig = make_subplots(rows=1, cols=len(score_columns),
                    subplot_titles=("Math score distribution", "Writing score distribution", "Reading score distribution"))

for col, label in enumerate(score_columns, start=1):
    # `add_trace` with row/col is the supported API; `append_trace` is deprecated.
    fig.add_trace(go.Histogram(x=students_df[label], showlegend=False), row=1, col=col)
    fig.update_xaxes(title_text=label.capitalize(), row=1, col=col)

fig.update_layout(height=300).show()

In [None]:

# Scatter plots of the math score (x axis) against two other scores (y axis),
# one subplot per score, with every marker coloured by gender.

# Integer code per gender. pd.Categorical orders string categories
# lexicographically, so with the values "female"/"male" this gives
# female -> 0 and male -> 1 (assumes those are the only values -- TODO confirm).
students_df["gender.cat"] = pd.Categorical(students_df["gender"]).codes

fig = make_subplots(rows=1, cols=2, start_cell="bottom-left")

# NOTE(review): "science.score" is not among the score columns used by the
# histogram cell (math / writing / reading) -- confirm the column exists.
y_axes = [("reading.score", "Reading Score"), ("science.score", "Science Score")]

for col, (y_column, y_title) in enumerate(y_axes, start=1):
    fig.add_trace(go.Scatter(x=students_df["math.score"], y=students_df[y_column], mode="markers",
                             marker=dict(color=students_df["gender.cat"], colorscale="Bluered"),
                             showlegend=False),
                  row=1, col=col)
    # The math score is what is plotted on x; the per-subplot score is on y.
    fig.update_xaxes(title_text="Math Score", row=1, col=col)
    fig.update_yaxes(title_text=y_title, row=1, col=col)

# The data traces are coloured through a colourscale and therefore have no
# useful legend entry. Add empty traces whose only purpose is to show one
# legend item per gender; the colours mirror the two ends of "Bluered"
# (lowest code -> blue, highest code -> red).
for gender, colour in (("female", "Blue"), ("male", "Red")):
    fig.add_trace(go.Scatter(x=[None], y=[None], mode='markers',
                             marker=dict(color=colour), legendgroup=gender, name=gender),
                  row=1, col=1)

fig.show()

In [None]:
# Heat maps of the ethnicity x gender contingency table: raw counts, counts
# normalised so each row sums to 1, and counts normalised so each column sums to 1.
df_count_ethnicities = students_df.groupby(['race.ethnicity', 'gender']).size().reset_index(name='Count')

# Pivot into an (ethnicity x gender) table rather than reshaping to a
# hard-coded (5, 2): this works for any number of groups and keeps the cells
# aligned with their labels even if some combination is absent (filled with 0).
count_table = df_count_ethnicities.pivot(index="race.ethnicity", columns="gender", values="Count").fillna(0)

x_ = count_table.index.to_numpy()     # ethnicity labels (rows of `values`)
y_ = count_table.columns.to_numpy()   # gender labels (columns of `values`)

values = count_table.to_numpy()
values_norm_row = values / values.sum(axis=1, keepdims=True)
values_norm_col = values / values.sum(axis=0, keepdims=True)

# Rows of `z` map to the heat map's y axis and columns to its x axis, hence
# the gender labels go to `x` and the ethnicity labels to `y`.
for title, z in [["Counts Heatmap", values], ["Row Normalized", values_norm_row], ["Column Normalized", values_norm_col]]:
    go.Figure(go.Heatmap(x=y_, y=x_, z=z), layout=go.Layout(title=title, height=300, width=200)).show()

In [1]:
### Estimating Expectation of Distribution

In [2]:
# For each sample size m, draw m samples from N(mu, sigma^2), estimate the
# mean by the sample mean, and plot the estimates against the true mu.

# Sample sizes. The builtin `int` is used because the `np.int` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
ms = np.linspace(2, 200, 200).astype(int)
mu, sigma = 5, 2
estimated_mean = [np.mean(np.random.normal(mu, sigma, size=m)) for m in ms]


go.Figure([go.Scatter(x=ms, y=estimated_mean, mode='markers+lines', name=r'$\widehat\mu$'),
           go.Scatter(x=ms, y=[mu]*len(ms), mode='lines', name=r'$\mu$')],
          layout=go.Layout(title=r"$\text{(5) Estimation of Expectation As Function Of Number Of Samples}$",
                  xaxis_title="$m\\text{ - number of samples}$",
                  # Raw-string prefix belongs outside the quotes: the title must
                  # start and end with `$` for plotly to typeset it as LaTeX.
                  yaxis_title=r"$\hat\mu$",
                  height=300)).show()

NameError: name 'np' is not defined

In [None]:
### Estimating Variance Of Distribution

In [None]:
# For each sample size m, draw m samples from N(mu, sigma^2), estimate the
# variance with the ddof=1 sample variance, and plot the estimates against the
# true sigma^2. Relies on `ms`, `mu` and `sigma` defined by the expectation cell.
estimated_sigmas = [np.random.normal(mu, sigma, size=m).var(ddof=1) for m in ms]


go.Figure([go.Scatter(x=ms, y=estimated_sigmas, mode='markers+lines', name=r'$\widehat\sigma^2$'),
           go.Scatter(x=ms, y=[sigma**2]*len(ms), mode='lines', name=r'$\sigma^2$')],
         layout=go.Layout(title=r"$\text{(6) Estimation of Variance As Function Of Number Of Samples}$",
                          xaxis_title="$m\\text{ - number of samples}$",
                          # Raw-string prefix belongs outside the quotes: the title must
                          # start and end with `$` for plotly to typeset it as LaTeX.
                          yaxis_title=r"$\hat\sigma^2$",
                          height=300)).show()