In [40]:
import pandas as pd

### Iris

In [2]:
# Column layout of the raw file: four measurements followed by the class label.
attr_names = ["sepal-length", "sepal-width", "petal-length", "petal-width"]
target = ["species"]

# Display names for the classes, paired below with the raw labels they replace.
species = ["Setosa", "Versicolor", "Virginica"]
display_names = dict(zip(["Iris-setosa", "Iris-versicolor", "Iris-virginica"], species))

# The file is read without a header row, so column names are assigned afterwards.
iris = pd.read_csv("data/iris/iris.data", header=None)
iris.columns = [*attr_names, *target]
iris = iris.replace(display_names)

In [41]:
# scatterplot matrix

from plotly.subplots import make_subplots
import plotly.graph_objects as go

# 4x4 grid: histograms on the diagonal, scatter plots above it and
# 2D density contours below it.
specs = [
    [
        {"type": "histogram" if c == r else "scatter" if c > r else "histogram2dcontour"}
        for c in range(4)
    ]
    for r in range(4)
]

# Subplot titles in row-major order. Off-diagonal titles always list the
# lower-indexed attribute first.
titles = [
    feature if j == i
    else f"{feature} / {pair}" if j > i
    else f"{pair} / {feature}"
    for i, feature in enumerate(attr_names)
    for j, pair in enumerate(attr_names)
]

fig = make_subplots(
    rows=4,
    cols=4,
    specs=specs,
    subplot_titles=titles,
)

colors = {
    "Setosa": "CornflowerBlue",
    "Versicolor": "IndianRed",
    "Virginica": "DarkSeaGreen",
}

# diagonal: one histogram per attribute over the whole data set
for pos, feature in enumerate(attr_names, start=1):
    histogram = go.Histogram(
        x=iris[feature],
        name=feature,
        marker_color="CadetBlue",
        legendgroup="Histogram"
    )
    fig.add_trace(histogram, row=pos, col=pos)

# scatter: upper triangle, one trace per species in each cell.
# Each species gets a legend entry only for its first trace.
show_legend = {s: True for s in species}
rows_by_species = {s: iris[iris["species"] == s] for s in species}

for row, feature in enumerate(attr_names, start=1):
    for col, pair in enumerate(attr_names, start=1):
        if col <= row:
            continue
        for s in species:
            subset = rows_by_species[s]
            points = go.Scatter(
                x=subset[feature],
                y=subset[pair],
                name=s,
                legendgroup="Scatter",
                showlegend=show_legend[s],
                mode="markers",
                marker_color=colors[s]
            )
            fig.add_trace(points, row=row, col=col)
            show_legend[s] = False

# contour: lower triangle, density of the whole data set
for row, feature in enumerate(attr_names, start=1):
    for col, pair in enumerate(attr_names[:row - 1], start=1):
        density = go.Histogram2dContour(
            x=iris[pair],
            y=iris[feature],
            colorscale="Emrld",
            showscale=False,
        )
        fig.add_trace(density, row=row, col=col)

fig.update_layout(
    height=900,
    width=1250,
    title="Iris 1. Paired scatterplot matrix"
)

fig.update_coloraxes(colorbar_len=0.01)

fig.show()




In [42]:
# histograms

# One histogram trace per (feature, species) pair, ordered feature-major:
# each consecutive run of len(species) traces belongs to one feature.
# NOTE: `colors` must be the species -> color dict here. The box/violin cell
# rebinds `colors` to a list, so re-running this cell after it raises.
data = []
for feature in attr_names:
    for s in species:
        x = iris[feature].loc[iris["species"] == s]
        data.append(
            go.Histogram(
                x=x,
                name=s,
                marker_color=colors[s],
                visible=False,
                xbins={
                    "start": 0,
                    "end": 200,
                    "size": .1
                }
            )
        )

fig = go.Figure(data=data)

n_attrs = len(attr_names)
n_species = len(species)

# Show the first feature's traces initially, matching the active button below.
for i in range(n_species):
    fig.data[i].visible = True

# One button per feature; pressing it shows only that feature's traces.
buttons = []
for k, feature in enumerate(attr_names):
    visible = [False] * len(fig.data)
    first = k * n_species
    # The slice width equals the number of values assigned, so the list keeps
    # exactly one entry per trace.
    visible[first:first + n_species] = [True] * n_species
    buttons.append({
        "method": "update",
        "args": [{"visible": visible}],
        "label": feature
    })

updatemenus = [{
    "type": "buttons",
    "direction": "left",
    "active": 0,
    "buttons": buttons,
    "xanchor": "left",
    "yanchor": "top",
    "x": 0,
    "y": 1.075
}]

fig.update_layout(
    title="Iris 2. Histograms",
    updatemenus=updatemenus,
    barmode="overlay",
    legend_title="Species"
)

fig.update_traces(opacity=.75)
fig.update_layout(width=1250, height=750)
# Last expression of the cell: the returned figure is what the notebook renders.
fig.update_xaxes(nticks=25)

In [43]:
# One row per attribute: box plot in the left column, violin plot in the right.
specs = [[{"type": "histogram"}, {"type": "violin"}] for _ in range(4)]

# Titles in row-major order: box title, then violin title, for each attribute.
titles = [
    f"{kind} for {feature}"
    for feature in attr_names
    for kind in ("Boxplot", "Violinplot")
]

fig = make_subplots(
    rows=4,
    cols=2,
    specs=specs,
    subplot_titles=titles,
)

# One color per attribute, shared by the two plots in its row.
colors = ["SteelBlue", "CadetBlue", "LightSalmon", "Thistle"]

for row, (feature, color) in enumerate(zip(attr_names, colors), start=1):
    values = iris[feature]

    box = go.Box(
        y=values,
        boxpoints="all",
        jitter=.5,
        pointpos=-2,
        name=feature,
        marker_color=color,
        showlegend=False,
        boxmean="sd"
    )
    fig.add_trace(box, row=row, col=1)

    violin = go.Violin(
        y=values,
        marker_color=color,
        name=feature,
        showlegend=False
    )
    fig.add_trace(violin, row=row, col=2)

fig.update_layout(
    title="Iris 3. Boxplots and violinplots",
    width=1250,
    height=1500
)

fig.show()

### Congressional Voting Records

In [6]:
import numpy as np

# The file is read without a header row: column 0 is the party label and the
# remaining columns (labelled 1, 2, ...) are the individual votes.
voting = pd.read_csv("data/voting-records/house-votes-84.data", header=None)

voting = voting.rename(columns = {0: "affiliation"})
affiliation = voting["affiliation"]
voting = voting.drop("affiliation", axis=1)

# Encode votes numerically: yes -> 1, no -> 0, unknown -> NaN.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# The explicit cast keeps the columns numeric even where `replace` does not
# downcast object columns on its own.
voting = voting.replace(["y", "n", "?"], [1, 0, np.nan]).astype(float)

n_votes = len(voting.columns)

In [7]:
# data prep

def _tally(frame):
    """Return per-column (in favor, opposed, missing) counts for `frame`.

    Votes are encoded as 1 / 0 / NaN, so the column sum counts the votes in
    favor, and whatever is neither in favor nor missing is opposed.
    """
    in_favor = frame.sum()
    missing = frame.isna().sum()
    opposed = len(frame) - in_favor - missing
    return in_favor, opposed, missing


# Row subsets per party.
dem_votes = voting.loc[affiliation == "democrat"]
rep_votes = voting.loc[affiliation == "republican"]

N = len(voting)
D = len(dem_votes)
R = len(rep_votes)

all_inf, all_opp, all_nan = _tally(voting)
dem_inf, dem_opp, dem_nan = _tally(dem_votes)
rep_inf, rep_opp, rep_nan = _tally(rep_votes)

# Shared pie styling; the colors are listed in the same order as `labels`.
marker = {
    "colors": ["MediumSeaGreen", "Tomato", "SkyBlue"],
    "line": {"color": "Black", "width": 1}
}

labels = ["In Favor", "Opposed", "Other"]

# Names of the votes.
votings = [
    "handicapped-infants",
    "water-project-cost-sharing",
    "adoption-of-the-budget-resolution",
    "physician-fee-freeze",
    "el-salvador-aid",
    "religious-groups-in-schools",
    "anti-satellite-test-ban",
    "aid-to-nicaraguan-contras",
    "mx-missile",
    "immigration",
    "synfuels-corporation-cutback",
    "education-spending",
    "superfund-right-to-sue",
    "crime",
    "duty-free-exports",
    "export-administration-act-south-africa"
]

In [44]:
# piecharts
fig = make_subplots(
    rows=1,
    cols=3,
    specs=[[{"type": "pie"}] * 3],
    subplot_titles=["All", "Democrat", "Republican"]
)

# One entry per subplot column:
# (trace name, legend group title, (in favor, opposed, missing), extra Pie options)
groups = [
    ("All", "All Congressmen", (all_inf, all_opp, all_nan), {"pull": .075}),
    ("Democrat", "Democrat", (dem_inf, dem_opp, dem_nan), {"hole": 0.25}),
    ("Republican", "Republican", (rep_inf, rep_opp, rep_nan), {"hole": 0.25}),
]
n_groups = len(groups)

# Traces are added vote-major: each consecutive run of n_groups traces
# (All, Democrat, Republican) belongs to one vote. The count Series are
# indexed by the vote column labels 1..n_votes.
for i in range(1, n_votes+1):
    for col, (name, legend_title, (inf, opp, nan), extra) in enumerate(groups, start=1):
        fig.add_trace(
            go.Pie(
                name=name,
                visible=False,
                labels=labels,
                values=[inf[i], opp[i], nan[i]],
                sort=False,
                legendgroup=col,
                legendgrouptitle_text=legend_title,
                textinfo="label+percent+value",
                marker=marker,
                **extra
            ),
            row = 1, col = col
        )

# Show the first vote initially, matching the slider's active step.
for i in range(n_groups):
    fig.data[i].visible = True

# One slider step per vote; selecting it shows only that vote's three pies.
steps = []
for k in range(len(fig.data) // n_groups):
    visible = [False] * len(fig.data)
    first = k * n_groups
    # The slice width equals the number of values assigned, so the list keeps
    # exactly one entry per trace.
    visible[first:first + n_groups] = [True] * n_groups
    steps.append({
        "method": "restyle",
        "args": [
            {"visible": visible},
        ],
        "label": f"{k + 1}: {votings[k]}"
    })

sliders = [{
    "active": 0,
    "currentvalue": {"prefix": "Voting "},
    "steps": steps
}]

fig.update_layout(
    sliders=sliders,
    title="Congressional Voting Records 1. Votes Distribution",
    height=600,
    width=1700,
    annotations=[{"y": .9, "x": 0}, {"y": .9, "x": .35}, {"y": .9, "x": .7}]
)

fig.show()

In [9]:
# missing values imputation

def imputation(inf, opp, nan, alpha=.25):
    """Return the share of votes in favor, counting each missing vote as `alpha` of a yes.

    Args:
        inf: number of votes in favor.
        opp: number of votes opposed.
        nan: number of missing votes.
        alpha: weight given to each missing vote in the numerator.

    Returns:
        (inf + alpha * nan) / (inf + nan + opp)
    """
    total = inf + nan + opp
    weighted_in_favor = inf + alpha*nan
    return weighted_in_favor / total

# Per-vote fill values for each party, indexed by the vote column labels.
dem_inp = imputation(dem_inf, dem_opp, dem_nan)
rep_inp = imputation(rep_inf, rep_opp, rep_nan)

# Fill each party's missing votes with that party's per-vote estimate.
# `fillna` with a Series fills column by column, matching on column label,
# so no explicit loop over vote columns is needed. This also avoids the
# `np.NaN` alias, which was removed in NumPy 2.0.
for party, fill_values in (("democrat", dem_inp), ("republican", rep_inp)):
    members = affiliation == party
    voting.loc[members] = voting.loc[members].fillna(fill_values)

In [45]:
# dimensionality reduction for visualizations

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# A fixed random_state makes the t-SNE embedding, and therefore the figure,
# reproducible across runs.
voting_tsne = TSNE(n_components=2, random_state=42).fit_transform(voting)
voting_tsne = pd.DataFrame(voting_tsne, columns=["TSNE1", "TSNE2"])

voting_pca = PCA(n_components=2, random_state=42).fit_transform(voting)
voting_pca = pd.DataFrame(voting_pca, columns=["PCA1", "PCA2"])

# Not referenced by the traces below (they set their marker colors directly).
# Kept because it rebinds the notebook-level `colors` name.
colors = ["CornflowerBlue", "Crimson"]

fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{"type": "scatter"}] * 2],
    subplot_titles=["TSNE", "PCA"])

# (affiliation value, legend label, marker color) for each party
parties = [
    ("democrat", "Democrat", "CornflowerBlue"),
    ("republican", "Republican", "IndianRed"),
]

# One subplot per embedding: a density contour of all points, overlaid with
# one scatter trace per party. The subplot column doubles as the legend group.
for col, (method, coords) in enumerate([("TSNE", voting_tsne), ("PCA", voting_pca)], start=1):
    x_name, y_name = f"{method}1", f"{method}2"

    fig.add_trace(
        go.Histogram2dContour(
            x=coords[x_name],
            y=coords[y_name],
            colorscale="Emrld",
            showscale=False
        ),
        row = 1, col = col
    )

    for party, label, color in parties:
        members = affiliation == party
        fig.add_trace(
            go.Scatter(
                x=coords[x_name][members],
                y=coords[y_name][members],
                mode="markers",
                legendgroup=col,
                legendgrouptitle_text=method,
                marker_color=color,
                name=label
            ),
            row = 1, col = col
        )

fig.update_layout(
    title="Congressional Voting Records 2. TSNE and PCA visualizations",
    height=600,
    width=1700
)

fig.show()

In [48]:
# anova

from sklearn.feature_selection import f_classif
from sklearn.feature_selection import chi2

fig = make_subplots(
    rows=2,
    cols=2,
    specs=[[{"type": "bar"}] * 2] *2 ,
    subplot_titles=["ANOVA", "p-values", "Chi-sq", "p-values"]
)

# Both tests return a (statistics, p-values) pair with one entry per vote.
lr = f_classif(voting, affiliation)

# chi-sq test
chi2_ = chi2(voting, affiliation)

# (row, col, trace name, values, bar color) for each panel. Statistics go in
# the left column and p-values in the right. Each trace name states which
# test and which quantity the bars show.
panels = [
    (1, 1, "ANOVA F-statistic", lr[0], "CornflowerBlue"),
    (1, 2, "ANOVA p-value", lr[1], "IndianRed"),
    (2, 1, "Chi-sq statistic", chi2_[0], "CornflowerBlue"),
    (2, 2, "Chi-sq p-value", chi2_[1], "IndianRed"),
]

for row, col, name, values, color in panels:
    fig.add_trace(
        go.Bar(
            showlegend=False,
            name=name,
            marker_color=color,
            x=votings,
            y=values
        ), row=row, col=col
    )

fig.update_layout(
    title="Congressional Voting Records 3. ANOVA and Chi-sq tests",
    height=850,
    width=1700
)
fig.show()