<a href="https://colab.research.google.com/github/KekaiApana/datasci112_final_project/blob/main/DATASCI_112_Supreme_Court_Data_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **DATASCI 112 Final Project: Supreme Court Data Exploration**
### *By: Kekai and Adrian*

This project explores the Cornell Supreme Court Oral Arguments Corpus (https://convokit.cornell.edu/documentation/supreme.html), which covers cases from 1955 to 2019.

**Research question: Can we predict how Chief Justice Roberts will vote on a case based on their linguistic patterns in oral arguments?**

In this file, we analyze the linguistic patterns of Chief Justice Roberts and other Justices during oral arguments to explore whether these patterns correlate with their voting behavior. Specifically, we:

Visualize Voting Similarity: Create a heatmap to show how often each pair of Justices votes the same way, highlighting patterns of agreement or disagreement.

Analyze Linguistic Patterns: Use text analysis techniques to identify the most frequently used words by each Justice during oral arguments.

Correlate Words and Voting: Explore the relationship between the words used by Justices and their voting behavior, using scatter plots and bar charts.

Heatmap: Shows voting similarity between each pair of Justices.

Bar Chart: Displays top words for each Justice and their voting similarity.






In [None]:
pip install -U kaleido



In [None]:
import pandas as pd
# Load the project dataset from the working directory. Later cells read the
# 'text' and 'speaker' columns and every column whose name starts with
# 'votes_side.j__' from this frame.
df = pd.read_csv('scotus_roberts_data.csv')

In [None]:
import plotly.express as px

# Find the per-Justice vote columns and derive each Justice's name from the
# part of the column name after the last '__'.
vote_columns = [col for col in df.columns if col.startswith('votes_side.j__')]
justices = [col.split('__')[-1] for col in vote_columns]

# Square matrix holding, for every pair of Justices, the FRACTION of rows in
# which the two recorded the same vote.  It is created as float (not object)
# so that plotting and any later arithmetic work on numeric data.
vote_matrix = pd.DataFrame(index=justices, columns=justices, dtype=float)

for i, justice1 in enumerate(justices):
    votes1 = df[f'votes_side.j__{justice1}']
    for j, justice2 in enumerate(justices):
        if i == j:
            vote_matrix.loc[justice1, justice2] = 1.0  # A Justice always votes with themselves
            continue

        votes2 = df[f'votes_side.j__{justice2}']

        # Only compare rows where BOTH Justices have a recorded vote.  A plain
        # `votes1 == votes2` treats a missing value (NaN) as a disagreement,
        # which drags the similarity of Justices with few shared rows towards 0.
        both_voted = votes1.notna() & votes2.notna()
        if both_voted.any():
            agreement = votes1[both_voted] == votes2[both_voted]
            vote_matrix.loc[justice1, justice2] = agreement.mean()
        # Otherwise the pair has no rows in common: leave the cell as NaN
        # ("undefined") instead of reporting a misleading 0.

# Convert the matrix to a long format (one row per Justice pair)
vote_matrix_long = vote_matrix.stack().reset_index()
vote_matrix_long.columns = ['Justice1', 'Justice2', 'Vote Similarity']

# Create a heatmap of the similarity matrix
fig = px.imshow(vote_matrix,
                labels=dict(x="Justice", y="Justice", color="Vote Similarity"),
                x=justices, y=justices,
                title="Voting Similarity Among Justices in the Roberts Court")

fig.update_layout(
        width = 800,
        height = 800,
        title_font=dict(size=24),
        xaxis=dict(
            title_font=dict(size=24),
            tickfont=dict(size=24)
        ),
        yaxis=dict(
            title_font=dict(size=24),
            tickfont=dict(size=24)
        )
    )


# Save the figure as a PNG file
fig.write_image("voting_similarity_heatmap.png")

fig.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Extract the text and speaker columns
texts = df['text']
speakers = df['speaker']

# Count word occurrences per row, keeping only the 100 most frequent
# non-stop-word terms across the whole dataset.
vectorizer = CountVectorizer(stop_words='english', max_features=100)
X = vectorizer.fit_transform(texts)

# Convert the sparse matrix to a DataFrame (one column per term)
word_freq_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
word_freq_df['speaker'] = speakers

# Group by speaker and sum the word counts
speaker_word_freq = word_freq_df.groupby('speaker').sum()

# Normalize each speaker's counts by that speaker's total over the retained
# terms, so every row holds relative frequencies.
speaker_word_freq = speaker_word_freq.div(speaker_word_freq.sum(axis=1), axis=0)

# For simplicity, keep just the top 10 words for each speaker
top_words_per_justice = speaker_word_freq.apply(lambda x: x.nlargest(10).index.tolist(), axis=1)

# Create and save one bar chart per speaker.  The counter comes from
# enumerate so every chart gets its own file name; a counter that is never
# advanced would make each chart overwrite the previous PNG.
for i, justice in enumerate(top_words_per_justice.index, start=1):
    fig = px.bar(speaker_word_freq.loc[justice].nlargest(10),
                 title=f"Top 10 Words for {justice}",
                 labels={
            'index': 'Words',
            'value': 'Frequency'
        })

    fig.update_layout(
        width = 800,
        height = 800,
        title_font=dict(size=24),
        xaxis=dict(
            title_font=dict(size=24),
            tickfont=dict(size=24)
        ),
        yaxis=dict(
            title_font=dict(size=24),
            tickfont=dict(size=24)
        )
    )
    fig.write_image(f"top_words_justice{i}.png")
    fig.show()

In [None]:
# Print each speaker's top words followed by that Justice's row of the
# voting-similarity matrix, so the two can be read side by side.
for justice, top_words in top_words_per_justice.items():
    # Speaker ids carry a prefix ending in '__'; vote_matrix is keyed by the
    # part after it, so keep only the last '__'-separated piece.
    justice_name = justice.split('__')[-1]

    print(f"Top words for {justice}: {top_words}")
    similarity_row = vote_matrix.loc[justice_name]
    print(f"Voting similarity with other Justices: {similarity_row}")

Top words for j__anthony_m_kennedy: ['case', 'just', 'say', 'think', 'court', 'question', 'don', 'state', 'justice', 'suppose']
Voting similarity with other Justices: john_paul_stevens      0.235552
sandra_day_oconnor     0.019441
antonin_scalia         0.547067
anthony_m_kennedy             1
david_h_souter         0.196304
clarence_thomas        0.652057
ruth_bader_ginsburg    0.611045
stephen_g_breyer       0.638595
john_g_roberts_jr      0.723548
samuel_a_alito_jr      0.670593
sonia_sotomayor        0.413836
elena_kagan            0.356783
neil_gorsuch           0.063963
brett_m_kavanaugh           0.0
Name: anthony_m_kennedy, dtype: object
Top words for j__antonin_scalia: ['don', 'say', 'know', 'mean', 'right', 'think', 'just', 'case', 'court', 'state']
Voting similarity with other Justices: john_paul_stevens      0.201029
sandra_day_oconnor     0.020998
antonin_scalia                1
anthony_m_kennedy      0.547067
david_h_souter         0.171002
clarence_thomas        0.620504

In [None]:
from itertools import combinations

import networkx as nx
import plotly.graph_objects as go

# Step 1: Prepare the data
# Build a graph whose nodes are Justices and whose edges connect two Justices
# that share at least one top word; the edge weight is the number of shared words.
G = nx.Graph()

# Pre-compute the display name and top-word set of every speaker once, rather
# than re-splitting the id and rebuilding the sets for every pair.
speaker_ids = list(top_words_per_justice.index)
short_names = {speaker: speaker.split('__')[-1] for speaker in speaker_ids}
top_word_sets = {speaker: set(top_words_per_justice[speaker]) for speaker in speaker_ids}

# Add nodes (Justices)
G.add_nodes_from(short_names[speaker] for speaker in speaker_ids)

# Add edges based on shared words.  The graph is undirected, so each unordered
# pair only needs to be visited once.
for speaker1, speaker2 in combinations(speaker_ids, 2):
    justice1_name = short_names[speaker1]
    justice2_name = short_names[speaker2]
    if justice1_name == justice2_name:
        continue
    shared_words = top_word_sets[speaker1] & top_word_sets[speaker2]
    if shared_words:
        # Add edge with weight equal to the number of shared words
        G.add_edge(justice1_name, justice2_name, weight=len(shared_words))

# Step 2: Create a network graph
# A fixed seed makes the force-directed layout (and therefore the saved PNG)
# identical from run to run.
pos = nx.spring_layout(G, seed=42)

# One line trace per edge, with thickness scaled by the edge weight
edge_trace = []
for node_a, node_b, weight in G.edges(data='weight'):
    x0, y0 = pos[node_a]
    x1, y1 = pos[node_b]
    edge_trace.append(go.Scatter(
        x=[x0, x1],
        y=[y0, y1],
        mode='lines',
        line=dict(width=weight * 0.5, color='blue'),  # Thicker line for higher weight
        opacity=0.3,  # Keep overlapping edges readable
        hoverinfo='text',
        text=f"Shared Words: {weight}",  # Display number of shared words on hover
        name=f"{node_a} - {node_b}"
    ))

# Single trace holding every node marker and its label
node_names = list(G.nodes())
node_trace = go.Scatter(
    x=[pos[node][0] for node in node_names],
    y=[pos[node][1] for node in node_names],
    mode='markers+text',
    text=node_names,
    marker=dict(
        size=20,
        color='lightblue',  # Node color
        line=dict(width=2, color='lightblue')  # Node border
    ),
    textposition="top center",
    hoverinfo='text',
    textfont=dict(
        size=24,
        color='black',
        family='Arial Black'  # Use a bold font family
    )
)

# Create the figure: edges first so the nodes are drawn on top of them
fig = go.Figure(data=edge_trace + [node_trace],
                layout=go.Layout(
                    title="Network Graph: Justices and Shared Words",
                    title_x=0.5,
                    title_font=dict(size=24),
                    showlegend=False,
                    hovermode='closest',
                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    margin=dict(l=40, r=40, b=40, t=40),
                    paper_bgcolor='rgba(0,0,0,0)',
                    plot_bgcolor='rgba(0,0,0,0)'
                ))

fig.update_layout(
        width = 800,
        height = 800
    )

fig.write_image("network graph.png")

fig.show()