## Analysis

Now that we've labeled our data, let's check the results and draw some conclusions from them.

In [2]:
import os

import pandas as pd

Let's load our data: the GPT-4, Llama3-8b and Llama3-70b classifications, plus the hand-labeled ground truth.

In [5]:
# Capture the directory the kernel is running in; the relative CSV paths
# passed to pd.read_csv in this notebook are resolved against it.
current_wd = os.getcwd()

In [11]:
# Model outputs: the GPT-4 labels live in the manuscript pilot folder, the two
# Llama3 runs are read from the current working directory.
gpt4_classifications = pd.read_csv(
    "../manuscript_pilot/representative_diversification_feed.csv"
)
llama3_8b_classifications = pd.read_csv("classified_posts_llama3_8b.csv")
llama3_70b_classifications = pd.read_csv("classified_posts_llama3_70b.csv")

# Hand-applied labels used as the ground truth for the comparison.
ground_truth_labels = pd.read_csv(
    "../manuscript_pilot/hand_labeled_pilot_posts.csv"
)


#### Cleaning and preprocessing
Now let's do some cleaning and preprocessing so everything is in the same format

##### GPT4

In [22]:
# Keep only the join key and the two GPT-4 label columns. The explicit
# .copy() makes gpt_subset an independent frame, so later column edits on it
# cannot raise SettingWithCopyWarning or write through to
# gpt4_classifications.
gpt_subset = gpt4_classifications[
    ["link", "civic", "political_ideology"]
].copy()

In [None]:
# Bring the GPT-4 labels into the same string vocabulary as the other sources
# and give the columns model-specific names. .assign() builds a new frame
# rather than writing into what may be a slice of gpt4_classifications, which
# avoids SettingWithCopyWarning from column assignment.
gpt_subset = gpt_subset.assign(
    # Boolean civic flag -> "civic" / "not civic".
    civic=(
        gpt_subset["civic"]
        .replace(True, "civic")
        .replace(False, "not civic")
    ),
    # Remove the stray leading space found in " left-leaning" values.
    political_ideology=gpt_subset["political_ideology"].replace(
        " left-leaning", "left-leaning"
    ),
).rename(
    columns={
        "civic": "gpt4_civic_label",
        "political_ideology": "gpt4_political_label",
    }
)

In [30]:
# Display the column index to confirm the GPT-4 label columns were renamed.
gpt_subset.columns

Index(['link', 'gpt4_civic_label', 'gpt4_political_label'], dtype='object')

##### Llama models

Let's only include the ones that have valid JSON responses, since these are the only ones that we could label.

In [26]:
# Keep rows flagged as valid JSON, then reduce to the join key plus the two
# label columns and prefix those with the model name.
# .eq(True) is the method form of `== True`: entries that are not equal to
# True (including missing values) are excluded.
llama3_8b_has_valid_json = llama3_8b_classifications[
    "valid_json_response"
].eq(True)
llama3_8b_subset = (
    llama3_8b_classifications
    .loc[llama3_8b_has_valid_json, ["link", "civic_label", "political_label"]]
    .rename(
        columns={
            "civic_label": "llama3-8b_civic_label",
            "political_label": "llama3-8b_political_label",
        }
    )
)

In [28]:
# Display the column index to confirm the Llama3-8b label columns were renamed.
llama3_8b_subset.columns

Index(['link', 'llama3-8b_civic_label', 'llama3-8b_political_label'], dtype='object')

In [27]:
# Keep rows flagged as valid JSON, then reduce to the join key plus the two
# label columns and prefix those with the model name.
# .eq(True) is the method form of `== True`: entries that are not equal to
# True (including missing values) are excluded.
llama3_70b_has_valid_json = llama3_70b_classifications[
    "valid_json_response"
].eq(True)
llama3_70b_subset = (
    llama3_70b_classifications
    .loc[llama3_70b_has_valid_json, ["link", "civic_label", "political_label"]]
    .rename(
        columns={
            "civic_label": "llama3-70b_civic_label",
            "political_label": "llama3-70b_political_label",
        }
    )
)

In [29]:
# Display the column index to confirm the Llama3-70b label columns were renamed.
llama3_70b_subset.columns

Index(['link', 'llama3-70b_civic_label', 'llama3-70b_political_label'], dtype='object')

##### Ground-truth labels

Let's remove the rows whose civic hand label is missing (NA), then select the link and label columns as we did for the models.

In [32]:
# Discard rows whose civic hand label is missing.
ground_truth_labels = ground_truth_labels.dropna(subset=["civic_hand_label"])

In [35]:
# Join key plus the two hand-label columns; every other column is dropped.
hand_label_columns = [
    "link",
    "civic_hand_label",
    "political_ideology_hand_label",
]
ground_truth_labels_subset = ground_truth_labels.loc[:, hand_label_columns]

#### Now let's get some basic counts and statistics

GPT4:

In [20]:
# Number of rows GPT-4 assigned to each civic class.
gpt4_civic_counts = gpt_subset["gpt4_civic_label"].value_counts()
print(gpt4_civic_counts)

civic
not civic    187
civic        174
Name: count, dtype: int64


In [37]:
# Political-label breakdown, restricted to rows GPT-4 labelled "civic".
gpt4_is_civic = gpt_subset["gpt4_civic_label"] == "civic"
gpt4_political_counts = gpt_subset.loc[
    gpt4_is_civic, "gpt4_political_label"
].value_counts()
print(gpt4_political_counts)

gpt4_political_label
left-leaning     140
unclear           18
right-leaning     15
moderate           1
Name: count, dtype: int64


Llama3-8b

In [38]:
# Number of rows Llama3-8b assigned to each civic class.
llama3_8b_civic_counts = llama3_8b_subset["llama3-8b_civic_label"].value_counts()
print(llama3_8b_civic_counts)

llama3-8b_civic_label
civic        199
not civic    148
Name: count, dtype: int64


In [39]:
# Political-label breakdown, restricted to rows Llama3-8b labelled "civic".
llama3_8b_is_civic = llama3_8b_subset["llama3-8b_civic_label"] == "civic"
llama3_8b_political_counts = llama3_8b_subset.loc[
    llama3_8b_is_civic, "llama3-8b_political_label"
].value_counts()
print(llama3_8b_political_counts)

llama3-8b_political_label
left-leaning     130
right-leaning     34
unclear           32
moderate           3
Name: count, dtype: int64


Llama3-70b

In [40]:
# Number of rows Llama3-70b assigned to each civic class.
llama3_70b_civic_counts = llama3_70b_subset[
    "llama3-70b_civic_label"
].value_counts()
print(llama3_70b_civic_counts)

llama3-70b_civic_label
civic        200
not civic    147
Name: count, dtype: int64


In [41]:
# Political-label breakdown, restricted to rows Llama3-70b labelled "civic".
llama3_70b_is_civic = llama3_70b_subset["llama3-70b_civic_label"] == "civic"
llama3_70b_political_counts = llama3_70b_subset.loc[
    llama3_70b_is_civic, "llama3-70b_political_label"
].value_counts()
print(llama3_70b_political_counts)

llama3-70b_political_label
left-leaning     173
right-leaning     11
unclear            9
moderate           7
Name: count, dtype: int64


Ground truth

In [42]:
# Number of hand-labelled rows in each civic class.
ground_truth_civic_counts = ground_truth_labels_subset[
    "civic_hand_label"
].value_counts()
print(ground_truth_civic_counts)

civic_hand_label
civic        193
not civic    161
Name: count, dtype: int64


In [43]:
# Political-label breakdown, restricted to rows hand-labelled "civic".
ground_truth_is_civic = ground_truth_labels_subset["civic_hand_label"] == "civic"
ground_truth_political_counts = ground_truth_labels_subset.loc[
    ground_truth_is_civic, "political_ideology_hand_label"
].value_counts()
print(ground_truth_political_counts)

political_ideology_hand_label
left-leaning     160
unclear           20
right-leaning      7
moderate           5
Name: count, dtype: int64


### Analyze the data together

Let's join the model labels and the ground-truth labels on `link`, so that each row holds every label for the same link. Only links present in all four sources are kept.

In [44]:
# Combine every label source on the shared "link" key. merge() defaults to
# an inner join, so only links present in all four frames are retained.
joined_df = (
    gpt_subset
    .merge(llama3_8b_subset, on="link")
    .merge(llama3_70b_subset, on="link")
    .merge(ground_truth_labels_subset, on="link")
)

In [45]:
# (rows, columns) of the joined frame, as a quick check on the merge result.
joined_df.shape

(333, 9)