## Analysis

Now that we've labeled our data, let's check our results and make some conclusions from it

In [49]:
import os

import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

Let's load our data

In [5]:
current_wd = os.getcwd()

In [11]:
gpt4_classifications = pd.read_csv(
    "../manuscript_pilot/representative_diversification_feed.csv"
)
llama3_8b_classifications = pd.read_csv(
    "classified_posts_llama3_8b.csv"
)
llama3_70b_classifications = pd.read_csv(
    "classified_posts_llama3_70b.csv"
)
ground_truth_labels = pd.read_csv(
    "../manuscript_pilot/hand_labeled_pilot_posts.csv"
)


#### Cleaning and preprocessing
Now let's do some cleaning and preprocessing so everything is in the same format

##### GPT4

In [22]:
gpt_subset = gpt4_classifications[["link", "civic", "political_ideology"]]

In [None]:
gpt_subset["civic"] = gpt_subset["civic"].replace(True, "civic")
gpt_subset["civic"] = gpt_subset["civic"].replace(False, "not civic")
gpt_subset["political_ideology"] = gpt_subset["political_ideology"].replace(
    " left-leaning", "left-leaning"
)
gpt_subset = gpt_subset.rename(
    columns={
        "civic": "gpt4_civic_label",
        "political_ideology": "gpt4_political_label"
    }
)

In [30]:
gpt_subset.columns

Index(['link', 'gpt4_civic_label', 'gpt4_political_label'], dtype='object')

##### Llama models

Let's only include the ones that have valid JSON responses, since these are the only ones that we could label.

In [26]:
llama3_8b_subset = llama3_8b_classifications[
    llama3_8b_classifications["valid_json_response"] == True
]
llama3_8b_subset = llama3_8b_subset[
    ["link", "civic_label", "political_label"]
]
llama3_8b_subset = llama3_8b_subset.rename(
    columns={
        "civic_label": "llama3-8b_civic_label",
        "political_label": "llama3-8b_political_label"
    }
)

In [28]:
llama3_8b_subset.columns

Index(['link', 'llama3-8b_civic_label', 'llama3-8b_political_label'], dtype='object')

In [27]:
llama3_70b_subset = llama3_70b_classifications[
    llama3_70b_classifications["valid_json_response"] == True
]
llama3_70b_subset = llama3_70b_subset[
    ["link", "civic_label", "political_label"]
]
llama3_70b_subset = llama3_70b_subset.rename(
    columns={
        "civic_label": "llama3-70b_civic_label",
        "political_label": "llama3-70b_political_label"
    }
)

In [29]:
llama3_70b_subset.columns

Index(['link', 'llama3-70b_civic_label', 'llama3-70b_political_label'], dtype='object')

##### Ground-truth labels

Let's remove any NAs and then do processing like the others

In [32]:
ground_truth_labels = ground_truth_labels[
    ~pd.isna(ground_truth_labels["civic_hand_label"])
]

In [35]:
ground_truth_labels_subset = ground_truth_labels[
    ["link", "civic_hand_label", "political_ideology_hand_label"]
]

#### Now let's get some basic counts and statistics

GPT4:

In [20]:
print(gpt_subset["gpt4_civic_label"].value_counts())

civic
not civic    187
civic        174
Name: count, dtype: int64


In [37]:
print(
    gpt_subset[
        gpt_subset["gpt4_civic_label"] == "civic"
    ]["gpt4_political_label"].value_counts()
)

gpt4_political_label
left-leaning     140
unclear           18
right-leaning     15
moderate           1
Name: count, dtype: int64


Llama3-8b

In [38]:
print(llama3_8b_subset["llama3-8b_civic_label"].value_counts())

llama3-8b_civic_label
civic        199
not civic    148
Name: count, dtype: int64


In [39]:
print(
    llama3_8b_subset[
        llama3_8b_subset["llama3-8b_civic_label"] == "civic"
    ]["llama3-8b_political_label"].value_counts()
)

llama3-8b_political_label
left-leaning     130
right-leaning     34
unclear           32
moderate           3
Name: count, dtype: int64


Llama3-70b

In [40]:
print(llama3_70b_subset["llama3-70b_civic_label"].value_counts())

llama3-70b_civic_label
civic        200
not civic    147
Name: count, dtype: int64


In [41]:
print(
    llama3_70b_subset[
        llama3_70b_subset["llama3-70b_civic_label"] == "civic"
    ]["llama3-70b_political_label"].value_counts()
)

llama3-70b_political_label
left-leaning     173
right-leaning     11
unclear            9
moderate           7
Name: count, dtype: int64


Ground truth

In [42]:
print(ground_truth_labels_subset["civic_hand_label"].value_counts())

civic_hand_label
civic        193
not civic    161
Name: count, dtype: int64


In [43]:
print(
    ground_truth_labels_subset[
        ground_truth_labels_subset["civic_hand_label"] == "civic"
    ]["political_ideology_hand_label"].value_counts()
)

political_ideology_hand_label
left-leaning     160
unclear           20
right-leaning      7
moderate           5
Name: count, dtype: int64


### Analyze the data together

Let's join the data together to get a joined version of the labels and the ground truth

In [44]:
joined_df = pd.merge(
    gpt_subset, llama3_8b_subset, on="link"
)
joined_df = pd.merge(
    joined_df, llama3_70b_subset, on="link"
)
joined_df = pd.merge(
    joined_df, ground_truth_labels_subset, on="link"
)

In [45]:
joined_df.shape

(333, 9)

In [46]:
joined_df.to_csv("posts_with_ground_truth_and_llm_labels.csv")

Let's take a look at the data

In [47]:
joined_df.head()

Unnamed: 0,link,gpt4_civic_label,gpt4_political_label,llama3-8b_civic_label,llama3-8b_political_label,llama3-70b_civic_label,llama3-70b_political_label,civic_hand_label,political_ideology_hand_label
0,https://bsky.app/profile/jbouie.bsky.social/po...,civic,left-leaning,civic,left-leaning,civic,left-leaning,civic,left-leaning
1,https://bsky.app/profile/lethalityjane.bsky.so...,civic,right-leaning,civic,right-leaning,civic,unclear,civic,unclear
2,https://bsky.app/profile/esqueer.bsky.social/p...,civic,left-leaning,civic,left-leaning,civic,left-leaning,civic,left-leaning
3,https://bsky.app/profile/stuflemingnz.bsky.soc...,not civic,,not civic,unclear,not civic,unclear,not civic,
4,https://bsky.app/profile/sararoseg.bsky.social...,civic,left-leaning,civic,left-leaning,civic,left-leaning,civic,left-leaning


Let's make sure that there are no NaN values in the civic columns ("gpt4_civic_label", "llama3-8b_civic_label", "llama3-70b_civic_label", "civic_hand_label"). Let's not impute for now, let's just get the counts. 

Then, Let's also impute any NaNs in the "gpt4_political_label", "political_ideology_hand_label", "llama3-8b_political_label", and "llama3-70b_political_label" columns with "unclear".

In [60]:
civic_columns = ["gpt4_civic_label", "llama3-8b_civic_label", "llama3-70b_civic_label", "civic_hand_label"]
nan_counts_civic = joined_df[civic_columns].isna().sum()
print("NaN counts in civic columns:")
print(nan_counts_civic)


NaN counts in civic columns:
gpt4_civic_label          0
llama3-8b_civic_label     0
llama3-70b_civic_label    0
civic_hand_label          0
dtype: int64


In [61]:
political_columns = ["gpt4_political_label", "political_ideology_hand_label", "llama3-8b_political_label", "llama3-70b_political_label"]
joined_df[political_columns] = joined_df[political_columns].fillna("unclear")

# Verify the imputation
print("Verification after imputation:")
print(joined_df[political_columns].isna().sum())


Verification after imputation:
gpt4_political_label             0
political_ideology_hand_label    0
llama3-8b_political_label        0
llama3-70b_political_label       0
dtype: int64


Now that we've taken care of the last preprocessing, let's continue

#### Metrics

In [53]:
total_values = joined_df.shape[0]

##### GPT4

In [50]:
gpt4_civic_metrics = precision_recall_fscore_support(
    y_true=joined_df["civic_hand_label"].tolist(),
    y_pred=joined_df["gpt4_civic_label"].tolist(),
    average="binary",
    pos_label="civic"
)

In [51]:
(
    gpt4_civic_precision,
    gpt4_civic_recall,
    gpt4_civic_fbeta_score,
    gpt4_civic_support
) = gpt4_civic_metrics


In [52]:
print(f"Precision: {gpt4_civic_precision}\tRecall: {gpt4_civic_recall}\tF-1 score: {gpt4_civic_fbeta_score}\tSupport: {gpt4_civic_support}")

Precision: 0.9426751592356688	Recall: 0.8222222222222222	F-1 score: 0.8783382789317508	Support: None


In [54]:
confusion_matrix = pd.crosstab(
    joined_df["civic_hand_label"].tolist(),
    joined_df["gpt4_civic_label"].tolist()
)

In [55]:
acc = (
    (confusion_matrix.values[0][0] + confusion_matrix.values[1][1]) 
    / total_values
)

In [56]:
print(f"Accuracy: {acc}")

Accuracy: 0.8768768768768769


Political ideology classification

In [None]:
# TODO

##### Llama3-8b

In [57]:
llama3_8b_civic_metrics = precision_recall_fscore_support(
    y_true=joined_df["civic_hand_label"].tolist(),
    y_pred=joined_df["llama3-8b_civic_label"].tolist(),
    average="binary",
    pos_label="civic"
)

In [68]:
pd.crosstab(
    joined_df["civic_hand_label"],
    joined_df["llama3-8b_civic_label"]
)

llama3-8b_civic_label,civic,not civic
civic_hand_label,Unnamed: 1_level_1,Unnamed: 2_level_1
civic,114,66
not civic,74,79


In [58]:
(
    llama3_8b_civic_precision,
    llama3_8b_civic_recall,
    llama3_8b_civic_fbeta_score,
    llama3_8b_civic_support
) = llama3_8b_civic_metrics


In [59]:
print(f"Precision: {llama3_8b_civic_precision}\tRecall: {llama3_8b_civic_recall}\tF-1 score: {llama3_8b_civic_fbeta_score}\tSupport: {llama3_8b_civic_support}")

Precision: 0.6063829787234043	Recall: 0.6333333333333333	F-1 score: 0.6195652173913043	Support: None


##### Llama3-70b

In [62]:
llama3_70b_civic_metrics = precision_recall_fscore_support(
    y_true=joined_df["civic_hand_label"].tolist(),
    y_pred=joined_df["llama3-70b_civic_label"].tolist(),
    average="binary",
    pos_label="civic"
)

In [65]:
joined_df["civic_hand_label"].value_counts()

civic_hand_label
civic        180
not civic    153
Name: count, dtype: int64

In [66]:
joined_df["llama3-70b_civic_label"].value_counts()

llama3-70b_civic_label
civic        189
not civic    144
Name: count, dtype: int64

In [67]:
pd.crosstab(
    joined_df["civic_hand_label"],
    joined_df["llama3-70b_civic_label"]
)

llama3-70b_civic_label,civic,not civic
civic_hand_label,Unnamed: 1_level_1,Unnamed: 2_level_1
civic,112,68
not civic,77,76


In [63]:
(
    llama3_70b_civic_precision,
    llama3_70b_civic_recall,
    llama3_70b_civic_fbeta_score,
    llama3_70b_civic_support
) = llama3_70b_civic_metrics


In [64]:
print(f"Precision: {llama3_70b_civic_precision}\tRecall: {llama3_70b_civic_recall}\tF-1 score: {llama3_70b_civic_fbeta_score}\tSupport: {llama3_70b_civic_support}")

Precision: 0.5925925925925926	Recall: 0.6222222222222222	F-1 score: 0.6070460704607046	Support: None
