In [1]:
import pandas as pd
from datasets import load_dataset

In [2]:
# Load the 'mlburnham/Pol_NLI' dataset; later cells index `ds` by split name
# ('train', 'validation', 'test').
ds = load_dataset('mlburnham/Pol_NLI')
# Test split as a pandas DataFrame (not referenced again in the cells visible here).
test = ds['test'].to_pandas()

In [14]:
from datasets import load_dataset
import pandas as pd
# Uses `ds`, the 'mlburnham/Pol_NLI' dataset loaded in an earlier cell

# Function to calculate descriptive statistics
def describe_dataset(split, dataset=None):
    """Compute descriptive statistics for one split of the dataset.

    Parameters
    ----------
    split : str
        Key of the split to summarise (e.g. 'train', 'validation', 'test').
    dataset : mapping, optional
        Object indexed by split name whose values expose ``to_pandas()``.
        Defaults to the notebook-level ``ds``.

    Returns
    -------
    dict
        ``unique_datasets``, ``unique_premises``, ``unique_hypotheses`` and
        ``unique_augmented_hypotheses`` are distinct-value counts of the
        corresponding columns; ``avg_premise_word_count`` is the mean number
        of whitespace-separated tokens per premise; ``entailment_counts``
        maps each entailment label to its number of rows.
    """
    source = ds if dataset is None else dataset
    df = source[split].to_pandas()  # Convert the split to a pandas DataFrame

    # Vectorised whitespace split; a missing premise yields NaN here and is
    # skipped by mean() instead of raising on `.split()`.
    premise_word_counts = df['premise'].str.split().str.len()

    return {
        'unique_datasets': df['dataset'].nunique(),
        'unique_premises': df['premise'].nunique(),
        'unique_hypotheses': df['hypothesis'].nunique(),
        'unique_augmented_hypotheses': df['augmented_hypothesis'].nunique(),
        'avg_premise_word_count': premise_word_counts.mean(),
        'entailment_counts': df['entailment'].value_counts().to_dict(),
    }
# Summarise every split with describe_dataset, keyed by split name
splits = ['train', 'validation', 'test']
results = {name: describe_dataset(name) for name in splits}

# Stack all splits into one frame and count the distinct source datasets overall
all_datasets = pd.concat([ds[name].to_pandas() for name in splits])
total_unique_datasets = all_datasets['dataset'].nunique()

# Report: one indented section per split, each followed by a blank line
for split, stats in results.items():
    report_lines = (
        f"Split: {split}",
        f"  Unique datasets: {stats['unique_datasets']}",
        f"  Unique premises: {stats['unique_premises']}",
        f"  Unique hypotheses: {stats['unique_hypotheses']}",
        f"  Unique augmented hypotheses: {stats['unique_augmented_hypotheses']}",
        f"  Average premise length: {stats['avg_premise_word_count']:.2f} words",
        f"  Entailment counts: {stats['entailment_counts']}",
        "",
    )
    print(*report_lines, sep="\n")

print(f"Total unique datasets across all splits: {total_unique_datasets}")

Split: train
  Unique datasets: 21
  Unique premises: 110155
  Unique hypotheses: 769
  Unique augmented hypotheses: 2531
  Average premise length: 57.27 words
  Entailment counts: {1: 95474, 0: 75815}

Split: validation
  Unique datasets: 21
  Unique premises: 14231
  Unique hypotheses: 648
  Unique augmented hypotheses: 1628
  Average premise length: 45.65 words
  Entailment counts: {1: 8697, 0: 6339}

Split: test
  Unique datasets: 13
  Unique premises: 13904
  Unique hypotheses: 82
  Unique augmented hypotheses: 297
  Average premise length: 39.73 words
  Entailment counts: {1: 9080, 0: 6286}

Total unique datasets across all splits: 21


In [54]:
# Number of "not entail" rows: the entailment labels are 0/1, so summing the
# column counts the rows labelled 1.
# NOTE(review): confirm against the dataset card that label 1 means "not entail".
all_datasets['entailment'].sum()

113251

In [57]:
# Number of "entail" rows: total row count minus the sum of the 0/1 label column
len(all_datasets) - all_datasets['entailment'].sum()

88440

In [60]:
# Per-task count of non-null values in each of the other columns
all_datasets.groupby('task').count()

Unnamed: 0_level_0,premise,hypothesis,entailment,dataset,augmented_hypothesis
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
event extraction,31234,31234,31234,31234,31234
hatespeech and toxicity,41871,41871,41871,41871,41871
stance detection,66581,66581,66581,66581,66581
topic classification,62005,62005,62005,62005,62005


In [16]:
# Build a table with one row per source dataset and its row count in each split.
split_names = ['train', 'validation', 'test']

# dataset name -> {split name -> number of rows}; splits a dataset is absent from stay 0
dataset_info = {}

for split in split_names:
    df = ds[split].to_pandas()  # Convert the split to a pandas DataFrame

    # Count every dataset's rows in a single pass instead of re-filtering the
    # whole frame once per dataset.
    rows_per_dataset = df['dataset'].value_counts().to_dict()

    # unique() yields names in first-appearance order, which fixes the row order
    # of result_df.
    for dataset_name in df['dataset'].unique():
        per_split = dataset_info.setdefault(dataset_name, dict.fromkeys(split_names, 0))
        per_split[split] = rows_per_dataset.get(dataset_name, 0)

# Convert the dictionary to a DataFrame with the dataset name as a regular column
result_df = (
    pd.DataFrame.from_dict(dataset_info, orient='index')
    .reset_index()
    .rename(columns={'index': 'dataset'})
)

# Add a column for the total number of documents across the three splits
result_df['total_documents'] = result_df[split_names].sum(axis=1)

# Print the resulting DataFrame
print(result_df)

                                          dataset  train  validation  test  \
0      mlburnham/global_warming_stance_entailment   3517          90     0   
1         mlburnham/dem_rep_party_platform_topics  17287        2200  2377   
2   mlburnham/argument_quality_ranking_entailment  14026        1389  1267   
3            mlburnham/ibm_claimstance_entailment   2152         261   341   
4      mlburnham/ibm_claimstance_topic_entailment   3180         506   407   
5                  mlburnham/PoliStance_Affect_QT   2803          92   341   
6                     mlburnham/PoliStance_Affect  23139        1427  3006   
7                 mlburnham/hatespeech_entailment   2904          96     0   
8         mlburnham/violent_hatespeech_entailment   5191        1396  1351   
9    mlburnham/dehumanizing_hatespeech_entailment   5667         147   255   
10       mlburnham/targeted_hatespeech_entailment  20002        1906  1396   
11                  mlburnham/polarizing_rhetoric   1514        

In [33]:
# Strip the 'mlburnham/' namespace from the dataset identifiers.
# regex=False pins literal matching so the result does not depend on the
# pandas version's default for `regex`.
result_df['dataset'] = result_df['dataset'].str.replace('mlburnham/', '', regex=False)

In [48]:
# Total number of test-split rows, summed over all datasets
result_df['test'].sum()

15366

In [37]:
# Turn the identifiers into display names: drop the '_entailment' substring,
# replace underscores with spaces, then title-case each word.
# regex=False pins literal matching regardless of the pandas version's default.
# NOTE: str.title() also lower-cases interior capitals, e.g.
# 'PoliStance_Affect_QT' is shown as 'Polistance Affect Qt'.
result_df['dataset'] = (
    result_df['dataset']
    .str.replace('_entailment', '', regex=False)
    .str.replace('_', ' ', regex=False)
    .str.title()
)
result_df

Unnamed: 0,dataset,train,validation,test,total_documents
0,Global Warming Stance,3517,90,0,3607
1,Dem Rep Party Platform Topics,17287,2200,2377,21864
2,Argument Quality Ranking,14026,1389,1267,16682
3,Ibm Claimstance,2152,261,341,2754
4,Ibm Claimstance Topic,3180,506,407,4093
5,Polistance Affect Qt,2803,92,341,3236
6,Polistance Affect,23139,1427,3006,27572
7,Hatespeech,2904,96,0,3000
8,Violent Hatespeech,5191,1396,1351,7938
9,Dehumanizing Hatespeech,5667,147,255,6069


In [43]:
# Emit each dataset's total document count on its own line, in table order
for total in result_df['total_documents'].tolist():
    print(total)

3607
21864
16682
2754
4093
3236
27572
3000
7938
6069
23304
1560
4650
1363
19711
11367
11687
15761
4704
1671
9098


In [19]:
# Pool every split so hypotheses are counted over the whole corpus
all_data = pd.concat(
    [ds[name].to_pandas() for name in ('train', 'validation', 'test')]
)

# Frequency table: one row per distinct hypothesis with its number of occurrences
hypothesis_counts = all_data['hypothesis'].value_counts().reset_index()
hypothesis_counts.columns = ['hypothesis', 'count']

# Most frequent hypotheses first
hypothesis_counts = hypothesis_counts.sort_values('count', ascending=False)

# Number of rows to show; later cells reuse top_n for the tail of the table
top_n = 20
print(hypothesis_counts.head(top_n))

                                           hypothesis  count
0                        This text is about politics.   4650
1                           This text is hate speech.   3000
2   This text is defending people for their place ...   2570
3        This court case is about criminal procedure.   2046
4                     This text is about grenade use.   1896
5                Climate change is a serious concern.   1814
6                    Climate change is not a concern.   1793
7                          This text is about health.   1723
8   The author of this text supports stricter immi...   1707
9   The author of this text opposes stricter immig...   1697
10      This text is about crime and law enforcement.   1610
11  This text advocates for violence against peopl...   1582
12        This text is attacking political outgroups.   1560
13        This court case is about economic activity.   1558
14    This text is defending people for their gender.   1513
15  The author of this t

In [23]:
# Least frequent hypotheses: the last top_n rows of the count-sorted table
print(hypothesis_counts.tail(top_n))

                                            hypothesis  count
832  This text advocates for violence against seniors.      4
833  The author of this text opposes abolishing con...      4
834  The author of this text opposes disbanding ASEAN.      4
837  The author of this text does not believe that ...      3
838      This text is attacking the visually impaired.      3
836         This text is dehumanizing the middle aged.      3
835  The author of this text believes that states s...      3
839  The author of this text supports passing the A...      2
840  The author of this text opposes passing the Am...      2
841  The author of this text does not believe that ...      2
842               This text is dehumanizing buddhists.      2
843  The author of this text believes that housewiv...      2
844  This text advocates for violence against the v...      1
845       This text is attacking the hearing impaired.      1
846         The author of this text opposes the squad.      1
847   Th

In [22]:
# Print the full, untruncated text of the top_n least frequent hypotheses
for rare_hypothesis in hypothesis_counts['hypothesis'].tail(top_n):
    print(rare_hypothesis)

This text advocates for violence against seniors.
The author of this text opposes abolishing congressional earmarks.
The author of this text opposes disbanding ASEAN.
The author of this text does not believe that states should not subsidize the growing of tobacco.
This text is attacking the visually impaired.
This text is dehumanizing the middle aged.
The author of this text believes that states should not subsidize the growing of tobacco.
The author of this text supports passing the American Jobs Act.
The author of this text opposes passing the American Jobs Act.
The author of this text does not believe that housewives should be paid for their work.
This text is dehumanizing buddhists.
The author of this text believes that housewives should be paid for their work.
This text advocates for violence against the visually impaired.
This text is attacking the hearing impaired.
The author of this text opposes the squad.
This text is dehumanizing the visually impaired.
The author of this text

In [30]:
# Average number of rows per distinct hypothesis
hypothesis_counts['count'].mean()

236.72652582159625