# Exploring some of the datasets for the RP_CSSP projects

In [1]:
import pandas as pd

### Dataset 1: Reddit Politicians

This is a 1% sample of the full dataset, which is available at: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/YWRXEP&version=1.0

In [2]:
# Load the sampled Reddit comments about politicians and preview the first rows.
pol_reddit_csv = 'data/all_politicians_comments/all_comments_sample_137K.csv'
pol_reddit = pd.read_csv(pol_reddit_csv)
pol_reddit.head()

Unnamed: 0,id,body,subreddit,to_type,NEL,Names,created_utc,sex,ethnicity,origin,...,nickname_used,Adjectives,Verbs,Nouns,Descriptors_parsed,Verbs_parsed,Relation,Valence,Arousal,Dominance
0,e2r5x8o,"Paul Joseph Watson, Count Dankula and more ha...",The_Donald,t1,Q291169,['Jeremy Corbyn'],,male,,United Kingdom,...,False,"['more', 'parliamentary', 'small', 'effective'...","['have', 'be', 'have', 'be', 'take', 'get', 'g...","['system', 'party', 'system', 'seat', 'winner'...",[],[],['nmod'],0.570419,0.425279,0.563581
1,e2r5x9y,Just wait til [NAME] releases [NAME] tapes! .....,politics,t1,Q7747,['Putin'],,male,Russians,Russia,...,False,[],['wait'],"['name', 'release', 'name', 'tape', 'music', '...",[],[],"['nmod', 'npadvmod']",0.5698,0.3814,0.4252
2,e2r5zwt,"I want Michael Avenatti to live a long time, a...",politics,t1,Q22686,['Donald Trump'],,male,,United States of America,...,False,['long'],"['want', 'live', 'want', 'lose', 'say', 'need'...","['time', 'dignity', 'name', 'name', 'poison', ...",[],['say'],['dep'],0.600769,0.472,0.527231
3,e2r640r,I think we've established [NAME]'s 'type'...ty...,politics,t3,Q22686,['Trump'],,male,,United States of America,...,False,"['young', 'nubile']","['think', 'establish', 'prefer', 'remind']","['name', ""type'"", 'type', 'chick', 'type']",[],['establish'],['dobj'],0.669714,0.372143,0.507857
4,e2r66h0,There's no way [NAME] or Trump know exactly wh...,politics,t3,Q212648,['Giuliani'],,male,,United States of America,...,False,"['able', 'several', 'treasonous', 'mere', 'rus...","['know', 'record', 'be', 'get', 'expect', 'rel...","['way', 'name', 'trump', 'name', 'trump', 'mag...","['trump', 'able']",['be'],"['parataxis', 'dep']",0.549697,0.484636,0.545667


In [3]:
# The preview above abbreviates the middle columns with '...', so list every column name.
pol_reddit.columns

Index(['id', 'body', 'subreddit', 'to_type', 'NEL', 'Names', 'created_utc',
       'sex', 'ethnicity', 'origin', 'DOB', 'highest_position', 'party',
       'entity_given_name', 'entity_family_name', 'given_name_used',
       'family_name_used', 'full_name_used', 'nickname_used', 'Adjectives',
       'Verbs', 'Nouns', 'Descriptors_parsed', 'Verbs_parsed', 'Relation',
       'Valence', 'Arousal', 'Dominance'],
      dtype='object')

### Dataset 2: Stance Detection

P-Stance [https://aclanthology.org/2021.findings-acl.208/] is a relatively new dataset about stance towards several politicians. You can download it from here: https://drive.google.com/drive/folders/1so8lY1XKpnhUtTvb15edEz6aeHt7CSuh

For now, we will look at one of the files with stance towards Donald Trump: 

In [17]:
# Load one P-Stance file (stance towards Donald Trump) and preview the first rows.
trump_stance_csv = 'data/p_stance/raw_train_trump.csv'
trump_stance = pd.read_csv(trump_stance_csv)
trump_stance.head()

Unnamed: 0,Tweet,Target,Stance
0,You COWARD... you should have been at THE WHIT...,Donald Trump,AGAINST
1,Cool. I trust he requested the same records fo...,Donald Trump,AGAINST
2,Off your meds again Donnie? You are surrounded...,Donald Trump,AGAINST
3,That has to happen anyway. But his profits wil...,Donald Trump,AGAINST
4,This is so stupid. Unmasking lets them know wh...,Donald Trump,AGAINST


In [18]:
# List the column names of the stance data.
trump_stance.columns

Index(['Tweet', 'Target', 'Stance'], dtype='object')

### Dataset 3: Annotator Disagreement

There are several NLP datasets which have individual-level annotations and annotator backgrounds (see slide 52 in Lecture 3). One of them is the 'Social Bias Frames' dataset: https://aclanthology.org/2020.acl-main.486/ available on Hugging Face: https://huggingface.co/datasets/social_bias_frames

In [4]:
from datasets import load_dataset

# Identifier of the Social Bias Frames dataset on the Hugging Face Hub.
sbf_dataset_name = "social_bias_frames"
sbf = load_dataset(sbf_dataset_name)

In [11]:
# Convert each split of the Hugging Face dataset object into a pandas DataFrame.
train_df, test_df, validation_df = (
    sbf[split_name].to_pandas()
    for split_name in ('train', 'test', 'validation')
)

In [12]:
# Row counts of the train, test and validation splits, in that order.
tuple(map(len, (train_df, test_df, validation_df)))

(112900, 17501, 16738)

In [13]:
# Stack the three splits into a single frame.
# reset_index() is called without drop=True, so rows are renumbered from 0 and
# each row's previous index label (from its source split) is kept in a new
# 'index' column.
# NOTE(review): if that 'index' column is not needed downstream,
# pd.concat(..., ignore_index=True) would avoid creating it — confirm before changing.
sbf_splits = [train_df, test_df, validation_df]
all_sbf_data = pd.concat(sbf_splits).reset_index()
len(all_sbf_data)

147139

In [16]:
# Preview the first five rows of the combined frame.
all_sbf_data.head(n=5)

Unnamed: 0,index,whoTarget,intentYN,sexYN,sexReason,offensiveYN,annotatorGender,annotatorMinority,sexPhrase,speakerMinorityYN,WorkerId,HITId,annotatorPolitics,annotatorRace,annotatorAge,post,targetMinority,targetCategory,targetStereotype,dataSource
0,0,0.0,0.66,0.0,,1.0,woman,,,,-8935932304856669427,363A7XIFV4G2799C5V96YERJA9AVAM,liberal,white,45.0,RT @_LexC__: I'm convinced that some of y'all ...,,,,t/davidson
1,1,0.0,0.66,0.0,,0.5,man,,,,6347880360297734464,363A7XIFV4G2799C5V96YERJA9AVAM,mod-liberal,white,35.0,RT @_LexC__: I'm convinced that some of y'all ...,,,,t/davidson
2,2,0.0,0.33,0.0,,0.5,man,,,,-7452610791699819066,363A7XIFV4G2799C5V96YERJA9AVAM,liberal,asian,23.0,RT @_LexC__: I'm convinced that some of y'all ...,,,,t/davidson
3,3,1.0,1.0,0.0,,1.0,man,,,0.0,-500114755446676507,3JTPR5MTZS6RLS3JBV4IOU0G2X35K5,liberal,white,25.0,RT @iBeZo: Stupid fucking nigger LeBron. You f...,black folks,race,all stupid,t/davidson
4,4,1.0,1.0,0.0,,1.0,man,,,0.0,-500114755446676507,3JTPR5MTZS6RLS3JBV4IOU0G2X35K5,liberal,white,25.0,RT @iBeZo: Stupid fucking nigger LeBron. You f...,black folks,race,are not people but apes.,t/davidson


In [14]:
# List every column of the combined frame, including the annotator-background fields.
all_sbf_data.columns

Index(['index', 'whoTarget', 'intentYN', 'sexYN', 'sexReason', 'offensiveYN',
       'annotatorGender', 'annotatorMinority', 'sexPhrase',
       'speakerMinorityYN', 'WorkerId', 'HITId', 'annotatorPolitics',
       'annotatorRace', 'annotatorAge', 'post', 'targetMinority',
       'targetCategory', 'targetStereotype', 'dataSource'],
      dtype='object')

In [15]:
# Count the distinct post texts; the same post text appears on several rows,
# so this is smaller than the total row count.
distinct_posts = all_sbf_data['post'].unique()
len(distinct_posts)

44671