In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import numpy as np
from scipy.spatial import distance
from collections import Counter

In [3]:
notes_file = '../data/notes-00000.tsv'
ratings_file = '../data/ratings-00000.tsv'
hydrated_file = '../data/idtweetVALUES.jsonl'

In [4]:
notes_df = pd.read_csv(notes_file, delimiter='\t')

hydrated_Tweets=pd.read_json(hydrated_file, lines=True)

#merge notes with hydrated file
notes_text_df = pd.merge(notes_df, hydrated_Tweets, left_on='tweetId', right_on='id')
notes_text_df_en = notes_text_df[notes_text_df['lang']=='en']


In [5]:
# aggregate with noteId
agg_noted_df = notes_text_df_en.groupby('tweetId').agg({'noteId':'count'})
# get tweets having more than two notes
TweetsMoreThanTwoNoters = list(agg_noted_df[agg_noted_df['noteId']>1].index)
notes_df_two_noters = notes_text_df_en[notes_text_df_en.tweetId.apply(lambda x: True if x in TweetsMoreThanTwoNoters else False)]
notes_df_two_noters = notes_df_two_noters.reset_index(drop=True)

In [6]:
# pivot table
notes_df_two_noters=notes_df_two_noters.drop_duplicates(subset=['participantId','tweetId'])
pivot_table = notes_df_two_noters.pivot(index='participantId',columns='tweetId',values='classification')

d={'MISINFORMED_OR_POTENTIALLY_MISLEADING':1,'NOT_MISLEADING':1,np.nan:0}
pivot_table=pivot_table.applymap(lambda x: d[x])

users = pivot_table.to_numpy()

In [7]:
#compute dot product between users
max_=-1
max_ind=-1
closest_users = {}
for i in tqdm(range(users.shape[0])[:]):
    sims = {ind:np.dot(users[i],other_usr) for ind,other_usr in enumerate(users) if ind!=i}
    sims = {k:v for (k,v) in sims.items() if v}
    closest_users[i]=sims
    if sims:
        max_similarity = max(sims.values())
        if max_similarity > max_:
            max_ = max_similarity
            max_ind  = (i,max(sims,key=sims.get))


100%|██████████| 1550/1550 [00:22<00:00, 67.78it/s]


In [8]:
def two_users_with_max_tweets_func(x):
    return max(x[1].values()) if x[1] else -1

user_1=max(closest_users.items(),key=two_users_with_max_tweets_func)[0]

user_2 = max(closest_users[user_1],key=closest_users[user_1].get)

In [9]:
# users with most common tweets
df1 = notes_text_df_en[notes_text_df_en.participantId==pivot_table.index[user_1]]
df2 = notes_text_df_en[notes_text_df_en.participantId==pivot_table.index[user_2]]


df1 = notes_df_two_noters[notes_df_two_noters.participantId==pivot_table.index[user_1]]
df2 = notes_df_two_noters[notes_df_two_noters.participantId==pivot_table.index[user_2]]


common = pd.merge(df1,df2,on='tweetId',how='inner')

common_text = pd.merge(common,hydrated_Tweets,left_on='tweetId', right_on='id',how='inner')

for index,row in common_text.iterrows():
    print(f'Tweet ID: {row.tweetId}')
    print(f'Tweet Text:\n{row.full_text}')
    print()
    print(f'User #1 Classification: {row.classification_x}')
    print(f'User #1 Note: {row.summary_x}')
    print()
    print(f'User #2 Classification: {row.classification_y}')
    print(f'User #2 Note: {row.summary_y}')
    print('\n\n\n')

Tweet ID: 1354889772901277703
Tweet Text:
You heard it here first:

If we do not CONTAIN Trump by impeachment or indictment, in a few months he will start to run a shadow government from Mar-a-Lago and at least a dozen governors will start to take their orders from HIM. 

He will claim he is the legitimate President &amp;

User #1 Classification: MISINFORMED_OR_POTENTIALLY_MISLEADING
User #1 Note: She is speculative at best. And she makes claims that are over the top in their fearmongering. 

User #2 Classification: MISINFORMED_OR_POTENTIALLY_MISLEADING
User #2 Note: Trump is not running a shadow government from Mar-a-Lago. He?s playing golf and set up the office of the FORMER president. This tweet is inciting fear based on fantasy.     https://www.cbsnews.com/amp/news/office-of-the-former-president-trump-florida/




Tweet ID: 1354608406548656130
Tweet Text:
Republican members of Congress don’t want consequences for white supremacy or insurrection against the United States because the

In [10]:
#users with no common tweets

no_common_users = [k for (k,v) in closest_users.items() if not v ]

for i in no_common_users:
    print(i)
    ind=pivot_table.index[i]

    df1 = notes_text_df_en[notes_text_df_en.participantId==ind]
    df1 = notes_df_two_noters[notes_df_two_noters.participantId==ind]
    
    common_text = pd.merge(df1,hydrated_Tweets,left_on='tweetId', right_on='id',how='inner')

    for _,x in common_text.iterrows():
        print(f'Tweet ID: {x.tweetId}')
        print(f'Tweet Text: {x.full_text_y}')
        print(f'User Classification: {x.classification}')
        print(f'User Note: {x.summary}')
        print()

363
Tweet ID: 1428065649536815104
Tweet Text: If you want to add helpful context to any Tweet that might be misleading, open the ••• menu and then hit "Contribute to Birdwatch". Try it on this Tweet! https://t.co/fk57uFse7E
User Classification: NOT_MISLEADING
User Note: Test

406
Tweet ID: 1418537677163483147
Tweet Text: @ABC Only Hunter Biden can use the n word with no repercussions
User Classification: MISINFORMED_OR_POTENTIALLY_MISLEADING
User Note: Unproven bias from what seems to be a GOP supporter



In [11]:
# user with most common notes with other users
def user_with_most_common_notes_with_other_users(x):
    return sum(x[1].values())

i=max(closest_users.items(),key=user_with_most_common_notes_with_other_users)[0]
ind=pivot_table.index[i]


#df1 = notes_text_df_en[notes_text_df_en.participantId==ind]
df1 = notes_df_two_noters[notes_df_two_noters.participantId==ind]

common_text = pd.merge(df1,hydrated_Tweets,left_on='tweetId', right_on='id',how='inner')

for _,x in common_text.iterrows():
    print(f'Tweet ID: {x.tweetId}')
    print(f'Tweet Text: {x.full_text_y}')
    print(f'User Classification: {x.classification}')
    print(f'User Note: {x.summary}')
    print()

Tweet ID: 1433455893262348288
Tweet Text: Last night, the Supreme Court officially overturned five decades of settled law and permitted Texas' unconstitutional abortion ban to stand. 

Yes: They gutted Roe v. Wade without hearing arguments, in a one-paragraph, unsigned 5-4 opinion issued in the middle of the night.
User Classification: MISINFORMED_OR_POTENTIALLY_MISLEADING
User Note: There is no criminal penalty under TX SB8.  It's a strictly civil issue and has no provisions for criminal prohibition, criminal consequences, or criminal penalties for having or performing an abortion.  Roe V Wade is untouched.  Just go read the law.

Tweet ID: 1403077496296386561
Tweet Text: Freedom of speech doesn't exist for Muslim women in Congress. The benefit of the doubt doesn't exist for Muslim women in Congress. House Democratic leadership should be ashamed of its relentless, exclusive tone policing of Congresswomen of color.
User Classification: MISINFORMED_OR_POTENTIALLY_MISLEADING
User Note: S

The 2nd Amendment does NOT authorize gun OWNERSHIP! https://t.co/4cLrNsCKXy
User Classification: MISINFORMED_OR_POTENTIALLY_MISLEADING
User Note: He's right.  The Bill of Rights does not authorize any rights.  It is a list of things that the US gov't is not allowed to do, not a list of permissions given by the gov't.  Shall make no law, shall not be infringed, along with 8 separate references to The People is pretty clear  https://www.archives.gov/founding-docs/bill-of-rights-transcript

Tweet ID: 1355177721438085122
Tweet Text: Fox has cut away from the White House press briefing on coronavirus to do a segment going after NY Gov. Cuomo.
User Classification: MISINFORMED_OR_POTENTIALLY_MISLEADING
User Note: Missing a bit of context on why they cut away.  Also interesting that CNN had cut away from POTUS press conferences and daily POTUS COVID briefings during the previous administration.

Tweet ID: 1354880952141553674
Tweet Text: “We need healing + unity, but I will not take any respons

In [12]:
closest_users[i]

{1: 1,
 5: 1,
 11: 5,
 16: 1,
 47: 12,
 57: 1,
 62: 5,
 63: 1,
 67: 1,
 82: 2,
 84: 2,
 91: 1,
 93: 2,
 94: 4,
 104: 1,
 108: 5,
 111: 2,
 112: 1,
 124: 1,
 131: 1,
 135: 3,
 137: 3,
 149: 1,
 164: 1,
 174: 1,
 177: 1,
 184: 2,
 187: 6,
 190: 1,
 215: 1,
 223: 1,
 224: 2,
 226: 3,
 231: 2,
 234: 1,
 237: 1,
 252: 1,
 266: 1,
 272: 1,
 275: 1,
 292: 7,
 298: 1,
 299: 1,
 301: 1,
 311: 1,
 312: 1,
 330: 2,
 335: 3,
 341: 1,
 366: 1,
 367: 1,
 369: 1,
 377: 1,
 380: 6,
 384: 1,
 400: 1,
 408: 1,
 411: 1,
 421: 1,
 423: 1,
 426: 2,
 431: 1,
 441: 1,
 448: 1,
 460: 2,
 466: 1,
 468: 1,
 474: 1,
 475: 1,
 486: 2,
 492: 1,
 494: 1,
 524: 1,
 527: 1,
 535: 1,
 549: 1,
 580: 1,
 589: 1,
 596: 1,
 602: 1,
 627: 1,
 632: 1,
 649: 2,
 653: 2,
 663: 1,
 666: 1,
 667: 1,
 697: 2,
 712: 1,
 714: 2,
 737: 1,
 756: 1,
 759: 1,
 769: 1,
 770: 6,
 777: 1,
 784: 2,
 788: 1,
 798: 1,
 799: 1,
 807: 1,
 809: 1,
 814: 2,
 821: 1,
 822: 3,
 824: 3,
 826: 1,
 835: 9,
 837: 1,
 843: 1,
 851: 1,
 867: 4,
 878: 1

In [13]:
closest_users[1463]

{16: 1,
 63: 1,
 70: 1,
 88: 1,
 272: 2,
 335: 1,
 527: 1,
 714: 1,
 759: 1,
 814: 1,
 859: 1,
 969: 1,
 1078: 2,
 1130: 1,
 1194: 1,
 1195: 1,
 1297: 1,
 1321: 1,
 1436: 1,
 1462: 1,
 1470: 1,
 1545: 1}

In [14]:
[i for i,x in enumerate(users[1463]) if x and users[1078][i]]

[648, 1024]

In [15]:
f=[]
for x in list(df1.tweetId):
    f.extend(list(notes_df_two_noters[notes_df_two_noters.tweetId==x].participantId))

In [16]:
len(set(f))

218

In [17]:
len(closest_users[1078])

217

In [18]:
def find_winner(x):
    x = Counter(x).items()
    m = max([xx[1] for xx in x ])
    return [xx[0] for xx in x if xx[1]==m]

In [19]:
c=0
n=0
for index,row in df1.iterrows():
    user_label=row.classification
    user_pid= row.participantId
    d = list(notes_df_two_noters[(notes_df_two_noters.tweetId==row.tweetId) & (notes_df_two_noters.participantId!=user_pid)]['classification'])
    winner = find_winner(d)
    if len(winner)==1:
        if winner[0]==user_label:
            c+=1
    n+=1

In [20]:
c/n

0.8

In [24]:
#user with most notes

agg_count = notes_text_df_en.groupby('participantId')\
                .agg({'noteId':'count'})

ind = agg_count[agg_count.noteId==max(agg_count['noteId'])].index[0]

df1 = notes_text_df_en[notes_text_df_en.participantId==ind]
common_text = pd.merge(df1,hydrated_Tweets,left_on='tweetId', right_on='id',how='inner')
trump=0
covid = 0
for _,x in common_text.iterrows():

    if 'trump' in x.full_text_y.lower()+' '+x.summary or 'election' in x.full_text_y.lower()+' '+x.summary \
    or 'biden' in x.full_text_y.lower()+' '+x.summary:
        trump+=1
    elif 'covid' in x.full_text_y.lower()+' '+x.summary or 'pandemic' in x.full_text_y.lower()+' '+x.summary\
    or 'coronavirus' in x.full_text_y.lower()+' '+x.summary:
        covid+=1
    else:
        print(f'Tweet ID: {x.tweetId}')
        print(f'Tweet Text: {x.full_text_y}')
        print(f'User Note: {x.summary}')
        print()

print(f'US related tweets: {trump/len(common_text)*100}%')

Tweet ID: 1375893680515612677
Tweet Text: We do not have a white supremacy problem in America. We do have a black culture problem. And it’s time we talk about it. https://t.co/Y381HA3A7d
User Note: Violent white supremacists continue to be a real problem in the U.S.    FBI: &quot;What are Known Violent Extremist Groups?&quot;  https://www.fbi.gov/cve508/teen-website/what-are-known-violent-extremist-groups    &quot;White supremacy extremists are motivated by a hatred of other races and religions. Some try to achieve their political and social goals through violence.&quot;

US related tweets: 71.34146341463415%


In [25]:
a=[]
pid=df1.iloc[0]['participantId']
for index,row in df1.iterrows():
    d=notes_text_df[(notes_text_df.tweetId==row.tweetId) & (notes_text_df.participantId!=pid)]
    a.append(len(d))

In [26]:
Counter(a)

Counter({1: 12, 2: 1, 0: 643})

In [27]:
#tweet with most notes
agg_notes=notes_text_df_en.groupby('tweetId')\
                .agg({'noteId':'count'})

ind = agg_notes[agg_notes.noteId==max(agg_notes.noteId)].index[0]


df1 = notes_text_df_en[notes_text_df_en.tweetId==ind]
common_text = pd.merge(df1,hydrated_Tweets,left_on='tweetId', right_on='id',how='inner')
s=[]

for _,x in common_text.iterrows():
    s.append(x.classification)
print(Counter(s))

Counter({'MISINFORMED_OR_POTENTIALLY_MISLEADING': 57, 'NOT_MISLEADING': 4})
