In [16]:
import tqdm # Handy for showing progress on longer running jobs
import utils #Load the utilities we created in other notebooks
import Analyze
from collections import defaultdict
from secret import *
import pandas as pd

In [17]:
#anti-fur users
subreddits=["animalrights","Vegan"]

In [18]:
#Code to get users and top domains
rows=[]
N=25#number of subreddit posts to read
for subreddit in subreddits:
    print(subreddit)
    users=defaultdict(int)
    all_domains=defaultdict(int)
    for post in tqdm.tqdm(reddit.subreddit(subreddit).top("year", limit=N), total=N): #searched top by this year and month to increase my sample size 
        comments=utils.traverse_post(post)
        if post.author:#can be null if deleted
            users[post.author.name]+=1
        for comment,level in comments:
            if comment.author:
                users[comment.author.name]+=1
        texts=[utils.get_post_text(post)]+[comment.body for comment,level in comments]
        for text in texts:
            domains=utils.extract_link_information(text)
            for k,v in domains.items():
                all_domains[k]+=v
    rows.append({
        "subreddit": subreddit,
        "all_domains": all_domains,
        "users": users,
    })
subreddit_df=pd.DataFrame(rows)
subreddit_df

Call this function with 'time_filter' as a keyword argument.
  for post in tqdm.tqdm(reddit.subreddit(subreddit).top("year", limit=N), total=N): #searched top by this year and month to increase my sample size


animalrights


100%|███████████████████████████████████████████| 25/25 [00:51<00:00,  2.04s/it]


Vegan


100%|███████████████████████████████████████████| 25/25 [08:57<00:00, 21.49s/it]


Unnamed: 0,subreddit,all_domains,users
0,animalrights,"{'v.redd.it': 12, 'activisthub.org': 10, 'vega...","{'Repulsive-Repeat-135': 2, 'AutoModerator': 2..."
1,Vegan,"{'i.redd.it': 21, 'vbcc.veganhacktivists.org':...","{'curiousvegan007': 2, 'veganactivismbot': 56,..."


## Sample Users and Get Other Subreddit Interactions

In [19]:
subreddit_df['user_count_in_posts']=subreddit_df['users'].apply(len)
subreddit_df

Unnamed: 0,subreddit,all_domains,users,user_count_in_posts
0,animalrights,"{'v.redd.it': 12, 'activisthub.org': 10, 'vega...","{'Repulsive-Repeat-135': 2, 'AutoModerator': 2...",194
1,Vegan,"{'i.redd.it': 21, 'vbcc.veganhacktivists.org':...","{'curiousvegan007': 2, 'veganactivismbot': 56,...",3781


## Large number of users accounts that we could use for our sampling

In [20]:
user_sample_size=50
subreddit_df['user_sample']=subreddit_df['users'].apply(lambda x: [pd.Series(x.keys()).sample(user_sample_size)])

subreddit_df

Unnamed: 0,subreddit,all_domains,users,user_count_in_posts,user_sample
0,animalrights,"{'v.redd.it': 12, 'activisthub.org': 10, 'vega...","{'Repulsive-Repeat-135': 2, 'AutoModerator': 2...",194,"[[DonManuel, Limping_throwaway, pigpaydirt, -o..."
1,Vegan,"{'i.redd.it': 21, 'vbcc.veganhacktivists.org':...","{'curiousvegan007': 2, 'veganactivismbot': 56,...",3781,"[[thegrumpypanda101, Senpaii_Lover, awesomeide..."


In [21]:
#Construct the user_df like before.
rows=[]
for idx,source_subreddit_row in subreddit_df.iterrows():
    for user in tqdm.tqdm(source_subreddit_row['user_sample'][0], total=user_sample_size):
        redditor=reddit.redditor(user)

        #NOTE These might be slow for redditors with big accounts, rather then make them a list, keeping them in "generator" form
        #  may be a good idea
        try:
            posts=[post for post in redditor.submissions.new()]
            comments=[comment for comment in redditor.comments.new()]
            for c in posts+comments:
                row={
                    'source_subreddit': source_subreddit_row['subreddit'],
                    "subreddit_name": c.subreddit.display_name,
                    "user": user
                }
                rows.append(row)
        except:
            print(user)
users_df=pd.DataFrame(rows)
users_df
    

  4%|█▊                                          | 2/50 [00:03<01:05,  1.36s/it]

Limping_throwaway


 12%|█████▎                                      | 6/50 [00:10<01:11,  1.62s/it]

CryingMadGirl


100%|███████████████████████████████████████████| 50/50 [01:50<00:00,  2.21s/it]
 56%|████████████████████████                   | 28/50 [00:55<00:32,  1.48s/it]

Celeblith_II


 72%|██████████████████████████████▉            | 36/50 [01:14<00:24,  1.75s/it]

saltedpecker


 82%|███████████████████████████████████▎       | 41/50 [01:23<00:14,  1.64s/it]

idiotbusyfor40sec


100%|███████████████████████████████████████████| 50/50 [01:41<00:00,  2.03s/it]


Unnamed: 0,source_subreddit,subreddit_name,user
0,animalrights,BigEnergy,DonManuel
1,animalrights,energy,DonManuel
2,animalrights,RenewableEnergy,DonManuel
3,animalrights,GunsAreCool,DonManuel
4,animalrights,energy,DonManuel
...,...,...,...
12723,Vegan,lifehacks,wildmonster91
12724,Vegan,BeAmazed,wildmonster91
12725,Vegan,IdiotsInCars,wildmonster91
12726,Vegan,crawling,wildmonster91


In [22]:
## Storing users data [top 25 users by year] for that sub
#users_df.to_csv("users_df.csv",index=False)

In [23]:

rows=[]
for user, df in users_df.groupby("user"):
    redditor=reddit.redditor(user)
    row={"user": user,
            "comment_karma":redditor.comment_karma ,
            "post_karma": redditor.awardee_karma > 20, #if users have over 25 
            "total_karma": redditor.total_karma,
             "cake_day": pd.to_datetime(redditor.created_utc*1e9),
        }
    for subreddit,count in df.groupby('subreddit_name').count().iterrows():
        row[f'interacted_in-{subreddit}']=count['user']#count names columns a bit different
    rows.append(row)
user_profile=pd.DataFrame(rows)
user_profile

Unnamed: 0,user,comment_karma,post_karma,total_karma,cake_day,interacted_in-ARTIST,interacted_in-AnimalRights,interacted_in-Aquariums,interacted_in-Aquascape,interacted_in-Avatar,...,interacted_in-genzmoment,interacted_in-guccibacan,interacted_in-knives,interacted_in-mallninjashit,interacted_in-metaldetecting,interacted_in-pocketknives,interacted_in-starwarscollecting,interacted_in-tacticalgear,interacted_in-vegetablegardening,interacted_in-weapons
0,-one-eye-open-,5737,True,7287,2019-11-12 20:13:18,2.0,1.0,18.0,1.0,24.0,...,,,,,,,,,,
1,ABBBS2000,7383,True,8139,2020-05-29 01:21:30,,,,,,...,,,,,,,,,,
2,About400,43393,True,44811,2018-03-22 20:27:54,,,,,,...,,,,,,,,,,
3,Admirable_Jacket8393,644,False,645,2021-10-12 13:07:04,,,,,,...,,,,,,,,,,
4,Altruistic_Stick7895,148,False,149,2021-07-21 08:35:48,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,thegrumpypanda101,7307,True,8891,2020-07-29 05:06:13,,,,,,...,,,,,,,,,,
90,thislittleplace,1171,True,2515,2020-08-02 19:44:15,,,,,,...,,,,,,,,,,
91,w0ke_brrr_4444,27033,True,28219,2020-04-17 20:39:44,,,,,,...,,,,,,,,,,
92,wildmonster91,15668,True,16494,2020-11-30 04:13:02,,,2.0,,,...,,,,,,,,,,


In [24]:
interacted_in_metrics=user_profile.filter(regex="interacted_in").describe().T#transpose so we can more easily query on count
interacted_in_metrics[interacted_in_metrics['count']>5].sort_values(by=["count","mean"],ascending=[False,False])

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
interacted_in-vegan,50.0,14.420000,15.172599,1.0,3.0,7.0,23.75,64.0
interacted_in-AskReddit,34.0,6.735294,10.774011,1.0,2.0,3.0,8.00,63.0
interacted_in-AnimalRights,31.0,3.290323,3.174834,1.0,2.0,2.0,3.50,16.0
interacted_in-facepalm,24.0,2.583333,2.483277,1.0,1.0,2.0,3.00,10.0
interacted_in-interestingasfuck,20.0,2.300000,1.809333,1.0,1.0,2.0,2.25,7.0
...,...,...,...,...,...,...,...,...
interacted_in-trashy,6.0,1.333333,0.516398,1.0,1.0,1.0,1.75,2.0
interacted_in-TikTokCringe,6.0,1.333333,0.516398,1.0,1.0,1.0,1.75,2.0
interacted_in-nottheonion,6.0,1.333333,0.516398,1.0,1.0,1.0,1.75,2.0
interacted_in-UkraineWarVideoReport,6.0,1.166667,0.408248,1.0,1.0,1.0,1.00,2.0


# Improve our categories

In [25]:
to_categorize=[x.replace("interacted_in-","") for x in interacted_in_metrics[interacted_in_metrics['count']>10].sort_values(by=["count"],ascending=False).index]

##to filter out any opposition to the sample 
# we make sure that our picked sample for 'anti-fur' have also interacted in these subreddits 
#minimum interaction is 5
interest_mappings={ "anti-fur/vegan": ["animalrights","vegan","vegetarian","plantbaseddiet","veganmealprep","veganrecipes","veganrecipes"]
                  }

#pro-fur



#so we can easily see what we haven't done yet
not_done_yet=set(to_categorize)-set([x for l in interest_mappings.values() for x in l ])
to_categorize[1500:]

[]

## Look at the major differences

In [26]:
#function to check if a a subreddit matches of list of subreddits
def check_interests(collection, matches, match_rule="ignore_case"):
    #match_rule exists in case to extend this to checking for common phrases used in comment or post body text.
    if match_rule=="ignore_case":
        count=len([c for c in collection if any(c.casefold()==m.casefold() for m in matches)])
    else:
        raise Exception("Unsupported match_rule")
    return count
#add concepts
rows=[]
for user, df in tqdm.tqdm(users_df.groupby("user")):
    redditor=reddit.redditor(user)
    row={"user": user,
            "comment_karma":redditor.comment_karma,
            "post_karma": redditor.awardee_karma,
            "total_karma": redditor.total_karma,
             "total_interactions": [len(df)],
             "cake_day": pd.to_datetime(redditor.created_utc*1e9),
        }
    for category, matches in interest_mappings.items():
        row["interest_in-"+category]=check_interests(df['subreddit_name'], matches) 
#     for subreddit,count in df.groupby('subreddit_name').count().iterrows():
#         row[f'interacted_in-{subreddit}']=count['user']#count names columns a bit different
    rows.append(row)
user_profile=pd.DataFrame(rows).replace(0, pd.np.nan)
user_profile

100%|███████████████████████████████████████████| 94/94 [01:30<00:00,  1.04it/s]
  user_profile=pd.DataFrame(rows).replace(0, pd.np.nan)


Unnamed: 0,user,comment_karma,post_karma,total_karma,total_interactions,cake_day,interest_in-anti-fur/vegan
0,-one-eye-open-,5737,53.0,7287,[159],2019-11-12 20:13:18,1.0
1,ABBBS2000,7383,84.0,8139,[111],2020-05-29 01:21:30,
2,About400,43393,160.0,44811,[143],2018-03-22 20:27:54,5.0
3,Admirable_Jacket8393,644,,645,[100],2021-10-12 13:07:04,
4,Altruistic_Stick7895,148,,149,[44],2021-07-21 08:35:48,2.0
...,...,...,...,...,...,...,...
89,thegrumpypanda101,7307,289.0,8891,[146],2020-07-29 05:06:13,14.0
90,thislittleplace,1171,68.0,2515,[110],2020-08-02 19:44:15,25.0
91,w0ke_brrr_4444,27033,114.0,28219,[111],2020-04-17 20:39:44,2.0
92,wildmonster91,15668,203.0,16494,[134],2020-11-30 04:13:02,


## save users file

In [27]:
user_profile.to_csv("user_profile.csv")

# Conclusion

 I ran a code to extract data from the "animalrights" and "vegan" subreddits, and identified the top 80 users who had more than 15 interactions in anti-fur posts over the specified time period (this year/month). To ensure the accuracy of our sample, I cross-checked the user data manually.