# Let's test out a zero-shot classification model on the data as is

This notebook is the next step after scraping posts and comments from the subreddit r/AITA and analysing the collected data: <br />
https://github.com/Nico404/scrap_reddit <br />
https://github.com/Nico404/AITA_data_exploration_and_ML/blob/master/AITA_data_exploration.ipynb

In [47]:
import pandas as pd
import pickle
from transformers import pipeline
import pprint


Let's import our data from AITA_data_exploration.ipynb

In [16]:
# Load the post data referred to above as coming from AITA_data_exploration.ipynb.
# NOTE(security): pickle.load can execute arbitrary code while deserialising,
# so only point this at a pickle file this project created itself.
with open('data/pickled_post_data.pkl', 'rb') as f:
    data = pickle.load(f)

posts_df = pd.DataFrame(data)

# Bare last expression -> the notebook renders the frame as a rich table
# instead of the wrapped plain-text dump that print() produces.
posts_df.head()


   post_id                                       post_content  \
0  10uxee0  I know this post sounds super petty, but this ...   
1  10ur722  My daughter Bryn F9 is going on a trip to a ne...   
2  10upxdd  Alright so my son (17) has weekly therapy appo...   
3  10v2vra  We live three blocks away from my parents and ...   
4  10ung90  My daughter (16) and I have gotten into a mass...   

                                          post_title  \
0  AITA for telling my boyfriend I'll shave my le...   
1  AITA for pulling my daughter from a waterpark ...   
2  AITA for not letting an elderly woman have my ...   
3  AITA for taking my kids to my parents house to...   
4  AITA for calling my daughter a selfish insecur...   

                                     comment_results  
0  {'NTA': 31, 'YWBTA': 0, 'YWNBTA': 0, 'ESH': 0,...  
1  {'NTA': 14, 'YWBTA': 0, 'YWNBTA': 0, 'ESH': 0,...  
2  {'NTA': 27, 'YWBTA': 0, 'YWNBTA': 0, 'ESH': 0,...  
3  {'NTA': 63, 'INFO': 1, 'YWBTA': 0, 'YWNBTA': 0...

In [40]:
# Candidate labels handed to the zero-shot classifier: the full list and a
# two-label shortlist.
candidate_labels = ["NTA", "YTA", "ESH", "NAH", "INFO", "YWBTA", "YWNBTA"]
candidate_labels_short = ["NTA", "YTA"]

# Name the task explicitly rather than letting transformers infer it from the
# model's metadata, so the cell states what kind of pipeline it builds.
pipe = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")



Let's run the zero-shot model on both the post content and the post title, and compare its predictions with the actual verdicts we got from the comments.
Let's also try a shortlist of candidate labels (NTA / YTA only) and add that to the mix.

In [43]:
# Sanity check on the FIRST post only: score its content and its title against
# both label lists, then normalise the scraped comment verdicts so they can be
# compared with the model scores.
all_posts_results = {}
all_posts_results_short = {}
all_post_title_results = {}
all_post_title_results_short = {}
all_comment_results = {}

# (target dict, text column, label list) -- one entry per experiment, in the
# order the classifier is called.
experiments = [
    (all_posts_results, 'post_content', candidate_labels),
    (all_posts_results_short, 'post_content', candidate_labels_short),
    (all_post_title_results, 'post_title', candidate_labels),
    (all_post_title_results_short, 'post_title', candidate_labels_short),
]

# head(1) yields at most one row, so an empty frame just leaves every dict empty.
for _, first_post in posts_df.head(1).iterrows():
    post_id = first_post['post_id']

    for results, column_name, labels in experiments:
        prediction = pipe(first_post[column_name], labels)
        results[post_id] = {
            label: round(score, 2)
            for label, score in zip(prediction["labels"], prediction["scores"])
        }

    verdict_counts = first_post['comment_results']
    total_count = sum(verdict_counts.values())
    # A post with no counted verdicts gets 0.0 everywhere instead of raising
    # ZeroDivisionError.
    all_comment_results[post_id] = {
        key: (value / total_count if total_count else 0.0)
        for key, value in verdict_counts.items()
    }

print("post content", all_posts_results)
print("post content short labels", all_posts_results_short)
print("post title", all_post_title_results)
print("post title short labels", all_post_title_results_short)
print("comment results", all_comment_results)



post content {'10uxee0': {'ESH': 0.19, 'INFO': 0.17, 'NAH': 0.17, 'YTA': 0.14, 'YWNBTA': 0.12, 'YWBTA': 0.11, 'NTA': 0.1}}
post content short labels {'10uxee0': {'YTA': 0.57, 'NTA': 0.43}}
post title {'10uxee0': {'INFO': 0.25, 'YTA': 0.2, 'NTA': 0.17, 'ESH': 0.13, 'NAH': 0.1, 'YWBTA': 0.08, 'YWNBTA': 0.07}}
post title short labels {'10uxee0': {'YTA': 0.54, 'NTA': 0.46}}
comment results {'10uxee0': {'NTA': 1.0, 'YWBTA': 0.0, 'YWNBTA': 0.0, 'ESH': 0.0, 'NAH': 0.0, 'INFO': 0.0, 'YTA': 0.0}}


For this one case:
- similar results for both title and content on the short-listed labels
- no clear winner with the full label list, and content and title disagree on the top label
- all zero-shot predictions reach drastically different conclusions than the results we scraped from the comments

Let's make functions and prettify the output

In [52]:
def get_predictions(dataframe, labels, prompt_type, limit=6, classifier=None):
    """Score posts against candidate labels with a zero-shot classifier.

    Args:
        dataframe: DataFrame with a 'post_id' column plus the text column
            selected by `prompt_type`.
        labels: candidate labels passed to the classifier.
        prompt_type: 'content' scores the 'post_content' column; any other
            value scores the 'post_title' column.
        limit: maximum number of rows to score, counted by position. The
            default of 6 matches the previous `if i == 5: break` cutoff on a
            default 0..n-1 index, but no longer depends on the index labels.
            Pass None to score every row.
        classifier: callable taking `(text, labels)` and returning a mapping
            with "labels" and "scores" sequences. Defaults to the
            notebook-level `pipe`.

    Returns:
        dict mapping each post_id to {label: score rounded to 2 decimals}.
    """
    if classifier is None:
        classifier = pipe

    # Loop-invariant: the text column does not change from row to row.
    column_name = 'post_content' if prompt_type == 'content' else 'post_title'
    rows = dataframe if limit is None else dataframe.head(limit)

    predictions = {}
    for post_id, text in zip(rows['post_id'], rows[column_name]):
        prediction = classifier(text, labels)
        predictions[post_id] = {
            label: round(score, 2)
            for label, score in zip(prediction["labels"], prediction["scores"])
        }
    return predictions


def get_comment_results(dataframe, limit=6):
    """Turn each post's verdict counts into per-verdict proportions.

    Args:
        dataframe: DataFrame with a 'post_id' column and a 'comment_results'
            column holding a {verdict: count} dict per row.
        limit: maximum number of rows to process, counted by position. The
            default of 6 matches the previous `if i == 5: break` cutoff on a
            default 0..n-1 index, but no longer depends on the index labels.
            Pass None to process every row.

    Returns:
        dict mapping each post_id to {verdict: count / total count}. A post
        whose counts sum to zero gets 0.0 for every verdict instead of raising
        ZeroDivisionError.
    """
    rows = dataframe if limit is None else dataframe.head(limit)

    comment_results = {}
    for post_id, verdict_counts in zip(rows['post_id'], rows['comment_results']):
        # Sum once per post rather than once per verdict key.
        total_count = sum(verdict_counts.values())
        comment_results[post_id] = {
            key: (value / total_count if total_count else 0.0)
            for key, value in verdict_counts.items()
        }
    return comment_results



# Score the same subset of posts four ways (content/title x full/short label
# list) and normalise the scraped comment verdicts for comparison.
all_posts_results = get_predictions(posts_df, candidate_labels, 'content')
all_posts_results_short = get_predictions(posts_df, candidate_labels_short, 'content')
all_post_title_results = get_predictions(posts_df, candidate_labels, 'title')
all_post_title_results_short = get_predictions(posts_df, candidate_labels_short, 'title')
all_comments_results = get_comment_results(posts_df)

pp = pprint.PrettyPrinter(indent=(len(all_posts_results)+1))
print(all_posts_results)
pp.pprint(all_posts_results_short)
print(all_post_title_results)
pp.pprint(all_post_title_results_short)
# Print the dict computed just above. `all_comment_results` (without the "s")
# is a different variable that only holds the one-post sanity check.
print(all_comments_results)


{'10uxee0': {'ESH': 0.19, 'INFO': 0.17, 'NAH': 0.17, 'YTA': 0.14, 'YWNBTA': 0.12, 'YWBTA': 0.11, 'NTA': 0.1}, '10ur722': {'INFO': 0.25, 'NAH': 0.17, 'NTA': 0.17, 'YTA': 0.15, 'ESH': 0.12, 'YWNBTA': 0.07, 'YWBTA': 0.06}}
{'10ur722': {'NTA': 0.53, 'YTA': 0.47}, '10uxee0': {'NTA': 0.43, 'YTA': 0.57}}
{'10uxee0': {'INFO': 0.25, 'YTA': 0.2, 'NTA': 0.17, 'ESH': 0.13, 'NAH': 0.1, 'YWBTA': 0.08, 'YWNBTA': 0.07}, '10ur722': {'INFO': 0.21, 'YTA': 0.19, 'ESH': 0.15, 'NAH': 0.14, 'NTA': 0.13, 'YWNBTA': 0.1, 'YWBTA': 0.08}}
{'10ur722': {'NTA': 0.4, 'YTA': 0.6}, '10uxee0': {'NTA': 0.46, 'YTA': 0.54}}
{  '10upxdd': {  'ESH': 0.0,
                 'INFO': 0.0,
                 'NAH': 0.0,
                 'NTA': 1.0,
                 'YTA': 0.0,
                 'YWBTA': 0.0,
                 'YWNBTA': 0.0},
   '10ur722': {  'ESH': 0.0,
                 'INFO': 0.0,
                 'NAH': 0.0,
                 'NTA': 1.0,
                 'YTA': 0.0,
                 'YWBTA': 0.0,
                 'Y