# Running Mass Inference on RMP Dataset

In [1]:
from transformers import pipeline
import pandas as pd
import numpy as np

Downloading and using the models through the Pipline API. Running three types of NLP tasks:
1. Polarity Score
2. Emotions
3. Summarizations.

Storing them in three seperate csv files:
For polarity score, each course review will receive a score. 
For emotions, each course review will receive a top 3 emotion
For summarizations, each course will get a summarized result of all its comments. 

Files will be stored to /data

In [5]:
polarity_score_predictor = pipeline(model="distilbert-base-uncased-finetuned-sst-2-english", task="sentiment-analysis")
emotion_predictor = pipeline(model="SamLowe/roberta-base-go_emotions", task="text-classification", top_k=3)
summarizer = pipeline(model="facebook/bart-large-cnn", task="summarization")



In [2]:
RATING_PATH = "../data/clean_ratings.csv"
PROF_PATH = "../data/clean_prof_info.csv"

rating = pd.read_csv(RATING_PATH)
prof = pd.read_csv(PROF_PATH)

Running sentiment polarity score prediction on all 50k+ course reviews. 

In [None]:
rating["polarity_score"]=polarity_score_predictor(rating["comment"].tolist())

In [47]:
polarity_dataset = rating[["class", "comment", "polarity_score"]]
polarity_dataset.to_csv("../data/course_review_polarity.csv", index=False)

Running emotion prediction on all 50k course reviews.
Test workflow on a small sample

In [52]:
first_five = rating.loc[:4]
first_five["emotion1", "emotion2", "emotion3"] = emotion_predictor(first_five["comment"].tolist())
first_five

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_five["emotion1", "emotion2", "emotion3"] = emotion_predictor(first_five["comment"].tolist())


Unnamed: 0,profID,attendanceMandatory,class,comment,date,difficultyRating,grade,helpfulRating,isForCredit,isForOnlineClass,ratingTags,wouldTakeAgain,polarity_score,emotion1,emotion2,emotion3,"(emotion1, emotion2, emotion3)"
0,7964,False,ANTHRCUL101,Fricke is the man. Entire class probably took ...,2019-04-28 17:13:12,1.0,A,5.0,False,False,"['Respected', 'Inspirational', 'Amazing Lectur...",True,"{'label': 'NEGATIVE', 'score': 0.9982315897941...","{'label': 'neutral', 'score': 0.6455406546592712}","{'label': 'approval', 'score': 0.1846129000186...","{'label': 'optimism', 'score': 0.1674855947494...","[{'label': 'neutral', 'score': 0.6455406546592..."
1,7964,False,ANTHRO101,Tom Fricke is one of those professors you will...,2019-01-08 18:41:24,1.0,A+,5.0,False,False,"['Accessible Outside Class', 'Hilarious', 'Ama...",True,"{'label': 'POSITIVE', 'score': 0.9996259212493...","{'label': 'admiration', 'score': 0.85450613498...","{'label': 'approval', 'score': 0.2074013650417...","{'label': 'joy', 'score': 0.09679455310106277}","[{'label': 'admiration', 'score': 0.8545061349..."
2,7964,False,ANTHRCUL101,Prof. Fricke is amazing. He is hilarious and t...,2018-12-16 03:11:18,1.0,A,5.0,False,False,"['Hilarious', 'Graded By Few Things', 'Caring']",True,"{'label': 'POSITIVE', 'score': 0.9993315935134...","{'label': 'admiration', 'score': 0.74482583999...","{'label': 'amusement', 'score': 0.341294288635...","{'label': 'approval', 'score': 0.1215387433767...","[{'label': 'admiration', 'score': 0.7448258399..."
3,7964,False,CULTANTHRO101,Such an easy class. Exams were exactly like th...,2018-12-12 10:03:19,1.0,A,5.0,False,False,"['Accessible Outside Class', 'Graded By Few Th...",True,"{'label': 'POSITIVE', 'score': 0.9967494010925...","{'label': 'admiration', 'score': 0.89814394712...","{'label': 'approval', 'score': 0.1903753727674...","{'label': 'neutral', 'score': 0.05876286700367...","[{'label': 'admiration', 'score': 0.8981439471..."
4,7964,False,ANTHRCUL101,Easiest class i have taken at UM. The exams to...,2018-12-11 16:33:00,1.0,A+,5.0,False,False,"['Respected', 'Hilarious', 'Amazing Lectures']",True,"{'label': 'POSITIVE', 'score': 0.9996769428253...","{'label': 'admiration', 'score': 0.74522703886...","{'label': 'joy', 'score': 0.6837484240531921}","{'label': 'approval', 'score': 0.1419211179018...","[{'label': 'admiration', 'score': 0.7452270388..."


In [None]:
chunk_size = 100
for i in range(0, len(rating), chunk_size):
    rating.loc[i:i+chunk_size-1, ["emotion1", "emotion2", "emotion3"]] = emotion_predictor(rating.loc[i:i+chunk_size-1, "comment"].tolist())
    print(f"Processed up to index {i + chunk_size - 1}")

In [46]:
emotion_dataset = rating[["class", "comment", "emotion1", "emotion2", "emotion3"]]
emotion_dataset.to_csv("../data/course_review_emotion.csv", index=False)

Just quickly check that the dataset has been correctly written. 

In [43]:
test = pd.read_csv("../data/course_review_emotion.csv")
test.head()

Unnamed: 0.1,Unnamed: 0,class,comment,emotion1,emotion2,emotion3
0,0,ANTHRCUL101,Fricke is the man. Entire class probably took ...,"{'label': 'neutral', 'score': 0.6455406546592712}","{'label': 'approval', 'score': 0.1846129000186...","{'label': 'optimism', 'score': 0.1674855947494..."
1,1,ANTHRO101,Tom Fricke is one of those professors you will...,"{'label': 'admiration', 'score': 0.85450613498...","{'label': 'approval', 'score': 0.2074013650417...","{'label': 'joy', 'score': 0.09679455310106277}"
2,2,ANTHRCUL101,Prof. Fricke is amazing. He is hilarious and t...,"{'label': 'admiration', 'score': 0.74482583999...","{'label': 'amusement', 'score': 0.341294288635...","{'label': 'approval', 'score': 0.1215387433767..."
3,3,CULTANTHRO101,Such an easy class. Exams were exactly like th...,"{'label': 'admiration', 'score': 0.89814394712...","{'label': 'approval', 'score': 0.1903753727674...","{'label': 'neutral', 'score': 0.05876286700367..."
4,4,ANTHRCUL101,Easiest class i have taken at UM. The exams to...,"{'label': 'admiration', 'score': 0.74522703886...","{'label': 'joy', 'score': 0.6837484240531921}","{'label': 'approval', 'score': 0.1419211179018..."


Clean up variables after csv file have been written. Since we have our CSV, we can just load them up later with read_csv instead of rerunning the above cells for inference on the entire dataset. 

In [60]:
del polarity_dataset
del emotion_dataset

Summarization:
1. Group by Positive and Negative comments
2. Group by emotions
3. Group by review tags