# Install and import the required packages


In [None]:
!pip install numpy transformers pandas scikit-learn torch matplotlib datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import transformers
import json
import numpy as np
import matplotlib
import torch 
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_metric
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Define data pre-processing functions

In [None]:
def get_topics_from_csv(file_path):
  """Retrieves list of topic keywords from a topic csv file
  Args:
    file_path (str): file path to topic csv file
  Returns:
    topic_dict (dict): dictionary mapping from video_id to a list of topic keywords
  """
  # Dataframe of csv file
  df = pd.read_csv(file_path)

  # Create a dictionary mapping from video_id to their topics
  topic_dict = dict()
  video_ids = list(df['video_id'].unique())
  for video_id in video_ids:
    query = f"video_id=='{video_id}'"
    topic_keywords = df.query(query)['topic_keywords'].iloc[0]
    topic_dict[video_id] = topic_keywords

  return topic_dict

def text_to_label(cls):
  """Converts class to an integer label
  Args:
    cls (str): class
  Returns:
    label (int): integer label in set {0,1,2}, returns -1 if there is an unidentifiable class
  """
  if cls == "spam":
    return 0
  elif cls == "neutral":
    return 1
  elif cls == "ham":
    return 2
  else:
    return -1

def parse_comment(comment):
  """Parses a comment in a format suitable for the custom tokenizer
  Args:
    comment (str): raw comment string in the comment retrieval format
  Returns:
    parsed_comment (str): parsed string
  """  
  if '[MAIN]' in comment:
    return comment
  else:
    return f"[MAIN] {comment}"
    
def get_comments_from_csv(file_path):
  """Retrieve comment data from a specified .csv file
  Args:
      file_name (str): .csv file name
  Returns:
      comments_by_videoid (dict): dictionary mapping from video ID to its list of comments
      video_name_dict (dict): dictionary mapping from video ID its video name
  """
  if file_path[-4:] != '.csv':
      file_path += '.csv'

   # Dataframe of csv file
  df = pd.read_csv(file_path)

  # Dictionary mapping from video_id to a list containing: username, parsed comment and label
  info_dict = dict()

  video_ids = list(df['video_id'].unique())
  for video_id in video_ids:
      query = f"video_id=='{video_id}'"
      comments = list(df.query(query)['comment'])
      usernames = list(df.query(query)['username'])
      labels = list(df.query(query)['class'])
      info = []
      for i in range(0,len(comments)):
        label = text_to_label(labels[i])
        if label == -1: # Ignore any classes that are not in the set {0,1,2}
          continue 
        info.append([f"[USER] {usernames[i]} {parse_comment(comments[i])} ",label])

      info_dict[video_id] = info
      
  return info_dict

def create_dataset(data_path,dataset_filename):
  """Create a dataset from a .csv file
  Args:
      data_path (str): path to the data folder
      dataset_filename (str): name of the dataset file
  Returns:
      dataset (list): dataset where each data item contains a sentence pair (comment, topic keywords)
      labels (list): list of labels corresponding to the indices in the dataset
  """
  topic_dict = get_topics_from_csv(f"{data_path}/topics/{dataset_filename}-topics.csv")
  info_dict = get_comments_from_csv(f"{data_path}/labelled/{dataset_filename}")

  video_ids = topic_dict.keys()
  dataset = []
  labels = []
  for video_id in video_ids:
    for comment_data in info_dict[video_id]:
      data_sample = (comment_data[0],topic_dict[video_id])
      dataset.append(data_sample)
      labels.append(comment_data[1])

  return dataset, labels

# Prepare model and tokenizer

In [None]:
model_shortcut = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_shortcut)
model = AutoModelForSequenceClassification.from_pretrained(model_shortcut, num_labels=3)

# Add custom tokens to tokenizer
num_added_toks = tokenizer.add_tokens(['[USER]','[MAIN]','[REPLY]'], special_tokens=True)
model.resize_token_embeddings(len(tokenizer))

saved_model = DistilBertForSequenceClassification.from_pretrained("/content/drive/MyDrive/NLU Bert Spam Classification/results/model_0")

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.wei

# Import modules required for demonstration

In [None]:
!pip install pyLDAvis gensim spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation,strip_numeric
import spacy
import pyLDAvis
import pyLDAvis.gensim 
import string
import nltk
import googleapiclient.discovery
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Demonstration

In [11]:
import sys
sys.path.append('/content/drive/MyDrive/NLU Bert Spam Classification/scripts')

import ModelTopics
import GetComments
import GetChannel

def create_video_dict(video_id):
  """Create a dict mapping from video id to a dict containing data about the video, to be used as input for topic modelling
  Args:
    video_id (str): YouTube video id
  Returns:
    video_dict (dict): dict mapping from video id to a dict containing data about the video
  """
  youtube_comments, video_name = GetComments.getComments(video_id)
  channel_name = GetChannel.getChannel(video_id)
  data_dict = {'comments': youtube_comments,
               'video_name': video_name,
               'channel_name': channel_name}
  video_dict = {video_id: data_dict}
  return video_dict

# Input data here
video_link = 'https://www.youtube.com/watch?v=nqJiWbD08Yw'
username = 'professional poster'
comment = "I love Tom's narration so much, but his ability to look both young and old at the same time creates sort of an uncanny valley, which somehow even improves the experience of watching these videos"

video_id = video_link.partition("watch?v=")[2]
video_dict = create_video_dict(video_id)
topic_keywords = ", ".join(list(ModelTopics.get_topics(video_dict)[video_id]))

  and should_run_async(code)


Processing 1/1 (nqJiWbD08Yw)


In [12]:
from transformers import pipeline

# Convert input into a suitable encoded sequence
input_sequence = f"[USER] {username} {parse_comment(comment)}"
input_encoding = tokenizer(comment,topic_keywords, padding="max_length", truncation='only_first')

# Create pipeline for custom input sequences
pipe = pipeline("text-classification", model=saved_model, tokenizer=tokenizer)
pipe(input_sequence)

  and should_run_async(code)


[{'label': 'LABEL_2', 'score': 0.5994858741760254}]