In [2]:
from IPython.display import display, HTML

# Stylesheet applied to the notebook's output areas: `white-space: pre` keeps
# preformatted output on its original lines instead of wrapping it.
NO_WRAP_OUTPUT_CSS = """
<style>
    div.jp-OutputArea-output pre {
        white-space: pre
    }
</style>
"""

display(HTML(NO_WRAP_OUTPUT_CSS))
# Delete all files from the working directory
# !rm -rf /kaggle/working/*
# !rm -rf /kaggle/working/tuned_distilbert_for_QSO_2

In [None]:
!pip install datasets

In [None]:
#@title Get all comments from a list of videos and add them to a DataFrame

from googleapiclient.discovery import build
import pandas as pd


def get_all_comments(video_id, api_key):
    """Collect every top-level comment thread of one YouTube video.

    A YouTube Data API v3 client is built with ``api_key`` and the
    ``commentThreads`` listing is requested page by page (100 threads per
    request, plain-text format) until a response carries no ``nextPageToken``.

    Args:
        video_id: ID of the video whose comment threads are listed.
        api_key: Developer key passed to the API client.

    Returns:
        A list with one row per thread, each row being
        ``[text, comment_id, reply_count, like_count, published_at, video_id]``.
        ``like_count`` falls back to 0 when the API omits ``likeCount``.
    """
    client = build('youtube', 'v3', developerKey=api_key)
    rows = []
    page_token = None  # the first request is sent without a page token
    has_more_pages = True

    while has_more_pages:
        page = client.commentThreads().list(
            part='snippet',
            videoId=video_id,
            textFormat='plainText',
            maxResults=100,
            pageToken=page_token,
        ).execute()

        for thread in page.get('items', []):
            thread_info = thread['snippet']
            top_comment = thread_info['topLevelComment']
            details = top_comment['snippet']
            row = [
                details['textDisplay'],
                top_comment['id'],
                thread_info['totalReplyCount'],
                details.get('likeCount', 0),
                details['publishedAt'],
                video_id,
            ]
            rows.append(row)

        # A missing (or empty) token marks the last page.
        page_token = page.get('nextPageToken')
        has_more_pages = bool(page_token)

    return rows

def get_comments_from_videos(video_ids, api_key):
    """Fetch the comment rows of several videos and return them as one flat list.

    Args:
        video_ids: Iterable of video IDs, processed in the given order.
        api_key: Developer key forwarded to ``get_all_comments`` for every video.

    Returns:
        The rows returned by ``get_all_comments`` for each video, concatenated
        in video order.
    """
    return [
        row
        for video_id in video_ids
        for row in get_all_comments(video_id, api_key)
    ]

# ['vtXuW0JTfig']
# SECURITY: the YouTube Data API key is read from the environment instead of being
# hardcoded, so it is never saved with the notebook. Any key that was previously
# embedded in this cell must be treated as leaked and revoked.
import os

api_key = os.environ.get('YOUTUBE_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API v3 key "
        "before running this cell."
    )

# Videos whose top-level comments are collected.
video_ids = [
    "6c-VD86TKoU", "jx2dDV2eWBM", "TvN_lUFYXMU", "sTeoEFzVNSc", "SLwpqD8n3d0", "1-hk3JaGlSU",
    "dHlDAhARLxo", "Kl09iSWvEBk", "r16Rn4_jDfk", "bjFvcFjJpE0", "p-h1LpM1xm4", "GxmfcnU3feo",
    "t9CAFYn7YgY", "F2pEQlUmKWc", "FhqNN1LykWU", 'P6FORpg0KVo', '7bA0gTroJjw', "7Xnr805bm4E",
    "Mr2f4MxGmA8", "SHhJ1RqWl-k", "HDhlXPBXwFA", "ELxGmf9f_ZM", "k4715CJ0Ii8", "GQkY6jsn1GU",
]
all_comments = get_comments_from_videos(video_ids, api_key)

# One row per comment; column order matches the rows built by get_all_comments.
df = pd.DataFrame(all_comments, columns=['Comments', 'Comment_ID', 'Reply_Count', 'Like_Count', 'Date', 'VidId'])
df.to_csv('youtube_comments.csv', index=False)

print(f"Total comments fetched: {len(all_comments)}")
df.head(5)

In [None]:
#@title Preprocess the comments to remove all unwanted characters
import re

def pre_processing_comments(text):
    """Normalise a raw comment into lowercase, ASCII-only, single-spaced text.

    Steps, in order:
      1. Convert the input to ``str`` and lowercase it.
      2. Delete the punctuation/special characters ``-()"#/@;:<>{}`+=~|.!,'’``
         (question marks are kept).
      3. Replace every run of non-ASCII characters with a space.
      4. Collapse every run of spaces, tabs, carriage returns and newlines into
         one space. This runs last because step 3 and line breaks introduce new
         spaces, which would otherwise leave multi-space gaps in the result.
      5. Strip leading/trailing whitespace.

    Args:
        text: The raw comment; any value is accepted and passed through ``str``.

    Returns:
        The cleaned comment string.
    """
    text = str(text).lower()
    # One character class covers every punctuation mark that has to be removed.
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!,'’]", "", text)
    # Emoji and other non-ASCII runs become a separator rather than vanishing.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Collapse whitespace only after all space-producing substitutions are done.
    text = re.sub(r'[ \r\n\t]+', ' ', text)
    return text.strip()

# Build the cleaned text column from the raw comments, then write the enriched
# table back over the earlier CSV export.
cleaned_comments = df['Comments'].apply(pre_processing_comments)
df['Clean_Comments'] = cleaned_comments
df.to_csv('youtube_comments.csv', index=False)

df.head(5)


In [None]:
# Load a zero-shot-classification model to classify text into the desired categories.
# Zero-shot classification is slow on CPU, so using a GPU is recommended.
# This kind of model classifies text without specific training for the target classes.


import os
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from torch.cuda.amp import autocast
from accelerate import Accelerator


# Checkpoint used for zero-shot classification, with its tokenizer and weights.
model_name = "tasksource/deberta-base-long-nli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Pipeline device index: GPU 0 when CUDA is available, otherwise -1 (CPU).
if torch.cuda.is_available():
    device = 0
else:
    device = -1

classifier = pipeline(
    "zero-shot-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# Pass the pipeline through Accelerate.
# NOTE(review): Accelerator.prepare is documented for models, optimizers,
# dataloaders and schedulers; whether it changes anything for a pipeline object
# is unconfirmed - verify before relying on it for a speed-up.
accelerator = Accelerator()
classifier = accelerator.prepare(classifier)

In [None]:
def classify_comment_label(label):
    """Map a fine-grained zero-shot label onto one of three coarse classes.

    Args:
        label: The label to translate (matching is case-sensitive).

    Returns:
        ``"Question"``, ``"Suggestion"`` or ``"Other"``; ``None`` when the label
        belongs to none of the known groups.
    """
    label_groups = (
        ("Question", ("question", "inquiry", "query", "appeal")),
        ("Suggestion", ("suggest", "request", "advice", "opinion", "instruction")),
        ("Other", ("problem", "appreciation", "other", "comment", "Observation",
                   "Complaint", "Discussion", "Experience", "spam")),
    )
    for category, members in label_groups:
        if label in members:
            return category
    return None

def is_question_as_well_as_suggestion(label1, label2):
    """Tell whether two labels pair a question-type label with a suggestion-type one.

    Args:
        label1: First label (e.g. the top-ranked prediction).
        label2: Second label (e.g. the runner-up prediction).

    Returns:
        ``True`` when one label is in ``question``/``inquiry`` and the other is in
        ``suggest``/``request``/``instruction`` (in either order), else ``False``.
    """
    question_like = ("question", "inquiry")
    suggestion_like = ("suggest", "request", "instruction")

    # The two groups are disjoint, so the first label decides which group the
    # second label has to belong to.
    if label1 in question_like:
        return label2 in suggestion_like
    if label1 in suggestion_like:
        return label2 in question_like
    return False

# Load comments
dataSet_path = 'labeled_comments_dataset.csv'
# Missing or blank comments are replaced by a placeholder so every row still gets
# a label. str() is applied before .strip() so a non-string cell cannot raise
# AttributeError.
comments = [
    str(comment) if pd.notna(comment) and str(comment).strip() else "Empty Comment"
    for comment in df['Clean_Comments'].to_list()
]
# Print a short preview only; dumping every comment floods the cell output.
print("  Fetched Comments length:", len(comments), "| first comments:", comments[:5])



In [None]:
#@title Predict the Class for the comment 
# Allocator setting meant to limit memory fragmentation and out-of-memory errors.
# NOTE(review): this is set after the model cell has run; the setting may only be
# honoured if it is in place before CUDA is first used - confirm.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

# Candidate labels for the first classification pass. Over 100 keywords were
# tried; this set was selected as giving the most accurate results.
primary_labels = [
    "question", "inquiry", "suggest", "request", "problem", "appreciation", "other",
    "comment", "Observation", "Complaint", "Discussion", "Experience", "spam",
]
# Candidate labels for the follow-up pass on question-vs-suggestion comments.
secondary_labels = [
    "question", "inquiry", "query", "appeal", "suggest",
    "advice", "request", "opinion", "instruction",
]
dataSet_path = 'labeled_comments_dataset.csv'
# os.remove(dataSet_path)


def classify_batch(batch):
    """Label one batch of comments and append the result to the dataset CSV.

    Every comment is first scored against ``primary_labels``. When the two
    top-ranked labels form a question/suggestion pair, the comment is scored a
    second time against ``secondary_labels`` and that second result is the one
    used. (Previously the re-checked results were computed and then overwritten
    by the first-pass results, so the second pass never had any effect.)

    Args:
        batch: Mapping with a ``"comments"`` entry holding the comment strings
            of the batch.

    Returns:
        ``{"comments": [...], "classification": [...]}`` where each
        classification is the value returned by ``classify_comment_label`` for
        the comment's top-ranked label.

    Side effects:
        Appends the batch to ``dataSet_path`` (header written only when the file
        does not exist yet) and releases cached CUDA memory when CUDA is
        available.
    """
    with autocast():  # run both passes under automatic mixed precision
        batch_results = classifier(batch["comments"], candidate_labels=primary_labels)
        # Normalise a lone result dict into a list so the code below can iterate.
        if isinstance(batch_results, dict):
            batch_results = [batch_results]

        # Second opinion for comments that look like both a question and a suggestion.
        rechecked_batch_results = [
            classifier(result["sequence"], candidate_labels=secondary_labels)
            if is_question_as_well_as_suggestion(result["labels"][0], result["labels"][1])
            else result
            for result in batch_results
        ]

    classify_comments = [result["sequence"] for result in rechecked_batch_results]  # e.g. ["comment1", "comment2"]
    classification_label = [
        classify_comment_label(result["labels"][0]) for result in rechecked_batch_results
    ]  # e.g. ["Question", "Other"]

    # Save each batch right away so finished work survives an interrupted run.
    batch_df = pd.DataFrame({"Comments": classify_comments, "Classification": classification_label})
    batch_df.to_csv(
        dataSet_path,
        mode='a',
        index=False,
        quoting=1,  # csv.QUOTE_ALL
        header=not os.path.exists(dataSet_path),
    )

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return {"comments": classify_comments, "classification": classification_label}


# Wrap the comments in a Hugging Face Dataset and run classify_batch over it.
dataset = Dataset.from_dict({"comments": comments})
classified_dataset = dataset.map(
    classify_batch,
    batched=True,
    batch_size=1,  # 1 to 30
)

print("Classification completed and saved.")


In [None]:
#@title Load the saved dataset
import pandas as pd
from collections import Counter

dframe = pd.read_csv('labeled_comments_dataset.csv')

# Report how often each label occurs, then the Python type of the first label.
class_counts = Counter(dframe['Classification'])
print(f"Counting the total Entries in Classification colume: {class_counts}")

first_label = dframe['Classification'][0]
print( "\nType of Entries in Classification colume", type(first_label) )
dframe.head()


In [None]:
#Balancing the majority class in the imbalanced dataset
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Request 3000 rows each for the 'Other' and 'Question' classes (seeded sampler).
# NOTE(review): an under-sampler presumably cannot honour a target larger than
# the class size - confirm both classes hold at least 3000 rows.
undersampler = RandomUnderSampler(
    sampling_strategy={'Other': 3000, 'Question': 3000},
    random_state=50,
)

# The comment text is passed as a single-column 2-D array.
X = dframe['Comments'].values.reshape(-1, 1)
y = dframe['Classification']
X_under, y_under = undersampler.fit_resample(X, y)

dframe = pd.DataFrame({'Comments': X_under.flatten(), 'Classification': y_under})
print(f"Class distribution after undersampling: {Counter(y_under)}")

In [None]:
#Balancing the minority class in the imbalanced dataset
# Request 3000 rows for the 'Suggestion' class from the under-sampled data (seeded).
oversampler = RandomOverSampler(
    sampling_strategy={'Suggestion': 3000},
    random_state=50,
)
X_over, y_over = oversampler.fit_resample(X_under, y_under)

# Rebuild a two-column frame from the resampled arrays.
balanced_df = pd.DataFrame({
    'Comments': X_over.flatten(),
    'Classification': y_over,
})

print(f"Class distribution after oversampling: {Counter(y_over)}")
balanced_df.head()

In [None]:
#@title Shuffle the DataFrame
# Draw every row in a seeded random order, then renumber the index from zero.
rows_in_random_order = balanced_df.sample(frac=1, random_state=20)
shuffled_balaced_dframe = rows_in_random_order.reset_index(drop=True)
shuffled_balaced_dframe.head()

In [None]:
#@title Convert the Class type from string to int64
# Encode the class names as integers. The mapping is applied explicitly and the
# column is cast to int64, instead of relying on chained .replace() calls to
# downcast an object column implicitly (pandas has deprecated that downcasting).
# Values that are not class names (e.g. already-encoded integers when the cell is
# re-run) are passed through unchanged; any other unexpected value makes the cast
# fail loudly rather than leaving a mixed-type column.
label_to_id = {"Other": 0, "Question": 1, "Suggestion": 2}
shuffled_balaced_dframe['Classification'] = (
    shuffled_balaced_dframe['Classification']
    .map(lambda label: label_to_id.get(label, label))
    .astype('int64')
)

print( "Dataframe colume items type", type(shuffled_balaced_dframe['Classification'][0]) )
shuffled_balaced_dframe.to_csv( "balanced_labeled_comments_dataset_with_int64.csv", index=False)

shuffled_balaced_dframe.head()