# IMT 547 Project: Data Preprocessing

Chesie Yu

02/18/2024

In [1]:
# Import the libraries
import json
import random
import re
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import contractions
import nltk
from nltk.corpus import stopwords
import spacy
from spacy_langdetect import LanguageDetector

## 1. Load the Data

In [167]:
# Load the data
yt = pd.read_csv("../data/yt.csv")
yt.head(2)

Unnamed: 0,channel_id,channel_name,video_id,video_title,video_creation_time,video_description,video_tags,video_viewcount,video_likecount,video_commentcount,comment_id,comment_author_id,comment_text,comment_time,comment_likecount,comment_replycount,genre
0,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30T16:40:18Z,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,UgzEVFJJ14jr8fCd4lZ4AaABAg,UCh1NFazdOQ9VQzNgMJWsXfw,Bro it sucks I wish I could share a drink with...,2024-02-19T02:42:05Z,0,0,action
1,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30T16:40:18Z,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,UgwdCwRLsGcxL5xm0_J4AaABAg,UCRkVAxbhKWhlalEhobUKZqQ,Why is the comment section and title about Eld...,2024-02-18T09:28:23Z,0,0,action


In [168]:
# Check the dimensions
print(f"Number of rows: {yt.shape[0]}\n"
      f"Number of columns: {yt.shape[1]}\n")

# Check for missing values
print(f"Number of missing values: {yt.isna().sum().sum()}")

Number of rows: 139202
Number of columns: 17

Number of missing values: 991


<br>

### Summary Statistics

In [169]:
# Check the time range
yt["video_creation_time"].min(), yt["video_creation_time"].max()

('2011-04-22T01:05:52Z', '2024-02-18T20:15:11Z')

In [170]:
# Number of unique channels
print(f"Number of unique channels: {yt['channel_id'].nunique()}")

Number of unique channels: 33


In [171]:
# Number of unique videos
print(f"Number of unique videos: {yt['video_id'].nunique()}")

Number of unique videos: 1435


In [172]:
# Print the summary statistics
yt.describe()

Unnamed: 0,video_viewcount,video_likecount,video_commentcount,comment_likecount,comment_replycount
count,139202.0,139202.0,139202.0,139202.0,139202.0
mean,3723344.0,121080.9,7012.163252,24.945676,0.801418
std,6020486.0,169350.4,11589.935394,1259.739301,17.043648
min,10353.0,158.0,15.0,0.0,0.0
25%,667968.0,18478.0,856.0,0.0,0.0
50%,1900779.0,55614.0,2571.0,0.0,0.0
75%,4319423.0,143902.0,8426.0,0.0,0.0
max,108664100.0,1584318.0,151324.0,324733.0,679.0


<br>

## 2. Data Cleaning

### Handle Missings

In [173]:
# Check the missings
yt.isna().sum()

channel_id               0
channel_name             0
video_id                 0
video_title              0
video_creation_time      0
video_description      805
video_tags               0
video_viewcount          0
video_likecount          0
video_commentcount       0
comment_id               0
comment_author_id        0
comment_text           186
comment_time             0
comment_likecount        0
comment_replycount       0
genre                    0
dtype: int64

In [174]:
# Remove the missings
yt.dropna(inplace=True)
yt.shape

(138211, 17)

### Edit Data Types

In [175]:
# Check the data types
yt.dtypes

channel_id             object
channel_name           object
video_id               object
video_title            object
video_creation_time    object
video_description      object
video_tags             object
video_viewcount         int64
video_likecount         int64
video_commentcount      int64
comment_id             object
comment_author_id      object
comment_text           object
comment_time           object
comment_likecount       int64
comment_replycount      int64
genre                  object
dtype: object

In [176]:
# Convert to datetime
yt["video_creation_time"] = pd.to_datetime(yt["video_creation_time"])
yt["comment_time"] = pd.to_datetime(yt["comment_time"])

<br>

## 3. Text Preprocessing

### Filter English Comments

In [177]:
# # Load the SpaCy model
# # Documentation: https://pypi.org/project/spacy-langdetect/
# nlp = spacy.load("en_core_web_sm")
# nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)

# def filter_english(comment):
#     """
#     Detect English comments.  
#     """
#     doc = nlp(comment)
#     return doc._.languege["language"] == "en" and doc._.language["score"] > 0.95

# yt = yt[yt["comment_text"].apply(filter_english)]

In [178]:
yt.shape

(138211, 17)

### Text Cleaning

In [179]:
# Function for text preprocessing
def clean(text):
    """
    Performs text preprocessing steps on one document.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove contractions
    text = contractions.fix(text)
    
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Remove mentions
    text = re.sub(r"(?<![@\w])@(\w{1,25})", "", text)
    # Remove hashtags
    text = re.sub(r"(?<![#\w])#(\w{1,25})", "", text)
    # Remove new line characters
    text = re.sub("\n", " ", text)
    
    # Remove non-alphabetic characters
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text)
    
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    text = " ".join([word for word in text.split() if word not in stop_words])
    
    return text

In [180]:
# Extract the comments
comments = yt["comment_text"]
comments[:5]

0    Bro it sucks I wish I could share a drink with...
1    Why is the comment section and title about Eld...
2                    lol the wolf parts got me dead af
3           ZERO DEATHS thats a real gamer right there
4    Just beat maliketh for the first time and I’m ...
Name: comment_text, dtype: object

In [181]:
# Clean the comments
comments = comments.apply(clean)

# Remove empty comments
comments = comments[comments.str.len() > 0]

### Tokenization

In [182]:
# Import the libraries
from nltk.tokenize import word_tokenize

# Tokenize the comments
tokenized_comments = comments.apply(word_tokenize)

In [183]:
# Combine into one DataFrame
yt["cleaned_comment"] = comments
yt["tokenized_comment"] = tokenized_comments
yt.head()

Unnamed: 0,channel_id,channel_name,video_id,video_title,video_creation_time,video_description,video_tags,video_viewcount,video_likecount,video_commentcount,comment_id,comment_author_id,comment_text,comment_time,comment_likecount,comment_replycount,genre,cleaned_comment,tokenized_comment
0,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30 16:40:18+00:00,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,UgzEVFJJ14jr8fCd4lZ4AaABAg,UCh1NFazdOQ9VQzNgMJWsXfw,Bro it sucks I wish I could share a drink with...,2024-02-19 02:42:05+00:00,0,0,action,bro sucks wish could share drink man,"[bro, sucks, wish, could, share, drink, man]"
1,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30 16:40:18+00:00,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,UgwdCwRLsGcxL5xm0_J4AaABAg,UCRkVAxbhKWhlalEhobUKZqQ,Why is the comment section and title about Eld...,2024-02-18 09:28:23+00:00,0,0,action,comment section title elden ring youtube gave ...,"[comment, section, title, elden, ring, youtube..."
2,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30 16:40:18+00:00,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,Ugxeq8pABQ9xIzDRPhh4AaABAg,UC8KI7wwDi_mWdwMzeI_R9cw,lol the wolf parts got me dead af,2024-02-18 05:16:52+00:00,0,0,action,lol wolf parts got dead af,"[lol, wolf, parts, got, dead, af]"
3,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30 16:40:18+00:00,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,UgzeOXCIvTilV1aoJQ14AaABAg,UCpXe_5C-212ePvmcwN_R4iw,ZERO DEATHS thats a real gamer right there,2024-02-18 03:34:00+00:00,1,0,action,zero deaths real gamer right,"[zero, deaths, real, gamer, right]"
4,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30 16:40:18+00:00,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,UgyPRAH5Xn_5JHkIhE94AaABAg,UCNFbQi_ufDcCPRR48uGeYRA,Just beat maliketh for the first time and I’m ...,2024-02-16 22:30:04+00:00,0,0,action,beat maliketh first time really starting under...,"[beat, maliketh, first, time, really, starting..."


In [184]:
# Remove the missings
yt.dropna(inplace=True)
yt.shape

(129188, 19)

In [185]:
# Write to CSV
yt.to_csv("../data/yt_cleaned.csv", index=False)

<br>

## 4. Data Labeling

### Toxicity Annotations

In [17]:
# Import the libraries
import itertools
import logging
from googleapiclient import discovery
from googleapiclient.errors import HttpError

In [18]:
# The Perspective API keys
PERSPECTIVE_API_KEYS = [
    "AIzaSyAMpL8JpwPU4c1nEGKCiBAiGp979r6o4-4",  # perspective-api-414709
    "AIzaSyD_-Oiitvk4OL5zgvX90Nn5TcoA23TrMlM",  # perspective-api-414723
    "AIzaSyCLQ0SAdw0-xKDEqGyTcBPO7yApPF2M3R0",  # perspe-414800
    "AIzaSyDTzo_CBwQ_5zVDojWSBMnH1jI_F6rEs7s",  # precise-antenna-414801
    "AIzaSyAt70Atcrnx2bfvFuPTwtvOV8Nf2PBPx4A",  # sound-datum-414801
    "AIzaSyBgO09nuuysiO7YNqexVZiskWhJPSv5t3A",  # perspective-api-414710
    "AIzaSyBFU4rFCLaCAVuQ0i4K3QhF_f9wBV4gBm4",  # perspective-api-414800
    "AIzaSyC8kMo6iX7iXX_lj8gx8IM0LuNS8p94UA4",  # shaped-canyon-414800
    "AIzaSyAhRHCYoYkRkQkco4NzhNuKT7Zm92BKOS8",  # perspective-api-414801
    "AIzaSyCr_b9CLWmy9Rt0f0ME74ZZmh3uT6gAwpk"  # hardy-order-414801
]

def build_client(api_key):
    """
    Build a client for a given Perspective API key.
    """
    # Create a client object
    # Reference: https://developers.google.com/codelabs/setup-perspective-api#4
    client = discovery.build(
        "commentanalyzer",  # Name
        "vlalpha1",  # Version
        developerKey=api_key,
        discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
        static_discovery=False
    )
    return client

# Pre-build a client for each API key
clients = {key: build_client(key) for key in PERSPECTIVE_API_KEYS}

# Set up the iterator
api_key_iterator = itertools.cycle(PERSPECTIVE_API_KEYS)

In [19]:
# Configure logging to file
logging.basicConfig(
    filename="../logs/toxicity.log",
    level=logging.INFO,  # Log info, warning, error, critical
    format="%(asctime)s - %(levelname)s - %(message)s",
    filemode="w"  # Overwrite on each run
)

In [20]:
def perspective_toxicity(comments):
    """
    Compute Perspective toxicity scores for a given list of texts.
    Support throttling management w/ client reuse, key rotation, and 
    exponential backoff.
    """
    # Empty list to store toxicity scores
    scores = []

    # Loop through the comments
    for index, comment in enumerate(comments):
        # Specify the comment text and attributes
        analyze_request = {
            "comment": {"text": comment},
            "languages": ["en"],
            "requestedAttributes": {
                "TOXICITY": {},
                "SEVERE_TOXICITY": {},
                "IDENTITY_ATTACK": {},
                "INSULT": {},
                "PROFANITY": {},
                "THREAT": {}}
        }
        
        # Attempts allowed
        attempts_per_key = 5
        total_attempts = len(PERSPECTIVE_API_KEYS) * attempts_per_key
        # Reset attempt count for each comment
        attempt = 0
        
        # While retry attempts are not exhausted
        while attempt < total_attempts:
            # Rotate to the next API key
            current_key = next(api_key_iterator)
            client = clients[current_key]
        
            try:
                res = client.comments().analyze(body=analyze_request).execute()
                scores.append({
                    "toxicity": res["attributeScores"]["TOXICITY"]["summaryScore"]["value"],
                    "severe_toxicity": res["attributeScores"]["SEVERE_TOXICITY"]["summaryScore"]["value"],
                    "identity_attack": res["attributeScores"]["IDENTITY_ATTACK"]["summaryScore"]["value"],
                    "insult": res["attributeScores"]["INSULT"]["summaryScore"]["value"],
                    "profanity": res["attributeScores"]["PROFANITY"]["summaryScore"]["value"],
                    "threat": res["attributeScores"]["THREAT"]["summaryScore"]["value"]
                })
                logging.info(f"Success for comment #{index} with key {current_key} on attempt {attempt + 1}")
                # Break the loop if successful
                break
            
            # Http errors
            except HttpError as e:
                # Rate limit errors
                if e.resp.status == 429:
                    logging.warning(f"HTTP 429 Rate limit exceeded for comment #{index} with key '{current_key}' on attempt {attempt + 1}. Retrying with exponential backoff.")
                else:
                    logging.warning(f"HTTP error for comment #{index} with key '{current_key}' on attempt {attempt + 1}: {e}. Retrying with exponential backoff.")
            # Timeout errors
            except TimeoutError:
                logging.warning(f"TimeoutError for comment #{index} with key '{current_key}' on attempt {attempt + 1}. Retrying with exponential backoff.")
            # Unexpected errors
            except Exception as e:
                logging.warning(f"Unexpected error for comment #{index} with key '{current_key}' on attempt {attempt + 1}: {e}. Retrying with exponential backoff.")
            
            # Exponential backoff + random jitter
            sleep_time = (2 ** (attempt // len(PERSPECTIVE_API_KEYS))) + random.uniform(0, 1)
            time.sleep(sleep_time)
            attempt += 1
            
            # Check if all retry attempts are exhausted
            if attempt >= total_attempts:
                logging.error(f"Max attempts reached for comment #{index} with key {current_key}. Moving to the next comment.")

        # Sleep to avoid exceeding rate limits
        # time.sleep(0.05)
    
    # Convert to DataFrame
    toxicity_scores = pd.DataFrame(scores)
    
    return toxicity_scores

In [21]:
# %%timeit -r 1 -n 3
# Start timing
start_time = time.time()

# Compute Perspective API toxicity scores for each comment
toxicity_scores = perspective_toxicity(comments)

# End timing
print(f"Runtime: {time.time() - start_time:.4f}")
toxicity_scores.head()

Runtime: 15705.3795


Unnamed: 0,toxicity,severe_toxicity,identity_attack,insult,profanity,threat
0,0.717606,0.197615,0.066587,0.486277,0.591665,0.012544
1,0.151034,0.007553,0.017867,0.026316,0.064959,0.025099
2,0.294055,0.01845,0.021585,0.066696,0.218892,0.120762
3,0.128675,0.008316,0.011839,0.019161,0.054438,0.054624
4,0.151034,0.00618,0.016059,0.030485,0.047173,0.044279


In [186]:
# Combine into one DataFrame
for column in toxicity_scores.columns:
    yt[column] = toxicity_scores[column].values
yt.head()

Unnamed: 0,channel_id,channel_name,video_id,video_title,video_creation_time,video_description,video_tags,video_viewcount,video_likecount,video_commentcount,...,comment_replycount,genre,cleaned_comment,tokenized_comment,toxicity,severe_toxicity,identity_attack,insult,profanity,threat
0,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30 16:40:18+00:00,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,...,0,action,bro sucks wish could share drink man,"[bro, sucks, wish, could, share, drink, man]",0.717606,0.197615,0.066587,0.486277,0.591665,0.012544
1,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30 16:40:18+00:00,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,...,0,action,comment section title elden ring youtube gave ...,"[comment, section, title, elden, ring, youtube...",0.151034,0.007553,0.017867,0.026316,0.064959,0.025099
2,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30 16:40:18+00:00,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,...,0,action,lol wolf parts got dead af,"[lol, wolf, parts, got, dead, af]",0.294055,0.01845,0.021585,0.066696,0.218892,0.120762
3,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30 16:40:18+00:00,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,...,0,action,zero deaths real gamer right,"[zero, deaths, real, gamer, right]",0.128675,0.008316,0.011839,0.019161,0.054438,0.054624
4,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30 16:40:18+00:00,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,...,0,action,beat maliketh first time really starting under...,"[beat, maliketh, first, time, really, starting...",0.151034,0.00618,0.016059,0.030485,0.047173,0.044279


In [187]:
# Check the dimensions
yt.shape

(129188, 25)

In [24]:
# import requests
# import json

# # The URL for the Perspective API
# url = "https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key=" + PERSPECTIVE_API_KEY

# # The data sent to request
# data_dict = {
#     "comment": {"text": "Friendly discussion is cool, but please no personal attacks!"},
#     "languages": ["en"],
#     "requestedAttributes": {"TOXICITY": {}}
# }

# response = requests.post(url, data=json.dumps(data_dict))
# result = response.json()

# print(result)

### Sentiment Scores

In [25]:
# Import the libraries
from nltk.corpus import opinion_lexicon
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from empath import Empath

In [26]:
def vader_sentiment(text):
    """
    Compute VADER sentiment scores for a given text.
    """
    # Initialize the analyzer
    analyzer = SentimentIntensityAnalyzer()
    
    # Compute the scores
    return analyzer.polarity_scores(text)

In [27]:
# %%timeit -r 1 -n 1
# Compute VADER sentiment scores for each comment
vader_scores = comments.apply(vader_sentiment).apply(pd.Series)
vader_scores.head()

Unnamed: 0,neg,neu,pos,compound
0,0.219,0.351,0.43,0.34
1,0.0,1.0,0.0,0.0
2,0.387,0.36,0.252,-0.3612
3,0.0,1.0,0.0,0.0
4,0.0,0.815,0.185,0.3612


In [28]:
def textblob_sentiment(text):
    """
    Compute TextBlob sentiment scores for a given text.
    """
    # Initialize the analyzer
    blob = TextBlob(text)
    
    # Compute the scores
    return {"polarity": blob.sentiment.subjectivity, 
            "subjectivity": blob.sentiment.subjectivity}

In [29]:
# Compute TextBlob sentiment scores for each comment
textblob_scores = comments.apply(textblob_sentiment).apply(pd.Series)
textblob_scores.head()

Unnamed: 0,polarity,subjectivity
0,0.3,0.3
1,0.0,0.0
2,0.55,0.55
3,0.417857,0.417857
4,0.216667,0.216667


In [30]:
def empath_sentiment(text):
    """
    Compute Empath sentiment scores for a given text.
    """
    # Initialize the analyzer
    lexicon = Empath()
    
    # Compute the scores
    categories = lexicon.analyze(text, normalize=True)
    
    # Filter out the positive and negative emotions
    return {k:v for k, v in categories.items() if k in ["positive_emotion", "negative_emotion"]}

In [31]:
# Compute Empath sentiment scores for each comment
empath_scores = comments.apply(empath_sentiment).apply(pd.Series)
empath_scores.head()

Unnamed: 0,negative_emotion,positive_emotion
0,0.0,0.142857
1,0.0,0.0
2,0.166667,0.0
3,0.0,0.0
4,0.083333,0.0


In [188]:
# Combine into one DataFrame
yt = pd.concat([yt, vader_scores, textblob_scores, empath_scores], axis=1)
yt.head()

Unnamed: 0,channel_id,channel_name,video_id,video_title,video_creation_time,video_description,video_tags,video_viewcount,video_likecount,video_commentcount,...,profanity,threat,neg,neu,pos,compound,polarity,subjectivity,negative_emotion,positive_emotion
0,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30 16:40:18+00:00,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,...,0.591665,0.012544,0.219,0.351,0.43,0.34,0.3,0.3,0.0,0.142857
1,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30 16:40:18+00:00,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,...,0.064959,0.025099,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30 16:40:18+00:00,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,...,0.218892,0.120762,0.387,0.36,0.252,-0.3612,0.55,0.55,0.166667,0.0
3,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30 16:40:18+00:00,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,...,0.054438,0.054624,0.0,1.0,0.0,0.0,0.417857,0.417857,0.0,0.0
4,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,2022-04-30 16:40:18+00:00,🌏 Get exclusive NordVPN deal here ➵ https://N...,"['pewdiepie', 'pewds', 'pewdie']",11533976,472939,15131,...,0.047173,0.044279,0.0,0.815,0.185,0.3612,0.216667,0.216667,0.083333,0.0


In [189]:
# Check the dimensions
yt.shape

(129188, 33)

In [190]:
# Write to CSV
yt.to_csv("../data/yt_labeled.csv", index=False, escapechar="\\")