In [1]:
import pandas as pd
import re
from datetime import datetime
import ast

# Path to the raw scrape dump: a Python-literal list of dicts whose 'date'
# values are written as datetime.datetime(...) reprs.
file_path = 'data/youtube_title.txt'

# Read the whole file into memory
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Strip surrounding whitespace first: a trailing newline would otherwise hide
# the closing ']' from endswith() and corrupt the last record.
content = content.strip()
if content.startswith('['):
    content = content[1:]
if content.endswith(']'):
    content = content[:-1]

# Split into one text block per record (on the "}, {" between dicts)
blocks = re.split(r'\},\s*\{', content)

# Parsed records
data_list = []

# Parse each block
for block in blocks:

    block = block.strip()
    if not block:
        # Empty file / empty list: nothing to parse
        continue

    # The split removed the braces between records; put them back
    if not block.startswith('{'):
        block = '{' + block
    if not block.endswith('}'):
        block = block + '}'

    # Turn datetime.datetime(Y, M, D, ...) into the quoted string 'Y, M, D, ...'
    # so that the block becomes a pure literal
    block = re.sub(r"datetime\.datetime\((.*?)\)", r"'\1'", block)

    # Convert the text block to a dict
    try:
        data = ast.literal_eval(block)

        # repr() of a datetime omits the microsecond field when it is 0 (and
        # the second field when both are 0), so build the datetime from
        # however many integer fields are present instead of demanding all
        # seven with strptime.
        date_parts = [int(part) for part in data['date'].split(',')]
        data['date'] = datetime(*date_parts).strftime('%Y-%m-%d %H:%M:%S')

        data_list.append(data)
    except Exception as e:
        # Best effort: report the bad record and keep going
        print(f"Error parsing block: {block}")
        print(e)

# Build the DataFrame
data_youtube = pd.DataFrame(data_list)

In [2]:
# Path to the second scrape dump (same Python-literal format as the first)
file_path = 'data/youtube_title_v2.txt'

# Read the whole file into memory
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Strip surrounding whitespace first: a trailing newline would otherwise hide
# the closing ']' from endswith() and corrupt the last record.
content = content.strip()
if content.startswith('['):
    content = content[1:]
if content.endswith(']'):
    content = content[:-1]

# Split into one text block per record (on the "}, {" between dicts)
blocks = re.split(r'\},\s*\{', content)

# Parsed records
data_list = []

# Parse each block
for block in blocks:

    block = block.strip()
    if not block:
        # Empty file / empty list: nothing to parse
        continue

    # The split removed the braces between records; put them back
    if not block.startswith('{'):
        block = '{' + block
    if not block.endswith('}'):
        block = block + '}'

    # Turn datetime.datetime(Y, M, D, ...) into the quoted string 'Y, M, D, ...'
    # so that the block becomes a pure literal
    block = re.sub(r"datetime\.datetime\((.*?)\)", r"'\1'", block)

    # Convert the text block to a dict
    try:
        data = ast.literal_eval(block)

        # repr() of a datetime omits the microsecond field when it is 0 (and
        # the second field when both are 0), so build the datetime from
        # however many integer fields are present instead of demanding all
        # seven with strptime.
        date_parts = [int(part) for part in data['date'].split(',')]
        data['date'] = datetime(*date_parts).strftime('%Y-%m-%d %H:%M:%S')

        data_list.append(data)
    except Exception as e:
        # Best effort: report the bad record and keep going
        print(f"Error parsing block: {block}")
        print(e)

# Build the DataFrame
data_v2 = pd.DataFrame(data_list)

In [3]:
# Path to the Google-query scrape dump (same Python-literal format as above)
file_path = 'data/youtube_google.txt'

# Read the whole file into memory
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Strip surrounding whitespace first: a trailing newline would otherwise hide
# the closing ']' from endswith() and corrupt the last record.
content = content.strip()
if content.startswith('['):
    content = content[1:]
if content.endswith(']'):
    content = content[:-1]

# Split into one text block per record (on the "}, {" between dicts)
blocks = re.split(r'\},\s*\{', content)

# Parsed records
data_list = []

# Parse each block
for block in blocks:

    block = block.strip()
    if not block:
        # Empty file / empty list: nothing to parse
        continue

    # The split removed the braces between records; put them back
    if not block.startswith('{'):
        block = '{' + block
    if not block.endswith('}'):
        block = block + '}'

    # Turn datetime.datetime(Y, M, D, ...) into the quoted string 'Y, M, D, ...'
    # so that the block becomes a pure literal
    block = re.sub(r"datetime\.datetime\((.*?)\)", r"'\1'", block)

    # Convert the text block to a dict
    try:
        data = ast.literal_eval(block)

        # repr() of a datetime omits the microsecond field when it is 0 (and
        # the second field when both are 0), so build the datetime from
        # however many integer fields are present instead of demanding all
        # seven with strptime.
        date_parts = [int(part) for part in data['date'].split(',')]
        data['date'] = datetime(*date_parts).strftime('%Y-%m-%d %H:%M:%S')

        data_list.append(data)
    except Exception as e:
        # Best effort: report the bad record and keep going
        print(f"Error parsing block: {block}")
        print(e)

# Build the DataFrame
data_google = pd.DataFrame(data_list)

In [4]:
# Stack the three parsed frames row-wise. No ignore_index is passed, so each
# frame keeps its own index labels and labels can repeat in final_data.
final_data = pd.concat([data_youtube, data_v2, data_google], axis=0)

In [5]:
# Inspect the combined frame (one row per scraped query result)
final_data

Unnamed: 0,date,query,titles,ids
0,2024-07-30 02:10:38,Microsoft,"[I&#39;m &#39;most bullish&#39; on Microsoft, ...","[lLvInYy1mL4, d-CuF6dlqLg, ZHrayP-Y71Q, xkKSHI..."
1,2024-07-30 02:10:38,Microsoft,"[I&#39;m &#39;most bullish&#39; on Microsoft, ...","[lLvInYy1mL4, d-CuF6dlqLg, F4x_L-bCYBA, XGD0eG..."
2,2024-07-30 02:10:38,Apple,[The Underdogs: OOO (Out Of Office) | Apple at...,"[SbYckRAt5os, CPWxExGk7PM, RXeOiIDNNek, oBZT6K..."
3,2024-07-30 02:10:38,Apple,[The Underdogs: OOO (Out Of Office) | Apple at...,"[SbYckRAt5os, CPWxExGk7PM, RXeOiIDNNek, oBZT6K..."
4,2024-07-30 02:10:38,Nvidia,[Nvidia $350 Target $144 Billion Deal With Gov...,"[vEv_jjUG0ZM, 6Nr0_lZScug, SFe_LPxXtFY, bMIRhO..."
5,2024-07-30 02:10:38,Nvidia,[Nvidia $350 Target $144 Billion Deal With Gov...,"[vEv_jjUG0ZM, 6Nr0_lZScug, SFe_LPxXtFY, bMIRhO..."
6,2024-07-30 02:24:53,GOOGLE,"[www.google.com, Satisfying Google Logo Art #s...","[xi8Z-BdLuLo, m2E2VokTywc, 8MVb5hOgu0Q, Dk_kzM..."
7,2024-07-30 02:24:53,GOOGLE,"[www.google.com, 🔴 HAPPY BIRTHDAY JJ! 🎂 CoCome...","[xi8Z-BdLuLo, 8MVb5hOgu0Q, Dk_kzMUeAvQ, e4con8..."
8,2024-07-30 02:10:38,Amazon,"[I Tested 1-Star Amazon Products!, 19 Amazon I...","[95SWw6J1DxU, OJOYcLuQThk, tjobI43dbTU, wUoxbm..."
9,2024-07-30 02:10:38,Amazon,"[I Tested 1-Star Amazon Products!, Trying 100 ...","[95SWw6J1DxU, tjobI43dbTU, wUoxbm9MUc0, pzJDzI..."


In [6]:
# Expand each 'titles' list into one row per title. Only 'titles' is exploded:
# every resulting row still carries the full 'ids' list of its source row.
# NOTE(review): if per-title video ids are needed, 'ids' must be exploded in
# step with 'titles' — confirm the two lists always have equal length first.
final_data = final_data.explode('titles')

In [7]:
# Inspect the exploded frame (one row per title)
final_data

Unnamed: 0,date,query,titles,ids
0,2024-07-30 02:10:38,Microsoft,"I&#39;m &#39;most bullish&#39; on Microsoft, s...","[lLvInYy1mL4, d-CuF6dlqLg, ZHrayP-Y71Q, xkKSHI..."
0,2024-07-30 02:10:38,Microsoft,Microsoft Copilot Tutorial for Beginners,"[lLvInYy1mL4, d-CuF6dlqLg, ZHrayP-Y71Q, xkKSHI..."
0,2024-07-30 02:10:38,Microsoft,"CrowdStrike Update: Latest News, Lessons Learn...","[lLvInYy1mL4, d-CuF6dlqLg, ZHrayP-Y71Q, xkKSHI..."
0,2024-07-30 02:10:38,Microsoft,Microsoft Purview All New Unified Portal FULL...,"[lLvInYy1mL4, d-CuF6dlqLg, ZHrayP-Y71Q, xkKSHI..."
0,2024-07-30 02:10:38,Microsoft,How CrowdStrike broke Windows computers everyw...,"[lLvInYy1mL4, d-CuF6dlqLg, ZHrayP-Y71Q, xkKSHI..."
...,...,...,...,...
1,2024-07-30 02:24:53,GOOGLE,Google Logo Intro Compilation - light effects ...,"[xi8Z-BdLuLo, 8MVb5hOgu0Q, Dk_kzMUeAvQ, e4con8..."
1,2024-07-30 02:24:53,GOOGLE,Weird Cat Snake Is Real 🐍🤯 on google maps and ...,"[xi8Z-BdLuLo, 8MVb5hOgu0Q, Dk_kzMUeAvQ, e4con8..."
1,2024-07-30 02:24:53,GOOGLE,Google Google Thuppakki Movie Songs | Star - ...,"[xi8Z-BdLuLo, 8MVb5hOgu0Q, Dk_kzMUeAvQ, e4con8..."
1,2024-07-30 02:24:53,GOOGLE,Google Play Games App Install in Google Play S...,"[xi8Z-BdLuLo, 8MVb5hOgu0Q, Dk_kzMUeAvQ, e4con8..."


In [9]:
# Inspect the exploded frame again (same expression as the previous cell)
final_data

Unnamed: 0,date,query,titles,ids
0,2024-07-30 02:10:38,Microsoft,"I&#39;m &#39;most bullish&#39; on Microsoft, s...","[lLvInYy1mL4, d-CuF6dlqLg, ZHrayP-Y71Q, xkKSHI..."
0,2024-07-30 02:10:38,Microsoft,Microsoft Copilot Tutorial for Beginners,"[lLvInYy1mL4, d-CuF6dlqLg, ZHrayP-Y71Q, xkKSHI..."
0,2024-07-30 02:10:38,Microsoft,"CrowdStrike Update: Latest News, Lessons Learn...","[lLvInYy1mL4, d-CuF6dlqLg, ZHrayP-Y71Q, xkKSHI..."
0,2024-07-30 02:10:38,Microsoft,Microsoft Purview All New Unified Portal FULL...,"[lLvInYy1mL4, d-CuF6dlqLg, ZHrayP-Y71Q, xkKSHI..."
0,2024-07-30 02:10:38,Microsoft,How CrowdStrike broke Windows computers everyw...,"[lLvInYy1mL4, d-CuF6dlqLg, ZHrayP-Y71Q, xkKSHI..."
...,...,...,...,...
1,2024-07-30 02:24:53,GOOGLE,Google Logo Intro Compilation - light effects ...,"[xi8Z-BdLuLo, 8MVb5hOgu0Q, Dk_kzMUeAvQ, e4con8..."
1,2024-07-30 02:24:53,GOOGLE,Weird Cat Snake Is Real 🐍🤯 on google maps and ...,"[xi8Z-BdLuLo, 8MVb5hOgu0Q, Dk_kzMUeAvQ, e4con8..."
1,2024-07-30 02:24:53,GOOGLE,Google Google Thuppakki Movie Songs | Star - ...,"[xi8Z-BdLuLo, 8MVb5hOgu0Q, Dk_kzMUeAvQ, e4con8..."
1,2024-07-30 02:24:53,GOOGLE,Google Play Games App Install in Google Play S...,"[xi8Z-BdLuLo, 8MVb5hOgu0Q, Dk_kzMUeAvQ, e4con8..."


# 감정분석

In [10]:
from collections import OrderedDict
from transformers import pipeline, TFAutoModelForSequenceClassification, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Distinct titles as a plain list, in order of first appearance
titles = final_data['titles'].unique().tolist()
# Keep only the first row seen for each title (rebinds final_data in place)
final_data = final_data.drop_duplicates(subset=['titles'])

In [12]:
# Inspect the de-duplicated frame (one row per distinct title)
final_data

Unnamed: 0,date,query,titles,ids
0,2024-07-30 02:10:38,Microsoft,"I&#39;m &#39;most bullish&#39; on Microsoft, s...","[lLvInYy1mL4, d-CuF6dlqLg, ZHrayP-Y71Q, xkKSHI..."
0,2024-07-30 02:10:38,Microsoft,Microsoft Copilot Tutorial for Beginners,"[lLvInYy1mL4, d-CuF6dlqLg, ZHrayP-Y71Q, xkKSHI..."
0,2024-07-30 02:10:38,Microsoft,"CrowdStrike Update: Latest News, Lessons Learn...","[lLvInYy1mL4, d-CuF6dlqLg, ZHrayP-Y71Q, xkKSHI..."
0,2024-07-30 02:10:38,Microsoft,Microsoft Purview All New Unified Portal FULL...,"[lLvInYy1mL4, d-CuF6dlqLg, ZHrayP-Y71Q, xkKSHI..."
0,2024-07-30 02:10:38,Microsoft,How CrowdStrike broke Windows computers everyw...,"[lLvInYy1mL4, d-CuF6dlqLg, ZHrayP-Y71Q, xkKSHI..."
...,...,...,...,...
13,2024-07-30 02:22:52,TSLA,Every Tesla I Have Owned! 😳👀,"[QX9BDnLBdxM, hJbm0shFt2c, 9JwyQvNXlLE, F-tCGE..."
13,2024-07-30 02:22:52,TSLA,"Tesla Model 3 VS BYD Seal, #shorts","[QX9BDnLBdxM, hJbm0shFt2c, 9JwyQvNXlLE, F-tCGE..."
13,2024-07-30 02:22:52,TSLA,NEW Tesla Model 3 vs BMW i4 - Closer than you ...,"[QX9BDnLBdxM, hJbm0shFt2c, 9JwyQvNXlLE, F-tCGE..."
13,2024-07-30 02:22:52,TSLA,Is Tesla Cyberquad Worth it?,"[QX9BDnLBdxM, hJbm0shFt2c, 9JwyQvNXlLE, F-tCGE..."


In [13]:
# Checkpoint name passed to both from_pretrained calls below; the name
# indicates a DistilBERT model fine-tuned on SST-2 (English sentiment).
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
# Tokenizer and TensorFlow sequence-classification model for that checkpoint;
# both are reused by the sentiment pipelines created in later cells.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)


All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [96]:
# Load the 1-month title table
data1m = pd.read_csv("data/youtube_title_1m.csv")
# Load the 3-month table here as well: it was previously only defined by a
# later cell, so this cell raised NameError on a fresh kernel (Run All).
data3m = pd.read_csv('data/youtube_title_3m.csv')

# Title columns as plain Python lists
titles_1m = data1m['titles'].tolist()
titles_3m = data3m['titles'].tolist()

In [32]:
# Inspect the 1-month title list
titles_1m

['I&#39;m &#39;most bullish&#39; on Microsoft, says Madrona&#39;s Mike McIlwain after Big Tech&#39;s bumpy week',
 'Microsoft Copilot Tutorial for Beginners',
 'CrowdStrike Update: Latest News, Lessons Learned from a Retired Microsoft Engineer',
 'Microsoft Purview All New Unified Portal  FULL DEMO',
 'How CrowdStrike broke Windows computers everywhere. #mkbhd #waveformpodcast #microsoft',
 'Copilot for Microsoft 365 | Reimagine what’s possible',
 '🔥 Create &amp; Deploy a NextJs Full Stack Microsoft Loop 2.0 App with React, Gemini API, Clerk',
 'Why Microsoft Skipped Windows 9 #Shorts',
 '🤷\u200d♂️ What is Microsoft 365 - Explained',
 'The Microsoft 365 Copilot AI Event in Less than 3 Minutes',
 'Global tech outage: Microsoft VP explains what went wrong',
 'How the CrowdStrike-Microsoft global tech outage unfolded',
 'CrowdStrike IT Outage Explained by a Windows Developer',
 'Microsoft Flight Simulator - NEW PLANES IN AUGUST',
 'Microsoft Word for Beginners - The Complete Course',
 'Ne

In [42]:
# Build a TensorFlow-backed sentiment-analysis pipeline
sentiment_pipeline = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
    framework='tf',
    max_length=512,  # maximum token length
    truncation=True  # cut inputs that exceed max_length
)

scores = []
sent = []

# Number of titles sent to the pipeline per call
batch_size = 10

# Score the 1-month titles batch by batch, collecting one label and one score
# per title in input order.
texts = data1m['titles'].tolist()
for i in range(0, len(texts), batch_size):
    batch = texts[i:i + batch_size]
    results = sentiment_pipeline(batch)
    for result in results:
        scores.append(result['score'])
        sent.append(result['label'])

# Combine the source columns with the predictions (aligned by position)
result_1m = pd.DataFrame({'date': data1m['date'], 'stock': data1m['query'], 'contents': data1m['titles'], 'sent': sent, 'score': scores})
    

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [97]:
data3m = pd.read_csv('data/youtube_title_3m.csv')

# Each 'titles' cell is a stringified Python list. Parse it exactly once, with
# ast.literal_eval (literals only) rather than eval, which would execute any
# expression found in the file.
titles_parsed = data3m['titles'].apply(ast.literal_eval)

# One row per title. DataFrame.explode keeps 'date' and 'query' attached to
# their own titles by construction; the previous explode()/index.repeat() pair
# drifted out of step whenever a title list was empty.
expanded_df = (
    data3m.assign(titles=titles_parsed)[['date', 'query', 'titles']]
    .explode('titles')
    .dropna(subset=['titles'])  # an empty list explodes to NaN; nothing to score
    .reset_index(drop=True)
)

# Kept under its original name for any cell that still reads it
titles_expanded = expanded_df['titles']

# Keep the first row seen for each distinct title
data3m_final = expanded_df.drop_duplicates(subset='titles').reset_index(drop=True)

In [99]:
# Count the distinct titles left after de-duplication
data3m_final['titles'].nunique()

329

In [43]:
# Build a TensorFlow-backed sentiment-analysis pipeline
sentiment_pipeline = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
    framework='tf',
    max_length=512,  # maximum token length
    truncation=True  # cut inputs that exceed max_length
)

scores = []
sent = []

# Number of titles sent to the pipeline per call
batch_size = 10

# Score the same column that result_1m is assembled from. This cell used to
# score data3m['titles'] (raw stringified lists) while building the frame from
# data1m columns, so labels and scores did not belong to the rows they were
# attached to (or the frame construction failed on a length mismatch).
texts = data1m['titles'].tolist()
for i in range(0, len(texts), batch_size):
    batch = texts[i:i + batch_size]
    results = sentiment_pipeline(batch)
    for result in results:
        scores.append(result['score'])
        sent.append(result['label'])

# Combine the source columns with the predictions (aligned by position)
result_1m = pd.DataFrame({'date': data1m['date'], 'stock': data1m['query'], 'contents': data1m['titles'], 'sent': sent, 'score': scores})
    

Unnamed: 0,date,stock,contents,sent,score
0,2024-07-30 02:10:38,Microsoft,"I&#39;m &#39;most bullish&#39; on Microsoft, s...",NEGATIVE,0.929638
1,2024-07-30 02:10:38,Microsoft,Microsoft Copilot Tutorial for Beginners,NEGATIVE,0.985871
2,2024-07-30 02:10:38,Microsoft,"CrowdStrike Update: Latest News, Lessons Learn...",POSITIVE,0.990871
3,2024-07-30 02:10:38,Microsoft,Microsoft Purview All New Unified Portal FULL...,NEGATIVE,0.972434
4,2024-07-30 02:10:38,Microsoft,How CrowdStrike broke Windows computers everyw...,NEGATIVE,0.979395
...,...,...,...,...,...
394,2024-07-30 02:22:52,TSLA,The State of Tesla FSD (Mid-2024),NEGATIVE,0.974024
395,2024-07-30 02:22:52,TSLA,Tesla FSD 12.5 is Smooth as Hell,NEGATIVE,0.934127
396,2024-07-30 02:22:52,TSLA,2024 Tesla Model X PLAID: POV review &amp; top...,POSITIVE,0.991333
397,2024-07-30 02:22:52,TSLA,🔴Live Tesla &amp; Apple Signals | Free 5m Char...,NEGATIVE,0.993042


In [100]:
# Build a TensorFlow-backed sentiment-analysis pipeline
sentiment_pipeline = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
    framework='tf',
    max_length=512,  # maximum token length
    truncation=True  # cut inputs that exceed max_length
)

scores = []
sent = []

# Number of titles sent to the pipeline per call
batch_size = 10

# Score the de-duplicated 3-month titles batch by batch, collecting one label
# and one score per title in input order.
texts = data3m_final['titles'].tolist()
for i in range(0, len(texts), batch_size):
    batch = texts[i:i + batch_size]
    results = sentiment_pipeline(batch)
    for result in results:
        scores.append(result['score'])
        sent.append(result['label'])

# Combine the source columns with the predictions (aligned by position)
result_3m = pd.DataFrame({'date': data3m_final['date'], 'stock': data3m_final['query'], 'contents': data3m_final['titles'], 'sent': sent, 'score': scores})

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [101]:
# Inspect the scored 3-month titles
result_3m

Unnamed: 0,date,stock,contents,sent,score
0,2024-07-30 02:10:38,Microsoft,"I&#39;m &#39;most bullish&#39; on Microsoft, s...",NEGATIVE,0.929638
1,2024-07-30 02:10:38,Microsoft,Microsoft Copilot Tutorial for Beginners,NEGATIVE,0.985871
2,2024-07-30 02:10:38,Microsoft,Why Microsoft Skipped Windows 9 #Shorts,NEGATIVE,0.999128
3,2024-07-30 02:10:38,Microsoft,🤷‍♂️ What is Microsoft 365 - Explained,NEGATIVE,0.969988
4,2024-07-30 02:10:38,Microsoft,Global tech outage: Microsoft VP explains what...,NEGATIVE,0.999616
...,...,...,...,...,...
324,2024-07-30 02:22:52,TSLA,Tesla FSD 12.5 is a HUGE Leap Forward (First I...,POSITIVE,0.999690
325,2024-07-30 02:22:52,TSLA,Is Tesla Cyberquad Worth it?,POSITIVE,0.941680
326,2024-07-30 02:22:52,TSLA,What’s Next for Tesla ROBOTAXI | Warren Redlich,POSITIVE,0.956601
327,2024-07-30 02:22:52,TSLA,2024 Tesla Model 3 ASMR 🔥,POSITIVE,0.514250


In [102]:
# Persist the scored title tables, one file per look-back window
result_1m.to_csv("result/youtube_title_1m.csv")
# The 3-month result was previously written to "youtube_title_2m.csv", which
# matches neither its input (youtube_title_3m.csv) nor the 1m/3m naming used
# for every other output in this notebook.
# NOTE(review): update any downstream reader that still expects the 2m name.
result_3m.to_csv("result/youtube_title_3m.csv")

# comments

In [104]:
import pandas as pd
import ast

# Comment dump to load: a Python-literal list of comment lists
file_path = 'data/youtube/alphabet/comment-1m.txt'

# Slurp the file
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Evaluate the text as a Python literal
data = ast.literal_eval(content)

# Flatten: every comment of every inner block becomes its own entry
flattened_data = []
for block in data:
    flattened_data.extend(block)

# One comment per row
df = pd.DataFrame(flattened_data, columns=['comment'])

In [123]:
######## Sentiment analysis of the YouTube comments for one stock ###########

# Default root of the per-stock comment dumps. This is the path that used to be
# hard-coded inside youtube_comments(); pass base_dir= to run anywhere else.
_DEFAULT_COMMENT_DIR = '/Users/jiheelee/Desktop/2024-1/Stock-info-archive/data/youtube'

# Holds the lazily-built default pipeline so the model is loaded once per
# kernel instead of once per youtube_comments() call.
_SENTIMENT_PIPELINE_CACHE = {}


def _load_flat_comments(path):
    """Read a Python-literal list of comment lists from `path` and flatten it."""
    with open(path, 'r', encoding='utf-8') as file:
        nested = ast.literal_eval(file.read())
    return [comment for block in nested for comment in block]


def _default_sentiment_pipeline():
    """Return the shared TensorFlow sentiment pipeline, building it on first use."""
    if 'pipeline' not in _SENTIMENT_PIPELINE_CACHE:
        model_name = "distilbert-base-uncased-finetuned-sst-2-english"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
        _SENTIMENT_PIPELINE_CACHE['pipeline'] = pipeline(
            'sentiment-analysis',
            model=model,
            tokenizer=tokenizer,
            framework='tf',
            max_length=512,  # maximum token length
            truncation=True  # cut inputs that exceed max_length
        )
    return _SENTIMENT_PIPELINE_CACHE['pipeline']


def _score_comments(comments, sentiment_pipeline, batch_size):
    """Score `comments` in batches and return a comments/sent/score DataFrame."""
    sent = []
    scores = []
    for i in range(0, len(comments), batch_size):
        for result in sentiment_pipeline(comments[i:i + batch_size]):
            sent.append(result['label'])
            scores.append(result['score'])
    return pd.DataFrame({'comments': comments, 'sent': sent, 'score': scores})


def youtube_comments(stock_name, base_dir=_DEFAULT_COMMENT_DIR,
                     sentiment_pipeline=None, batch_size=10):
    """Run sentiment analysis on the 1-month and 3-month comment dumps of a stock.

    Reads `<base_dir>/<stock_name>/comment-1m.txt` and `comment-3m.txt`, each a
    Python-literal list of comment lists, flattens them, and scores every
    comment.

    Args:
        stock_name: Name of the stock's sub-directory under `base_dir`.
        base_dir: Root directory of the comment dumps. Defaults to the
            location that was previously hard-coded.
        sentiment_pipeline: Optional callable taking a list of texts and
            returning one dict with 'label' and 'score' per text. When None,
            the DistilBERT SST-2 pipeline is built once and reused.
        batch_size: Number of comments passed to the pipeline per call.

    Returns:
        (result1, result3): DataFrames with columns 'comments', 'sent' and
        'score' for the 1-month and the 3-month file respectively. An input
        with no comments yields an empty frame.

    Raises:
        OSError: If either comment file cannot be opened.
        ValueError, SyntaxError: If a file is not a valid Python literal.
    """
    # Same path layout as before, only with the root made configurable
    file_path1 = f'{base_dir}/{stock_name}/comment-1m.txt'
    file_path3 = f'{base_dir}/{stock_name}/comment-3m.txt'

    # Read both files before any model work so a missing file fails fast
    data1_ = _load_flat_comments(file_path1)
    data3_ = _load_flat_comments(file_path3)

    if sentiment_pipeline is None:
        sentiment_pipeline = _default_sentiment_pipeline()

    result1 = _score_comments(data1_, sentiment_pipeline, batch_size)
    result3 = _score_comments(data3_, sentiment_pipeline, batch_size)

    return result1, result3

In [124]:
# Score the 1-month and 3-month comment dumps of each stock. The argument is
# the stock's directory name under the comment data root; each call returns
# the (1-month, 3-month) result frames.
youtube_com_GOOGL1, youtube_com_GOOGL3 = youtube_comments('alphabet')
youtube_com_AMZN1, youtube_com_AMZN3 = youtube_comments('amazon')
youtube_com_AAPL1, youtube_com_AAPL3 = youtube_comments('apple')
youtube_com_META1, youtube_com_META3 = youtube_comments('meta')
youtube_com_MSFT1, youtube_com_MSFT3 = youtube_comments('microsoft')
# NOTE(review): 'ndvia' looks like a misspelling of 'nvidia' — presumably it
# matches the actual directory name on disk; confirm before renaming.
youtube_com_NVDA1, youtube_com_NVDA3 = youtube_comments('ndvia')
youtube_com_TSLA1, youtube_com_TSLA3 = youtube_comments('tesla')

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipe

In [125]:
# Write every scored comment table to result/youtube_com_<window>_<TICKER>.csv,
# 1-month file first, then 3-month, in the ticker order listed here.
scored_by_ticker = {
    'GOOGL': (youtube_com_GOOGL1, youtube_com_GOOGL3),
    'AAPL': (youtube_com_AAPL1, youtube_com_AAPL3),
    'AMZN': (youtube_com_AMZN1, youtube_com_AMZN3),
    'META': (youtube_com_META1, youtube_com_META3),
    'MSFT': (youtube_com_MSFT1, youtube_com_MSFT3),
    'NVDA': (youtube_com_NVDA1, youtube_com_NVDA3),
    'TSLA': (youtube_com_TSLA1, youtube_com_TSLA3),
}
for ticker, (scored_1m, scored_3m) in scored_by_ticker.items():
    scored_1m.to_csv(f"result/youtube_com_1m_{ticker}.csv")
    scored_3m.to_csv(f"result/youtube_com_3m_{ticker}.csv")