In [11]:
import os
import requests
from dotenv import load_dotenv
import pandas as pd
from datetime import datetime, timedelta
import isodate
import time
from datetime import datetime
import pytz
from pandas import Timestamp
import numpy as np
from scipy import optimize
import plotly.graph_objects as go
import plotly.io as pio

# Load environment variables
load_dotenv()

API_KEY = os.getenv("YOUTUBE_KEY")
BASE_URL = "https://www.googleapis.com/youtube/v3"

# Display the DataFrame
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 100)

def get_channel_id_from_username(username, api_key):
    channel_username = username.lstrip('@')
    api_url = f"https://www.googleapis.com/youtube/v3/search?part=snippet&type=channel&q={channel_username}&key={api_key}"
    response = requests.get(api_url)
    data = response.json()
    if 'items' in data and data['items']:
        return data['items'][0]['snippet']['channelId']
    else:
        return None

def get_channel_upload_playlist_id(channel_id):
    url = f"{BASE_URL}/channels?part=contentDetails&id={channel_id}&key={API_KEY}"
    response = requests.get(url)
    data = response.json()
    if 'error' in data:
        raise ValueError(f"API Error: {data['error']['message']}")
    items = data.get('items', [])
    if not items:
        raise ValueError(f"No channel found for ID: {channel_id}")
    return items[0]['contentDetails']['relatedPlaylists']['uploads']

def get_latest_channel_videos(channel_id, max_videos=100):
    video_ids = []
    playlist_id = get_channel_upload_playlist_id(channel_id)
    next_page_token = None
    
    while len(video_ids) < max_videos:
        url = f"{BASE_URL}/playlistItems?part=snippet&playlistId={playlist_id}&maxResults=50&key={API_KEY}"
        if next_page_token:
            url += f"&pageToken={next_page_token}"
        
        response = requests.get(url)
        data = response.json()
        
        if 'error' in data:
            raise ValueError(f"API Error: {data['error']['message']}")
        
        items = data.get('items', [])
        if not items:
            break
        
        for item in items:
            video_ids.append(item['snippet']['resourceId']['videoId'])
            if len(video_ids) >= max_videos:
                break
        
        if len(video_ids) >= max_videos:
            break
        
        next_page_token = data.get('nextPageToken')
        if not next_page_token:
            break
        
        print(f"Videos found so far: {len(video_ids)}")
        time.sleep(1)  # To avoid hitting API rate limits
    
    print(f"Total video IDs found: {len(video_ids)}")
    return video_ids[:max_videos]

def get_video_details(video_ids):
    videos = []
    for i in range(0, len(video_ids), 50):
        batch = video_ids[i:i+50]
        url = f"{BASE_URL}/videos?part=snippet,contentDetails,statistics&id={','.join(batch)}&key={API_KEY}"
        response = requests.get(url)
        data = response.json()
        
        if 'error' in data:
            raise ValueError(f"API Error: {data['error']['message']}")
        
        items = data.get('items', [])
        if not items:
            print(f"No details found for video IDs: {batch}")
            continue
        
        for item in items:
            try:
                duration = isodate.parse_duration(item['contentDetails']['duration'])
                published_at = item['snippet'].get('publishedAt')
                if published_at:
                    published_at = pd.to_datetime(published_at).tz_convert('US/Pacific')
                video = {
                    'videoId': item['id'],
                    'title': item['snippet']['title'],
                    'publishedAt': published_at,
                    'viewCount': int(item['statistics'].get('viewCount', 0)),
                    'likeCount': int(item['statistics'].get('likeCount', 0)),
                    'commentCount': int(item['statistics'].get('commentCount', 0)),
                    'duration': str(duration),
                    'description': item['snippet']['description'],
                    'tags': ', '.join(item['snippet'].get('tags', [])),
                    'thumbnailUrl': item['snippet']['thumbnails']['default']['url']
                }
                videos.append(video)
            except KeyError as e:
                print(f"KeyError for video {item.get('id', 'Unknown')}: {str(e)}")
        
        time.sleep(1)  # To avoid hitting API rate limits
    
    return videos

def fetch_latest_channel_videos(channel_id, max_videos=100):
    try:
        video_ids = get_latest_channel_videos(channel_id, max_videos)
        if not video_ids:
            print("No video IDs found.")
            return pd.DataFrame()
        
        videos = get_video_details(video_ids)
        if not videos:
            print("No video details found.")
            return pd.DataFrame()
        
        df = pd.DataFrame(videos)
        return df
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return pd.DataFrame()

def process_channels(channel_usernames, max_videos_per_channel=100):
    all_videos = []
    for username in channel_usernames:
        print(f"Processing channel: {username}")
        channel_id = get_channel_id_from_username(username, API_KEY)
        if channel_id:
            df = fetch_latest_channel_videos(channel_id, max_videos_per_channel)
            if not df.empty:
                df['channel'] = username
                all_videos.append(df)
        else:
            print(f"Could not find channel ID for username: {username}")
    
    if all_videos:
        combined_df = pd.concat(all_videos, ignore_index=True)
        combined_df = combined_df.sort_values('publishedAt', ascending=False)
        
        print(f"Total videos fetched: {len(combined_df)}")
        combined_df.to_csv("yt_channels_latest_videos.csv", index=False)
        return combined_df
    else:
        print("No videos found for any channels.")
        return pd.DataFrame()


def parse_date(date_value):
    if pd.isna(date_value):
        return pd.NaT
    
    if isinstance(date_value, (int, float)):
        return pd.NaT
    
    try:
        # Try parsing as ISO format first
        date = datetime.fromisoformat(date_value.replace('Z', '+00:00'))
        return date.astimezone(pytz.UTC)
    except ValueError:
        try:
            # Try parsing with explicit format
            parts = date_value.split()
            if len(parts) == 3:
                date_part, time_part, tz_part = parts
                dt = datetime.strptime(f"{date_part} {time_part}", "%Y-%m-%d %H:%M:%S")
                
                if tz_part in ['PDT', 'PST']:
                    tz = pytz.timezone('US/Pacific')
                else:
                    tz = pytz.timezone('UTC')
                
                return tz.localize(dt).astimezone(pytz.UTC)
            elif len(parts) == 2:
                # Handle case where there's no timezone
                date_part, time_part = parts
                dt = datetime.strptime(f"{date_part} {time_part}", "%Y-%m-%d %H:%M:%S")
                return pytz.UTC.localize(dt)
            else:
                # If we can't parse it, return NaT
                return pd.NaT
        except Exception:
            # If all parsing attempts fail, return NaT
            return pd.NaT

    
def safe_duration_to_seconds(x):
    if pd.isna(x):
        return np.nan
    if isinstance(x, (int, float)):
        return x  # Assume it's already in seconds
    try:
        parts = x.split(':')
        return sum(int(part) * 60**i for i, part in enumerate(reversed(parts)))
    except:
        return np.nan 

# Function to fit for the curved line
def curve_func(x, a, b, c):
    return a * (x ** b) + c


In [23]:
import requests
import os
from dotenv import load_dotenv
import json
import re

def get_top_tech_review_channels():
    # Load environment variables
    load_dotenv()

    # Access the API key
    api_key = os.environ.get("PERPLEXITY_API_KEY")

    if not api_key:
        raise ValueError("PERPLEXITY_API_KEY not found in environment variables")

    url = "https://api.perplexity.ai/chat/completions"

    payload = {
        "model": "llama-3.1-sonar-huge-128k-online",
        "messages": [
            {
                "role": "system",
                "content": "Be precise and concise. Provide the answer as a Python list of YouTube channel usernames as they appear at the end of the URL."
            },
            {
                "role": "user",
                "content": "What are the best YouTube channels that provide the best product reviews for tech gadgets in terms of popularity and likes? Format the output as a Python list of channel usernames as they appear at the end of the URL (e.g., ['mkbhd', 'unboxtherapy'])."
            }
        ],
        "max_tokens": 1024,
        "temperature": 0.2,
        "top_p": 0.9,
        "return_citations": True,
        "return_images": False,
        "return_related_questions": False,
        "stream": False
    }

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    try:
        response = requests.post(url, json=payload, headers=headers)
        response.raise_for_status()  # Raises a HTTPError if the status is 4xx, 5xx

        result = response.json()
        content = result['choices'][0]['message']['content']
        
        # Extract the list from the content using regex
        match = re.search(r'\[.*?\]', content, re.DOTALL)
        if match:
            channel_list = match.group(0)
            return eval(channel_list)  # Safely evaluate the string as a Python list
        else:
            return []  # Return an empty list if no match is found
    
    except requests.RequestException as e:
        print(f"An error occurred while making the API request: {e}")
        return []
    except (KeyError, IndexError) as e:
        print(f"An error occurred while parsing the API response: {e}")
        return []
    except SyntaxError as e:
        print(f"An error occurred while evaluating the channel list: {e}")
        return []


channel_usernames = get_top_tech_review_channels()
print("channel_usernames =", channel_usernames)

channel_usernames = ['unboxtherapy', 'mkbhd', 'mrwhosetheboss', 'linustechtips', 'austinevans', 'uravgconsumer', 'mrmobile', 'flossycarter', 'techtablets', 'phonebuff']


In [None]:
# List of channel usernames
channel_usernames = ["mkbhd", "unboxtherapy", "LinusTechTips", "Mrwhosetheboss", "UrAvgConsumer", "austinevans", "TechnicalGuruji", "TLD", "technobuffalo"]  # Add more channels as needed

# Process all channels
result_df = process_channels(channel_usernames, max_videos_per_channel=100)

# Display the first row of the result
result_df.head(1)

In [9]:
# df = result_df
df = pd.read_csv("yt_channels_latest_videos.csv")

# Apply the modified function
df['publishedAt'] = df['publishedAt'].apply(parse_date)

# Find the most recent date in the dataset
most_recent_date = df['publishedAt'].max()

# Calculate hours since published relative to the most recent date
df['hours_since_published'] = (most_recent_date - df['publishedAt']).dt.total_seconds() / 3600

# Calculate likes per view
df['likes_per_view'] = df['likeCount'] / df['viewCount']

df['commentCount'] = pd.to_numeric(df['commentCount'], errors='coerce')
df['viewCount'] = pd.to_numeric(df['viewCount'], errors='coerce')

# Now calculate comments per view
df['comments_per_view'] = df['commentCount'] / df['viewCount']

# If you also need to calculate views_by_hr, make sure 'hours_since_published' is numeric too
df['hours_since_published'] = pd.to_numeric(df['hours_since_published'], errors='coerce')
df['views_by_hr'] = df['viewCount'] / df['hours_since_published']
df.head(1)

Unnamed: 0,videoId,title,publishedAt,viewCount,likeCount,commentCount,duration,description,tags,thumbnailUrl,channel,hours_since_published,likes_per_view,comments_per_view,views_by_hr
0,uiTDtKFh3pE,The WIRELESS Gaming PC,2024-09-14 17:03:03+00:00,16877.0,1172.0,38.0,0:00:47,,,https://i.ytimg.com/vi/uiTDtKFh3pE/default.jpg,austinevans,0.0,0.069444,0.002252,inf


In [12]:


one_month_ago = datetime.now(pytz.UTC) - timedelta(days=30)
df_filtered = df[
    (pd.to_datetime(df['publishedAt'], utc=True) < one_month_ago) &
    (df['duration'].apply(safe_duration_to_seconds) > 120)
]

df_filtered['duration_seconds'] = df_filtered['duration'].apply(safe_duration_to_seconds)

# Normalize duration for marker size (smallest stays the same, largest is 9 times as large)
min_duration = df_filtered['duration_seconds'].min()
max_duration = df_filtered['duration_seconds'].max()
normalized_duration = (df_filtered['duration_seconds'] - min_duration) / (max_duration - min_duration)
marker_sizes = 5 + normalized_duration * 40  # Scale from 5 to 45

# Log normalize the number of comments
df_filtered['log_comments'] = np.log1p(df_filtered['commentCount'])

# Calculate combined metric for top 5%
df_filtered['combined_metric'] = df_filtered['views_by_hr'] * df_filtered['likes_per_view']
threshold_value = df_filtered['combined_metric'].quantile(0.95)

# Fit the curve
x_data = df_filtered['views_by_hr']
y_data = df_filtered['likes_per_view']
popt, _ = optimize.curve_fit(curve_func, x_data[df_filtered['combined_metric'] > threshold_value], 
                             y_data[df_filtered['combined_metric'] > threshold_value], 
                             p0=[1, -0.5, 0.001], maxfev=10000)

# Add some jitter to reduce overlapping
jitter_x = np.random.normal(0, 0.01, len(df_filtered))
jitter_y = np.random.normal(0, 0.01, len(df_filtered))

# Create the figure
fig = go.Figure()

# Add scatter plot
scatter = go.Scatter(
    x=df_filtered['views_by_hr'] * (1 + jitter_x),
    y=df_filtered['likes_per_view'] * (1 + jitter_y),
    mode='markers',
    marker=dict(
        size=marker_sizes,
        color=df_filtered['log_comments'],
        colorscale='Viridis',
        colorbar=dict(title='Log(Number of Comments + 1)'),
        showscale=True,
        opacity=0.7
    ),
    text=df_filtered['title'],
    hovertemplate=
    "<b>%{text}</b><br>" +
    "Views per Hour: %{x:.2f}<br>" +
    "Likes per View: %{y:.4f}<br>" +
    "Duration: %{marker.size:.0f} seconds<br>" +
    "Number of Comments: %{marker.color:.0f}<br>" +
    "<extra></extra>",
    name='Videos'
)

fig.add_trace(scatter)

# Add curved line to separate top 5%
x_range = np.logspace(np.log10(df_filtered['views_by_hr'].min()), np.log10(df_filtered['views_by_hr'].max()), 1000)
y_curve = curve_func(x_range, *popt)

fig.add_trace(go.Scatter(
    x=x_range,
    y=y_curve,
    mode='lines',
    line=dict(color='red', dash='dash'),
    name='Top 5% Threshold'
))

# Calculate the maximum y-value
max_y = df_filtered['likes_per_view'].max() * 1.1

# Customize the layout
fig.update_layout(
    title='Views per Hour vs Likes per View (Videos > 1 Month Old, > 2 Minutes)',
    xaxis_title='Views per Hour',
    yaxis_title='Likes per View',
    height=800,
    width=1200,
    hovermode='closest',
    template='plotly_white',
    yaxis=dict(range=[np.log10(df_filtered['likes_per_view'].min()), np.log10(max_y)]),
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01
    ),
    margin=dict(r=120),  # Increase right margin to accommodate colorbar
)


# Update axes to logarithmic scale
fig.update_xaxes(type="log")
fig.update_yaxes(type="log")

# Add a colorbar title
fig.update_coloraxes(
    colorbar_title_text='Log(Number of Comments + 1)',
    colorbar=dict(
        len=0.75,  # Reduce length of colorbar
        yanchor="top",
        y=1,
        x=1.05,  # Move colorbar to the right
        thickness=20,  # Adjust thickness of colorbar
        title=dict(side="right", font=dict(size=12))  # Move title to the right side
    )
)

# Add a note about marker size
fig.add_annotation(
    text="Marker size represents video duration",
    xref="paper", yref="paper",
    x=1.02, y=1.05,
    showarrow=False,
    font=dict(size=10),
    align="left"
)

pio.write_html(fig, file='plot.html', auto_open=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['duration_seconds'] = df_filtered['duration'].apply(safe_duration_to_seconds)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['log_comments'] = np.log1p(df_filtered['commentCount'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['combined_metric'] = df_filtered['v

In [13]:
import pandas as pd

# Assuming df_filtered is your original DataFrame from the previous code

# Step 1: Sort the DataFrame by 'combined_metric' in descending order
df_sorted = df_filtered.sort_values('combined_metric', ascending=False)

# Step 2: Select the top 25%
top_25_percent = df_sorted.head(int(len(df_sorted) * 0.25))

# Step 3: Add a new column that combines 'likes_per_view' and 'comments_per_view'
top_25_percent['engagement_metric'] = top_25_percent['likes_per_view'] + top_25_percent['comments_per_view']

# Step 4: Sort the resulting DataFrame by the new 'engagement_metric' column in descending order
top_25_percent = top_25_percent.sort_values('engagement_metric', ascending=False)

# Display the first few rows of the resulting DataFrame
top_25_percent.head()

# If you want to save this DataFrame to a CSV file, you can uncomment the following line:
# result_df.to_csv('top_25_percent_engagement.csv', index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,videoId,title,publishedAt,viewCount,likeCount,commentCount,duration,description,tags,thumbnailUrl,channel,hours_since_published,likes_per_view,comments_per_view,views_by_hr,duration_seconds,log_comments,combined_metric,engagement_metric
366,kqcphMpZ4sU,World's Most High-Tech Wedding Speech!,2024-06-21 14:43:30+00:00,3188185.0,215610.0,18002.0,0:15:43,This will always be one of the high points of my life. I’m so happy ❤️\nThanks to @TheWeddingFil...,"Wedding, Speech, Marriage, Girlfriend, Wife, Fiance",https://i.ytimg.com/vi/kqcphMpZ4sU/default.jpg,Mrwhosetheboss,2042.325833,0.067628,0.005646,1561.056002,943,9.798294,105.570814,0.073274
465,a7ItChBrY8E,I Gave A Commencement Speech!,2024-05-23 19:41:23+00:00,1286436.0,84459.0,3883.0,0:08:58,Commencement for the class of 2024 at the school I graduated from!\n\nMKBHD Merch: http://shop.M...,"MKBHD, commencement, commencement speech, 2024 commencement, Stevens, Stevens Tech, Stevens Inst...",https://i.ytimg.com/vi/a7ItChBrY8E/default.jpg,mkbhd,2733.361111,0.065653,0.003018,470.642534,538,8.264621,30.89932,0.068672
380,P6rH5b479qw,Tech Experts React to Bad & Great Tech in Movies - Episode 1,2024-06-17 17:08:26+00:00,1480413.0,92794.0,5086.0,0:21:43,Visit https://www.squarespace.com/LTT and use offer code LTT for 10% off\n\nCheck out the UGREEN...,"Tech, Support, React, Movies, TV Show, Bones, NCIS, Level 1 Techs, Wendell, Gaming, Computer, Se...",https://i.ytimg.com/vi/P6rH5b479qw/default.jpg,LinusTechTips,2135.910278,0.062681,0.003436,693.106361,1303,8.534444,43.444709,0.066117
342,Y5Ud5HKO2C4,Jonathan Morrison Lost Everything. We Respond.,2024-06-27 15:01:40+00:00,897034.0,53582.0,1837.0,0:18:16,"WATCH @TLD's video: https://youtu.be/qS7vnaLICrw\n\nHUGE thank you to Amaran, Sony and everyone ...","jonathan morrison, jon, jonathan, morrison, tld, tld today, tldtoday, austin, austin evans",https://i.ytimg.com/vi/Y5Ud5HKO2C4/default.jpg,austinevans,1898.023056,0.059732,0.002048,472.614912,1096,7.516433,28.230426,0.06178
150,GdQ5bClEgHg,"Hello, old friend… - Media Ripping Explained",2024-08-12 18:35:14+00:00,1700758.0,97458.0,5540.0,0:13:34,Get a 15-day free trial for unlimited backup at https://www.backblaze.com/landing/podcast-ltt.ht...,"piracy, blu-ray, 4k video, how to pirate, how to rip discs, netflix, prime, amazon, disney, disn...",https://i.ytimg.com/vi/GdQ5bClEgHg/default.jpg,LinusTechTips,790.463611,0.057303,0.003257,2151.595565,814,8.61993,123.292203,0.06056


In [22]:
import os
import pandas as pd
from openai import OpenAI

# Set up OpenAI client for Solar API
client = OpenAI(
    api_key=os.environ.get("SOLAR_API_KEY"),
    base_url="https://api.upstage.ai/v1/solar"
)

def is_tech_review(title):

    
    # Combine the title, description, and tags into a single text
    content = f"Title: {title}"
    
    # Use Solar to analyze the content
    prompt = f"""
    Analyze the following video content and determine if it's likely to be a recommendation for tech gadgets as Christmas gifts. Consider the following criteria:

    1. The content implies that it will mention technology products or gadgets.
    2. It could be a review, list, or showcase of multiple tech items suitable for gifting.

    Content:
    {content}

    Based on these criteria, is this video likely a tech product review, unboxing, or positive commentary on tech gadgets suitable for Christmas gifts?
    
    Answer with ONLY 'yes' or 'no'.
    """
    
    response = client.chat.completions.create(
        model="solar-pro",
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        stream=False
    )
    
    return response.choices[0].message.content.lower().strip() == 'yes'

# Assuming top_25_percent is already loaded as a pandas DataFrame
tech_review_videos = []

for _, row in top_25_percent.iterrows():
    if is_tech_review(row['title']): 
        tech_review_videos.append(row['videoId'])

print(f"Found {len(tech_review_videos)} videos that are likely tech product reviews or unboxings:")
print(tech_review_videos)

Found 1 videos that are likely tech product reviews or unboxings:
['zijUjJdegI8']


In [37]:
import os
import yt_dlp
from pydub import AudioSegment
import time
from openai import OpenAI

# Constants
CHUNK_SIZE = 25 * 1024 * 1024  # 25 MB, Whisper's file size limit

def download_youtube_audio(video_id):
    video_url = f'https://www.youtube.com/watch?v={video_id}'
    output_path = 'downloaded_videos'
    
    os.makedirs(output_path, exist_ok=True)
    
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': f'{output_path}/{video_id}.%(ext)s',
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            info = ydl.extract_info(video_url, download=True)
            print(f"Successfully downloaded: {info['title']} as MP3")
            return f'{output_path}/{video_id}.mp3', info['duration']
        except Exception as e:
            print(f"Failed to download. Error: {str(e)}")
            return None, 0

def split_audio(audio_path):
    audio = AudioSegment.from_mp3(audio_path)
    chunks = []
    for i in range(0, len(audio), CHUNK_SIZE):
        chunk = audio[i:i + CHUNK_SIZE]
        chunk_path = f"{audio_path[:-4]}_{i // CHUNK_SIZE}.mp3"
        chunk.export(chunk_path, format="mp3")
        chunks.append(chunk_path)
    return chunks

def transcribe_audio_chunk(chunk_path, client):
    try:
        with open(chunk_path, "rb") as audio_file:
            transcript = client.audio.transcriptions.create(
                model="whisper-1", 
                file=audio_file
            )
        return transcript.text
    except Exception as e:
        print(f"Error in transcription: {str(e)}")
        return f"Transcription failed: {str(e)}"

def transcribe_audio(audio_path):
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("OPENAI_API_KEY is not set in environment variables")
        return "Transcription failed: No API key"

    client = OpenAI(api_key=api_key)

    file_size = os.path.getsize(audio_path)
    if file_size > CHUNK_SIZE:
        chunks = split_audio(audio_path)
    else:
        chunks = [audio_path]

    transcriptions = []
    for chunk in chunks:
        transcription = transcribe_audio_chunk(chunk, client)
        transcriptions.append(transcription)
        if chunk != audio_path:
            os.remove(chunk)  # Remove temporary chunk files

    return " ".join(transcriptions)

def process_video(video_id):
    audio_path, _ = download_youtube_audio(video_id)
    if audio_path:
        transcription = transcribe_audio(audio_path)
        os.remove(audio_path)  # Remove the audio file after transcription
        return transcription
    return None

# Main execution
transcription_list = []
for video_id in tech_review_videos:
    transcription = process_video(video_id)
    transcription_list.append(transcription)
    time.sleep(3)  # Add a short delay between video processing

[youtube] Extracting URL: https://www.youtube.com/watch?v=zijUjJdegI8
[youtube] zijUjJdegI8: Downloading webpage
[youtube] zijUjJdegI8: Downloading ios player API JSON
[youtube] zijUjJdegI8: Downloading web creator player API JSON
[youtube] zijUjJdegI8: Downloading m3u8 information
[info] zijUjJdegI8: Downloading 1 format(s): 251
[download] Destination: downloaded_videos/zijUjJdegI8.webm
[download] 100% of    9.93MiB in 00:00:01 at 6.02MiB/s     
[ExtractAudio] Destination: downloaded_videos/zijUjJdegI8.mp3
Deleting original file downloaded_videos/zijUjJdegI8.webm (pass -k to keep)
Successfully downloaded: Samsung Z Flip/Fold 6, Watch Ultra, Buds Pro and Ring Impressions! as MP3


In [42]:
import os
import json
from openai import OpenAI

# Set up OpenAI client for Solar API
client = OpenAI(
    api_key=os.environ.get("SOLAR_API_KEY"),
    base_url="https://api.upstage.ai/v1/solar"
)

def analyze_transcription(transcription):
    prompt = f"""
    Analyze the following transcription of a tech review video and provide a summary for each distinct product mentioned. Format your response as a valid JSON object with the following structure:

    {{
        "products": [
            {{
                "name": "Product Name",
                "sentiment_score": 7,
                "summary": "Brief summary of the product review.",
                "verdict": "FAVORABLE"
            }}
        ]
    }}

    Rules:
    1. The "products" key should contain an array of product objects.
    2. Each product object should have "name", "sentiment_score", "summary", and "verdict" keys.
    3. "sentiment_score" should be an integer from 1 to 10.
    4. "verdict" should be either "FAVORABLE", "UNFAVORABLE", or "NEUTRAL".
    5. Ensure all string values are properly escaped for JSON.
    6. The summary should be concise, focusing on key points that influenced the sentiment score.
    7. Limit your response to a maximum of 5 products to ensure completeness.

    Transcription:
    {transcription}

    Provide your analysis as a valid JSON object:
    """

    response = client.chat.completions.create(
        model="solar-pro",
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        stream=False
    )

    # Post-process the response to ensure valid JSON
    raw_response = response.choices[0].message.content.strip()
    try:
        # Try to parse the JSON as is
        return json.loads(raw_response)
    except json.JSONDecodeError:
        # If parsing fails, attempt to fix common issues
        try:
            # Add closing brackets if they're missing
            if not raw_response.endswith('}}'):
                raw_response += ']}}'
            # Remove any trailing commas before closing brackets
            raw_response = raw_response.replace(',]', ']').replace(',}', '}')
            return json.loads(raw_response)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            print("Raw response:", raw_response)
            return {"products": []}

# Assuming transcription_list contains your transcriptions
results = []

for transcription in transcription_list:
    analysis = analyze_transcription(transcription)
    results.extend(analysis.get("products", []))

# Print or process the combined results
for product in results:
    print(f"Product: {product['name']}")
    print(f"Sentiment Score: {product['sentiment_score']}")
    print(f"Summary: {product['summary']}")
    print(f"Verdict: {product['verdict']}")
    print("---")

Error parsing JSON: Unterminated string starting at: line 38 column 18 (char 2380)
Raw response: {
  "products": [
    {
      "name": "Samsung Galaxy Z Flip 6",
      "sentiment_score": 3,
      "summary": "The Samsung Galaxy Z Flip 6 is an updated version of its predecessor, with minimal changes and a slightly higher price. It features a slightly better hinge, new AI features, and more RAM, but its overall design remains similar.",
      "verdict": "NEUTRAL"
    },
    {
      "name": "Samsung Galaxy Z Fold 6",
      "sentiment_score": 3,
      "summary": "The Samsung Galaxy Z Fold 6 is another updated version of its predecessor, with minimal changes and a higher price. It has a slightly better hinge, new AI features, and a larger battery, but its overall design and crease are similar to the previous generation.",
      "verdict": "NEUTRAL"
    },
    {
      "name": "Samsung Galaxy Watch 7",
      "sentiment_score": 4,
      "summary": "The Samsung Galaxy Watch 7 is a refreshed vers