# Pull Transcriptions

In [29]:
import boto3
import pandas as pd
import json
import os
import sys
from contextlib import contextmanager

# Name of the bucket that is listed and downloaded from further down.
bucket_name = 'sagemaker-studio-058264083825-bar3vvoeivv'

# Module-level S3 client, created once for the us-east-2 region.
s3_client = boto3.client(service_name='s3', region_name='us-east-2')

@contextmanager
def suppress_stdout():
    """Context manager that discards everything written to ``sys.stdout``.

    While the ``with`` body runs, ``sys.stdout`` points at the null device.
    The stream that was active on entry is put back on exit, including when
    the body raises.
    """
    saved_stream = sys.stdout
    with open(os.devnull, 'w') as sink:
        sys.stdout = sink
        try:
            yield
        finally:
            # Restore before the sink is closed by the enclosing ``with``.
            sys.stdout = saved_stream

def list_transcription_files(bucket_name, prefix=''):
    """List the keys of the ``.txt`` and ``.json`` objects in an S3 bucket.

    The listing is walked page by page through a paginator, so a bucket that
    holds more objects than one ``list_objects_v2`` response returns is still
    listed completely.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Only keys starting with this prefix are considered. The
            default empty string applies no prefix filter.

    Returns:
        The matching keys in listing order. An empty list is returned when
        nothing matches or when the listing fails; a failure is reported
        with ``print`` rather than raised.
    """
    wanted_suffixes = ('.txt', '.json')
    keys = []
    try:
        paginator = s3_client.get_paginator('list_objects_v2')
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
            # A page with no objects carries no 'Contents' entry.
            keys.extend(
                obj['Key']
                for obj in page.get('Contents', [])
                if obj['Key'].endswith(wanted_suffixes)
            )
    except Exception as e:
        # Best-effort: report the problem and hand back an empty listing
        # instead of a partial one.
        print(f"Error listing files: {e}")
        return []
    return keys

def _extract_transcript_text(key, content):
    """Return the transcript text held in *content*, or *content* itself.

    Keys that do not end in ``.json`` are passed through unchanged. For
    ``.json`` keys the text at ``results -> transcripts -> [0] -> transcript``
    is returned; when the document is not valid JSON or does not have that
    shape (missing key, empty ``transcripts`` list, non-mapping value) the raw
    content is returned instead.
    """
    if not key.endswith('.json'):
        return content

    try:
        payload = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for {key}: {e}")
        return content

    try:
        return payload['results']['transcripts'][0]['transcript']
    except (KeyError, IndexError, TypeError):
        # Unexpected structure: keep the raw document rather than lose the file.
        return content


def download_transcriptions(bucket_name, keys, *, verbose=False):
    """Download objects from S3 and collect their transcription text.

    Args:
        bucket_name: Name of the S3 bucket the keys belong to.
        keys: Iterable of object keys to download.
        verbose: When true, print a progress line and the first 500
            characters of each downloaded object. Off by default.

    Returns:
        A list with one ``{'FileName': key, 'Transcription': text}`` dict per
        successfully downloaded key, in input order. A key whose download or
        UTF-8 decoding fails is reported with ``print`` and skipped.
    """
    transcriptions = []
    for key in keys:
        try:
            if verbose:
                print(f"Downloading {key}...")
            response = s3_client.get_object(Bucket=bucket_name, Key=key)
            content = response['Body'].read().decode('utf-8')
        except Exception as e:
            # Best-effort: one unreadable object must not abort the batch.
            print(f"Error downloading {key}: {e}")
            continue

        if verbose:
            print(f"Content of {key}:")
            print(content[:500])

        transcriptions.append({
            'FileName': key,
            'Transcription': _extract_transcript_text(key, content),
        })

    return transcriptions

# list_transcription_files() reports a failed listing only through print(),
# so stdout is deliberately left untouched here: silencing it would turn a
# failed listing into a silently empty key list.
transcription_keys = list_transcription_files(bucket_name)

# One row per downloaded file, built from the dicts the download step returns.
transcriptions = download_transcriptions(bucket_name, transcription_keys)
df = pd.DataFrame(transcriptions)

# Add Index to Match with Training Data Labels

In [20]:
def extract_first_number(job_name):
    """Return the integer that precedes the first underscore in *job_name*.

    When the name contains no underscore the whole string is converted.
    ``int`` raises ``ValueError`` if that leading part is not a valid integer.
    """
    leading_part, _, _ = job_name.partition('_')
    return int(leading_part)

# Derive the numeric join key from the leading number of each file name.
df['Index'] = df['FileName'].map(extract_first_number)

# Add Data Labels

In [18]:
# Load the label sheet; it is joined on 'Index' in the merge step.
df1 = pd.read_csv(filepath_or_buffer='Train_Test Data - Sheet3 (1).csv')

# Merge Dataframes

In [22]:
# Inner join: keep only rows whose 'Index' occurs in both frames.
df_merged = df1.merge(df, on='Index', how='inner')

In [24]:
# Numeric class code for each label spelling variant:
# 0 = Neutral, 1 = Anti-Biden, 2 = Pro-Biden.
replacement_mapping = {
    # Neutral
    'Neutral': 0,
    'Neutral ': 0,
    # Anti-Biden
    'Anti - Biden': 1,
    'Anti-Biden': 1,
    'Anti-biden': 1,
    'anti-Biden': 1,
    'Anti-Biden ': 1,
    # Pro-Biden
    'Pro - Biden': 2,
    'Pro-Biden': 2,
    'Po-Biden': 2,
    'Pro-biden': 2,
    'Pro-Biden ': 2,
}
# NOTE(review): a flat mapping makes replace() look at every column of the
# frame, not only 'Label' -- confirm that is intended.
df_final = df_merged.replace(to_replace=replacement_mapping)

In [25]:
# Cast the label codes to integers; any value that cannot be converted
# makes astype() raise.
df_final = df_final.astype({'Label': int})

# Export to CSV

In [28]:
# Write the final frame to disk, leaving out the row-index column.
csv_filename = 'df_final.csv'
df_final.to_csv(path_or_buf=csv_filename, index=False)