In [None]:
import sagemaker
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import os

# =======================================================================
# SageMaker Session and S3 Configuration
# =======================================================================
# Create the SageMaker session, look up the execution role, and ask the
# session for its default S3 bucket. `bucket` is used below to build both
# the input URI and the output paths; within this section `role` is only
# printed.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

# Echo the resolved values so a wrong account/bucket is obvious in the log.
print(f"Using SageMaker role: {role}")
print(f"Using S3 bucket: {bucket}")

# --- IMPORTANT: Configure your S3 paths here ---
# Full S3 URI of the raw reviews CSV. The file must already have been
# uploaded to this location before the script is run.
input_data_s3_uri = f"s3://{bucket}/dataset/all-task-1-detailed-reviews.csv"

# Key prefix ("folder") inside `bucket` under which the processed
# train/validation CSVs are written. No leading or trailing slash: it is
# interpolated between slashes when the output paths are built.
output_data_s3_prefix = "dataset/processed-text"


# =======================================================================
# Load Data from S3
# =======================================================================
# Read the raw reviews CSV straight from S3 into `df`. Any failure is
# reported with a hint about the usual causes and then re-raised, so the
# script never continues without data.
print(f"Loading raw data from {input_data_s3_uri}...")
try:
    df = pd.read_csv(input_data_s3_uri)
except Exception as e:
    print(f"Error loading data from S3: {e}")
    print("Please ensure your data is uploaded to the correct S3 path and that your SageMaker role has S3 permissions.")
    # Bare `raise` re-raises the active exception with its original
    # traceback untouched.
    raise

# Fail fast with a clear message if the columns the cleaning and splitting
# steps read are absent, rather than hitting a KeyError part-way through.
required_columns = ('review_text', 'Is_Fake')
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Input CSV is missing required column(s): {missing_columns}")


# =======================================================================
# Data Cleaning
# =======================================================================
# A row is only usable for supervised training if it has both review text
# and a label, so drop rows missing either one. (Previously only missing
# text was dropped, which let unlabeled rows reach the output CSVs.)
df.dropna(subset=['review_text', 'Is_Fake'], inplace=True)
# Keep the first occurrence of each distinct raw review text.
df.drop_duplicates(subset=['review_text'], keep='first', inplace=True)

def clean_text(text):
    """Return *text* with HTML tags and URLs removed and whitespace collapsed.

    The input is coerced with ``str()`` first, so non-string values are
    accepted. Processing order:

    1. Every ``<...>`` tag is replaced by a single space.
    2. Tokens starting with ``http`` or ``www.`` are removed.
    3. Runs of whitespace (including newlines and tabs) are collapsed to one
       space and leading/trailing whitespace is stripped.

    Returns:
        The cleaned string; it may be empty if nothing but markup/URLs
        was present.
    """
    text = str(text)
    # Substitute a space (not the empty string) for each tag: words that are
    # separated only by markup, e.g. "Great<br/>product", must not be fused
    # into one token. The extra spaces are collapsed by the split/join below.
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    return ' '.join(text.split())

# Clean every review and store the result alongside the raw text.
df['CleanedText'] = df['review_text'].apply(clean_text)

# A review consisting only of markup and/or URLs cleans down to an empty
# string. Such rows carry no text to learn from, and an empty CSV field is
# read back as a missing value by pandas' default CSV parsing, so drop them.
empty_text_mask = df['CleanedText'] == ''
if empty_text_mask.any():
    print(f"Dropping {int(empty_text_mask.sum())} rows whose text is empty after cleaning.")
    df.drop(index=df.index[empty_text_mask], inplace=True)

print(f"Dataframe shape after cleaning: {df.shape}")
print(f"Class distribution:\n{df['Is_Fake'].value_counts()}")


# =======================================================================
# Train / Validation Split
# =======================================================================
# Features are the cleaned review text; the target is the Is_Fake label.
X, y = df['CleanedText'], df['Is_Fake']

# Hold out 20% of the rows for validation. The fixed seed makes the split
# reproducible from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Re-assemble each split as a two-column frame with the label first and the
# text second. The Series keep their source column names ('Is_Fake',
# 'CleanedText') and share the same index, so a column-wise concat lines
# them up row for row.
train_df = pd.concat([y_train, X_train], axis=1)
val_df = pd.concat([y_val, X_val], axis=1)


# =======================================================================
# Upload Processed Data to S3
# =======================================================================
print(f"Uploading processed data to s3://{bucket}/{output_data_s3_prefix}...")

# Full S3 object paths for the two splits.
# NOTE(review): the validation split is stored under the "test/" prefix as
# test.csv — presumably to match the channel name a downstream training job
# expects; confirm before renaming.
train_s3_path = f"s3://{bucket}/{output_data_s3_prefix}/train/train.csv"
val_s3_path = f"s3://{bucket}/{output_data_s3_prefix}/test/test.csv"

# Write both frames straight to S3 as header-less CSVs without the index
# column, i.e. each line is "<label>,<text>".
# NOTE(review): pandas needs an S3 filesystem backend (s3fs) installed to
# write to s3:// URLs — confirm it is available in this environment.
train_df.to_csv(train_s3_path, header=False, index=False)
val_df.to_csv(val_s3_path, header=False, index=False)

# The check mark is written as an escape so the source stays pure ASCII; the
# previous literal had been corrupted into mojibake by an encoding mismatch.
print("\u2705 Success! Processed data has been uploaded to S3.")
print(f"Training data is at: {train_s3_path}")
print(f"Validation data is at: {val_s3_path}")