# In [None]:
import os
import pandas as pd
from huggingface_hub import login
from datasets import load_dataset 
from dotenv import load_dotenv

# Publishes the cleaned customer insurance reviews CSV to the Hugging Face Hub:
# authenticate, load the CSV, normalise a few columns, round-trip the data
# through a Parquet file, push the resulting dataset, then print a preview.

# --- 1. Setup and Login ---
load_dotenv()
my_token = os.getenv("HF_TOKEN")

if my_token:
    print("Logging into Hugging Face Hub...")
    login(token=my_token)
else:
    # NOTE(review): execution continues without an explicit login, so the
    # push in step 5 presumably only works if credentials are already
    # available from elsewhere -- confirm whether this should abort instead.
    print("ERROR: HF_TOKEN not found in .env file.")


# --- 2. Load and Clean ---
print("Loading 'customer_insurance_reviews_final.csv'...")
final_complaints_df = pd.read_csv('csv_files/customer_insurance_reviews_final.csv')

print("Cleaning DataFrame...")

# Drop the avatar column when present; errors="ignore" makes this a no-op
# for CSVs that do not contain it.
final_complaints_df = final_complaints_df.drop(columns=['authorAvatar'], errors='ignore')

# Identifier-like columns are stored as text rather than numbers.
# NOTE(review): astype(str) renders missing values as the literal string
# 'nan' -- confirm that is acceptable for the published dataset.
for id_column in ('source_id', 'incident_id_number', 'incident_contact_number'):
    if id_column in final_complaints_df.columns:
        final_complaints_df[id_column] = final_complaints_df[id_column].astype(str)

print("DataFrame Info after cleaning:")
final_complaints_df.info()


# --- 3. Save to Parquet ---
parquet_path = 'complaints.parquet'
print(f"Saving cleaned DataFrame to '{parquet_path}'...")
final_complaints_df.to_parquet(parquet_path, index=False)


# --- 4. Load as a Hugging Face Dataset ---
print(f"Loading dataset from '{parquet_path}'...")
dataset = load_dataset('parquet', data_files=parquet_path, split='train')

# --- 5. Push to Hub ---

repo_id = "miehleketo93/customer_insurance_reviews_final"
print(f"Pushing dataset to '{repo_id}'...")
dataset.push_to_hub(repo_id)

print("\n--- Process Complete ---")
print(f"Dataset successfully pushed to https://huggingface.co/datasets/{repo_id}")

# --- 6. Preview ---
# datasets.Dataset has no pandas-style head(); select the first rows (at most
# five, fewer for a shorter dataset) and convert them to a DataFrame to print.
print("\nDataset head:")
print(dataset.select(range(min(5, len(dataset)))).to_pandas())