In [None]:
import os
import pandas as pd
from huggingface_hub import login
from datasets import load_dataset  # Use load_dataset for memory efficiency
from dotenv import load_dotenv

# --- 1. Setup and Login ---

# Pull variables from the local .env file into the process environment,
# then read the Hugging Face access token from it.
load_dotenv()
my_token = os.getenv("HF_TOKEN")

# Authenticate only when a token was found. When it is missing the problem
# is reported but execution is deliberately NOT aborted here.
if not my_token:
    print("ERROR: HF_TOKEN not found in .env file.")
else:
    print("Logging into Hugging Face Hub...")
    login(token=my_token)

# --- 2. Load and Clean Pandas DataFrame ---
# Reads the raw review export, removes the avatar column, normalises the
# identifier-like columns to text, and writes the result to Parquet.

print("Loading 'customer_insurance_reviews_final.csv'...")
final_complaints_df = pd.read_csv('customer_insurance_reviews_final.csv')

print("Cleaning DataFrame...")

# Drop 'authorAvatar' when present; errors='ignore' makes this a no-op when
# the column is absent, so no explicit membership check is needed.
final_complaints_df = final_complaints_df.drop(columns=['authorAvatar'], errors='ignore')

# Columns that hold a mix of numbers and text. They are converted to strings
# so the Parquet/Arrow conversion below does not fail on mixed types.
columns_to_cast = ['source_id', 'incident_id_number', 'incident_contact_number']

for column_name in columns_to_cast:
    if column_name in final_complaints_df.columns:
        column = final_complaints_df[column_name]
        # A plain astype(str) would turn missing values into the literal
        # text "nan". Masking the originally-missing positions keeps them as
        # real nulls in the saved file.
        final_complaints_df[column_name] = column.astype(str).mask(column.isna())

print("DataFrame Info after cleaning:")
final_complaints_df.info()

# --- 3. Save to Parquet (The Memory-Safe Step) ---

parquet_path = 'complaints.parquet'
print(f"Saving cleaned DataFrame to '{parquet_path}'...")
# index=False: the positional index carries no information worth storing.
final_complaints_df.to_parquet(parquet_path, index=False)

# --- 4. Load Dataset from Parquet (Fix for ArrowMemoryError) ---

print(f"Loading dataset from '{parquet_path}'...")
# Build the Hugging Face dataset from the Parquet file written above rather
# than from the in-memory DataFrame (the route that hit ArrowMemoryError).
dataset = load_dataset('parquet', data_files=parquet_path, split='train')

# --- 5. Push to Hub ---

repo_id = "miehleketo93/customer_insurance_reviews"
print(f"Pushing dataset to '{repo_id}'...")
dataset.push_to_hub(repo_id)

print("\n--- Process Complete ---")
print(f"Dataset successfully pushed to https://huggingface.co/datasets/{repo_id}")

# Preview the first rows. `datasets.Dataset` has no `.head()` method, so the
# first five rows are taken by slicing (which yields a dict of column lists)
# and wrapped in a DataFrame purely for a readable printout.
print("\nDataset head:")
print(pd.DataFrame(dataset[:5]))

In [None]:
import os
import glob
from huggingface_hub import login, HfApi
from datasets import load_dataset, Features, Value
from dotenv import load_dotenv

# --- 1. Setup and Login ---
# Read the Hub token from the environment (populated from .env) and log in.
load_dotenv()
my_token = os.getenv("HF_TOKEN")
HF_USERNAME = "miehleketo93"

if not my_token:
    print("ERROR: HF_TOKEN not found in .env file.")
else:
    print("Logging into Hugging Face Hub...")
    login(token=my_token)

# Hub API client used for repository management further down.
api = HfApi()

# --- 2. DEFINE DATA FOLDER ---
# Raw string so the Windows backslashes are taken literally.
DATA_DIRECTORY = r"D:\Data Engineering\Data-Engineering\Hellopeter\Data_Pipelines\csv_files"

# --- 3. Find and Loop Through All CSV Files ---
csv_pattern = os.path.join(DATA_DIRECTORY, "*.csv")
csv_files = glob.glob(csv_pattern)

# Report what was discovered; an empty list simply makes the upload loop a no-op.
if csv_files:
    print(f"Found {len(csv_files)} CSV files to process...")
else:
    print(f"No .csv files found in '{DATA_DIRECTORY}'. Please check the path.")

# Upload every discovered CSV as its own dataset repository. Each file is
# handled independently: a failure is reported and the loop moves on.
for csv_path in csv_files:
    try:
        # --- 4. Create Repo ID ---
        # Strip only the final extension. A plain .replace('.csv', '') would
        # also remove ".csv" occurring inside the name and would leave other
        # casings of the extension (e.g. ".CSV") in place.
        dataset_name = os.path.splitext(os.path.basename(csv_path))[0]

        # Create the full repository ID
        repo_id = f"{HF_USERNAME}/{dataset_name}"

        print(f"\n--- Processing: {dataset_name} ---")
        print(f"Target repository: {repo_id}")

        # --- 5. Load CSV Dataset ---
        # Loaded before the repository is created so that an unreadable file
        # does not leave an empty repository behind on the Hub.
        print(f"Loading data from {csv_path}...")
        dataset = load_dataset('csv', data_files=csv_path, split='train')

        # --- 6. Cast Problematic Columns ---
        # Force the identifier-like columns to string; all other columns keep
        # the feature types inferred while loading.
        print("Casting known problematic columns to string...")
        new_features = dataset.features.copy()
        columns_to_cast = ['source_id', 'incident_id_number', 'incident_contact_number']

        for col in columns_to_cast:
            if col in new_features:
                new_features[col] = Value('string')

        dataset = dataset.cast(new_features)
        print("Casting complete.")

        # --- 7. Create Repo (if it doesn't exist) ---
        api.create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            exist_ok=True
        )
        print(f"Ensured repository exists: {repo_id}")

        # --- 8. Push to Hub ---
        print(f"Pushing data to {repo_id}...")
        dataset.push_to_hub(repo_id)

        print(f" Successfully pushed '{dataset_name}' to the Hub.")

    except Exception as e:
        # Deliberate best-effort: report the failure and continue with the
        # remaining files instead of aborting the whole run.
        print(f" ERROR processing {csv_path}: {e}")
        print("Moving to next file...")

print("\n--- All files processed. ---")

In [None]:
import os
import glob
import pandas as pd
from huggingface_hub import login, HfApi
from datasets import Dataset
from dotenv import load_dotenv

# --- 1. Setup and Login ---
load_dotenv()
my_token = os.getenv("HF_TOKEN")
HF_USERNAME = "miehleketo93"

if my_token:
    print("Logging into Hugging Face Hub...")
    login(token=my_token)
else:
    print("ERROR: HF_TOKEN not found in .env file.")
    # Raise SystemExit explicitly: `exit` is an interactive helper and is not
    # guaranteed to stop the running cell under a Jupyter kernel, which would
    # let the destructive steps below run anyway.
    raise SystemExit(1)

# Initialize the Hub API
api = HfApi()

# --- 2. DEFINE DATA FOLDER ---
DATA_DIRECTORY = r"D:\Data Engineering\Data-Engineering\Hellopeter\Data_Pipelines\csv_files"

# --- 3. Find CSV Files ---
# Done BEFORE anything is deleted: if the folder is wrong or empty we stop
# here, instead of wiping every repository and then having nothing to upload.
csv_files = glob.glob(os.path.join(DATA_DIRECTORY, "*.csv"))

if not csv_files:
    print(f"\nNo .csv files found in '{DATA_DIRECTORY}'. Please check the path.")
    raise SystemExit(1)

# --- 4. DELETE ALL EXISTING REPOS ---
# WARNING: this removes EVERY dataset repository returned for HF_USERNAME,
# not only the ones produced from the CSV files found above.
print("\n=== DELETING ALL EXISTING DATASET REPOSITORIES ===")
try:
    # Materialise the listing so it can be counted and iterated.
    dataset_list = list(api.list_datasets(author=HF_USERNAME))
except Exception as e:
    print(f"✗ Error listing/deleting repositories: {e}")
    raise SystemExit(1)

if not dataset_list:
    print(f"No datasets found for user '{HF_USERNAME}'")
else:
    print(f"Found {len(dataset_list)} datasets to delete...")

    # Repositories that could not be removed, so the summary stays truthful.
    failed_deletions = []

    for repo_info in dataset_list:
        # Bound outside the try so the error message below always names the
        # repository that is actually being processed.
        repo_id = repo_info.id
        try:
            print(f"Deleting: {repo_id}")
            api.delete_repo(repo_id=repo_id, repo_type="dataset", token=my_token)
            print(f"✓ Deleted: {repo_id}")
        except Exception as e:
            # Best-effort: one failed deletion does not stop the others.
            print(f"✗ Error deleting {repo_id}: {e}")
            failed_deletions.append(repo_id)

    if failed_deletions:
        print(f"\n✗ {len(failed_deletions)} of {len(dataset_list)} repositories could not be deleted: {failed_deletions}")
    else:
        print("\n✓ All repositories deleted successfully!")

print(f"\n=== UPLOADING {len(csv_files)} DATASETS ===")

# Imported here so this section is self-contained. DatasetCard ships with
# huggingface_hub, which this notebook already depends on.
import traceback
from huggingface_hub import DatasetCard

# --- 5. Process Each CSV File ---
# Each CSV becomes one private dataset repository named after the file.
# Failures are reported with a traceback and the loop continues.
for csv_path in csv_files:
    try:
        # Get dataset name from filename. splitext removes only the final
        # extension; .replace('.csv', '') would also strip ".csv" inside the
        # name and miss other casings of the extension.
        dataset_name = os.path.splitext(os.path.basename(csv_path))[0]
        repo_id = f"{HF_USERNAME}/{dataset_name}"

        print(f"\n--- Processing: {dataset_name} ---")

        # Generate description based on dataset name
        description = f"""# {dataset_name.replace('_', ' ').title()} Dataset

This dataset contains {dataset_name.replace('_', ' ')} data from HelloPeter.

## Dataset Information
- **Format**: Parquet (optimized for fast loading)
- **Source**: HelloPeter Data Pipeline
- **Split**: train

## Usage
```python
from datasets import load_dataset

dataset = load_dataset("{repo_id}")
```

---
*This dataset is part of the HelloPeter data collection.*
"""

        # --- 6. Load CSV with Robust Parsing ---
        print(f"Loading data from {csv_path}...")

        # Keyword sets for pd.read_csv, tried in order until one succeeds.
        # NOTE: low_memory is set only for the C engine. pandas rejects that
        # option together with engine='python', so passing it to every
        # strategy made the first two fail unconditionally.
        parsing_strategies = [
            # Strategy 1: Standard with error handling
            {
                'on_bad_lines': 'skip',
                'engine': 'python',
                'quoting': 1,  # QUOTE_ALL
                'escapechar': '\\'
            },
            # Strategy 2: More lenient
            {
                'on_bad_lines': 'skip',
                'engine': 'python',
                'quotechar': '"',
                'doublequote': True,
                'escapechar': None
            },
            # Strategy 3: C engine with error skip
            {
                'on_bad_lines': 'skip',
                'engine': 'c',
                'quoting': 1,
                'low_memory': False
            }
        ]

        df = None
        for idx, strategy in enumerate(parsing_strategies, 1):
            try:
                print(f"  Attempting parsing strategy {idx}...")
                df = pd.read_csv(csv_path, **strategy)
                print(f"  ✓ Successfully parsed with strategy {idx}")
                break
            except Exception as e:
                print(f"  ✗ Strategy {idx} failed: {str(e)[:100]}")
                if idx == len(parsing_strategies):
                    # Chain the last parser error so the traceback shows it.
                    raise Exception("All parsing strategies failed") from e

        if df is None or df.empty:
            raise Exception("Failed to load data or dataframe is empty")

        print(f"✓ Loaded: {len(df)} rows, {len(df.columns)} columns")

        # --- 7. Convert to Dataset ---
        # Normalise the identifier-like columns to strings while still in
        # pandas, i.e. BEFORE the Arrow conversion that mixed-type columns
        # can break. Missing values become '' and everything else goes
        # through str(), matching the previous per-row conversion.
        columns_to_cast = ['source_id', 'incident_id_number', 'incident_contact_number']

        for col in columns_to_cast:
            if col in df.columns:
                df[col] = df[col].map(lambda value: '' if pd.isna(value) else str(value))

        dataset = Dataset.from_pandas(df)

        print(f"Dataset shape: {dataset.num_rows} rows, {dataset.num_columns} columns")

        # --- 8. Create Private Repository ---
        print(f"Creating private repository: {repo_id}...")
        api.create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            private=True,
            exist_ok=True
        )

        # --- 9. Push to Hub in Parquet Format ---
        print(f"Pushing dataset to Hub (Parquet format)...")
        dataset.push_to_hub(
            repo_id=repo_id,
            token=my_token,
            private=True
        )

        # --- 10. Update Repository Description ---
        print("Updating repository description...")
        # update_repo_settings has no `description` parameter, so it is only
        # used here to (re)assert the private visibility.
        api.update_repo_settings(
            repo_id=repo_id,
            repo_type="dataset",
            private=True
        )
        # The description is Markdown, so it is published as the body of the
        # dataset card (README.md). Loading the card the push just wrote and
        # replacing only its text keeps the YAML metadata block intact.
        card = DatasetCard.load(repo_id, token=my_token)
        card.text = description
        card.push_to_hub(repo_id, token=my_token)

        print(f"✓ Successfully uploaded '{dataset_name}' as private Parquet dataset")
        print(f"  URL: https://huggingface.co/datasets/{repo_id}")

    except Exception as e:
        # Deliberate best-effort: show the full traceback for diagnosis, then
        # carry on with the remaining files.
        print(f"✗ ERROR processing {csv_path}: {e}")
        traceback.print_exc()
        print("\nContinuing to next file...\n")

print("\n" + "="*50)
print("✓ ALL DATASETS PROCESSED!")
print("="*50)