In [1]:
from openai import OpenAI
import os
import ftfy  # text encoding issues
from dotenv import load_dotenv, find_dotenv

# Read variables from the nearest .env file into the process environment
_ = load_dotenv(find_dotenv())
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")  # None when the variable is not set

# API client used by clean_summary_with_openai further down in this cell
client = OpenAI(api_key=OPENAI_API_KEY)

# Function to fix text encoding issues before sending to OpenAI
def preprocess_text(text):
    """Return ``text`` after running it through ``ftfy.fix_text``.

    This is the local repair step applied before the text is sent to OpenAI.
    """
    repaired = ftfy.fix_text(text)
    return repaired

# Function to determine max tokens dynamically
def determine_max_tokens(summary, tokens_per_word=1.3, headroom=1.5,
                         min_tokens=100, max_tokens_cap=4096):
    """Choose the ``max_tokens`` completion budget for cleaning ``summary``.

    The cleaning prompt asks the model to return the whole text in corrected
    form, so the reply is roughly as long as the input. The budget therefore
    scales with the input length instead of using fixed tiers. The previous
    tiers (100 / 200 / 150) were below the estimated token count of most
    inputs - every text over 200 words was limited to 150 tokens - which makes
    the API stop the reply part-way through the text.

    Args:
        summary: Text that will be sent for cleaning (a ``str``).
        tokens_per_word: Rough tokens-per-word ratio used for the estimate
            (1 word is approximately 1.3 tokens).
        headroom: Multiplier applied on top of the estimate, since the
            corrected text can come back with more tokens than the input
            estimate (e.g. words that were glued together get separated).
        min_tokens: Smallest budget ever returned; short texts still get 100.
        max_tokens_cap: Largest budget ever returned.
            NOTE(review): 4096 is assumed to be the completion limit of the
            model in use - confirm against the model documentation.

    Returns:
        int: The completion budget, always within
        ``[min_tokens, max_tokens_cap]``.
    """
    word_count = len(summary.split())  # whitespace-separated words

    # int() + 1 rounds the estimate up without needing an extra import
    estimated_budget = int(word_count * tokens_per_word * headroom) + 1

    return max(min_tokens, min(estimated_budget, max_tokens_cap))

# Function to clean a single summary using OpenAI
def clean_summary_with_openai(summary):
    """Repair encoding problems in ``summary`` and have OpenAI polish it.

    The text is first passed through ``preprocess_text`` and then sent to the
    chat completions endpoint with a completion budget chosen by
    ``determine_max_tokens``.

    Args:
        summary: The raw summary text. Values that are not strings (for
            example ``None`` or a float NaN) and blank strings are skipped.

    Returns:
        str: The stripped model reply; ``""`` when the input was skipped; or
        the locally preprocessed text when the API call fails or the reply
        carries no text.
    """
    # Skip values there is nothing to clean in; without this guard a
    # non-string value would raise before the try block below is reached.
    if not isinstance(summary, str) or summary.strip() == "":
        return ""

    # Step 1: Preprocess text before sending to OpenAI
    fixed_summary = preprocess_text(summary)

    # Step 2: Dynamically determine max_tokens
    max_tokens = determine_max_tokens(fixed_summary)

    prompt = f"""Fix any encoding issues and improve the readability of the following text:
    
    {fixed_summary}
    
    Ensure correct spacing, remove unnecessary symbols, and properly format all names, places, and characters."""

    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that corrects text encoding errors and improves readability."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
            temperature=0.5  # Moderate sampling temperature; this does not guarantee accuracy
        )
        content = response.choices[0].message.content
        if content is None:
            # Report a missing reply explicitly instead of via an AttributeError
            raise ValueError("model returned no text content")
        return content.strip()
    except Exception as e:
        # Deliberate best effort: report the problem and keep the local fix
        print(f"⚠️ Error cleaning summary: {e}")
        return fixed_summary  # Return preprocessed version if API fails

# Problematic summary from dataset
test_summary = """
TheBirth of the New World(Spanish:Nacimiento del Nuevo Mundo, colloquially known asLa Estatua de ColÃ³n,lit.Columbus' Statue) is a 360 foot (110 m) bronze sculpture located on the Atlantic coastline ofArecibo, Puerto Rico. When completed in 2016, it became the tallest sculpture inNorth America,[1](as well as the tallest not only in theUnited States, but in theWestern Hemispherein general) surpassing Mexico'sGuerrero Chimalli(which measures 200 feet or 61 meters in height including its base), and the fourth tallest worldwide, after theStatue of Unityin India (597 ft; 182 m), theSpring Temple Buddhain China (420 ft; 130 m), and theLaykyun Sekkyain Myanmar (380 ft; 120 m). After being imported to Puerto Rico, there were plans for it to be erected inCataÃ±o. After being moved toMayagÃ¼ezand facing further delays,Birth of the New Worldwas assembled at its current location.
 Originally designed byGeorgiansculptorZurab Tseretelias a monument to commemorate the 500th anniversary ofChristopher Columbus'sfirst voyage,Birth of the New Worldwas constructed in 1991. The statue prominently depicts Columbus controlling ananachronisticdepiction of a steering wheel, with a backdrop featuring theNiÃ±a,PintaandSanta MarÃ­atraversing the Atlantic Ocean.[2]Made of 2,750 bronze and steel pieces and weighing more than 1,300,000 pounds (650 short tons; 590 tonnes), the monument's 360 ft (110 m) height made it the tallest in theWestern Hemisphereduring the last decade of the 20th century, dwarfing theStatue of Liberty[3]and theMonumento a la Virgen de la Paz.
"""

# Show the raw input first, then the result of the cleaning round-trip
print("\n Original Summary (with Encoding Issues):", test_summary, sep="\n")

cleaned_summary = clean_summary_with_openai(test_summary)

print("\nCleaned Summary (Corrected & Readable):", cleaned_summary, sep="\n")



 Original Summary (with Encoding Issues):

TheBirth of the New World(Spanish:Nacimiento del Nuevo Mundo, colloquially known asLa Estatua de ColÃ³n,lit.Columbus' Statue) is a 360 foot (110 m) bronze sculpture located on the Atlantic coastline ofArecibo, Puerto Rico. When completed in 2016, it became the tallest sculpture inNorth America,[1](as well as the tallest not only in theUnited States, but in theWestern Hemispherein general) surpassing Mexico'sGuerrero Chimalli(which measures 200 feet or 61 meters in height including its base), and the fourth tallest worldwide, after theStatue of Unityin India (597 ft; 182 m), theSpring Temple Buddhain China (420 ft; 130 m), and theLaykyun Sekkyain Myanmar (380 ft; 120 m). After being imported to Puerto Rico, there were plans for it to be erected inCataÃ±o. After being moved toMayagÃ¼ezand facing further delays,Birth of the New Worldwas assembled at its current location.
 Originally designed byGeorgiansculptorZurab Tseretelias a monument to comm

In [1]:
from openai import OpenAI
import os
import pandas as pd
import ftfy  
from dotenv import load_dotenv, find_dotenv
from tqdm import tqdm 

# Read variables from the nearest .env file into the process environment
_ = load_dotenv(find_dotenv())
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")  # None when the variable is not set

# API client used by clean_summary_with_openai further down in this cell
client = OpenAI(api_key=OPENAI_API_KEY)

# Function to fix text encoding issues before sending to OpenAI
def preprocess_text(text):
    """Return ``text`` after running it through ``ftfy.fix_text``.

    This is the local repair step applied before the text is sent to OpenAI.
    """
    repaired = ftfy.fix_text(text)
    return repaired

# Function to determine max tokens dynamically
def determine_max_tokens(summary, tokens_per_word=1.3, headroom=1.5,
                         min_tokens=100, max_tokens_cap=4096):
    """Choose the ``max_tokens`` completion budget for cleaning ``summary``.

    The cleaning prompt asks the model to return the whole text in corrected
    form, so the reply is roughly as long as the input. The budget therefore
    scales with the input length instead of using fixed tiers. The previous
    tiers (100 / 200 / 150) were below the estimated token count of most
    inputs - every text over 200 words was limited to 150 tokens - which makes
    the API stop the reply part-way through the text.

    Args:
        summary: Text that will be sent for cleaning (a ``str``).
        tokens_per_word: Rough tokens-per-word ratio used for the estimate
            (1 word is approximately 1.3 tokens).
        headroom: Multiplier applied on top of the estimate, since the
            corrected text can come back with more tokens than the input
            estimate (e.g. words that were glued together get separated).
        min_tokens: Smallest budget ever returned; short texts still get 100.
        max_tokens_cap: Largest budget ever returned.
            NOTE(review): 4096 is assumed to be the completion limit of the
            model in use - confirm against the model documentation.

    Returns:
        int: The completion budget, always within
        ``[min_tokens, max_tokens_cap]``.
    """
    word_count = len(summary.split())  # whitespace-separated words

    # int() + 1 rounds the estimate up without needing an extra import
    estimated_budget = int(word_count * tokens_per_word * headroom) + 1

    return max(min_tokens, min(estimated_budget, max_tokens_cap))

# Function to clean a single summary using OpenAI
def clean_summary_with_openai(summary):
    """Clean one summary: local encoding repair, then a rewrite by OpenAI.

    Returns ``""`` for non-string or blank input. Otherwise returns the
    stripped model reply, or the locally preprocessed text when anything in
    the API round-trip raises.
    """
    # Only non-blank strings are worth sending
    if not (isinstance(summary, str) and summary.strip() != ""):
        return ""

    # Local repair first, then size the completion budget from the repaired text
    fixed_summary = preprocess_text(summary)
    token_budget = determine_max_tokens(fixed_summary)

    prompt = f"""Fix any encoding issues and improve the readability of the following text:
    
    {fixed_summary}
    
    Ensure correct spacing, remove unnecessary symbols, and properly format all names, places, and characters."""

    system_message = {"role": "system", "content": "You are a helpful assistant that corrects text encoding errors and improves readability."}
    user_message = {"role": "user", "content": prompt}

    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[system_message, user_message],
            max_tokens=token_budget,
            temperature=0.5,  # sampling temperature passed to the API
        )
        reply_text = completion.choices[0].message.content
        return reply_text.strip()
    except Exception as err:
        # Best effort: report the failure and keep the locally repaired text
        print(f"Error cleaning summary: {err}")
        return fixed_summary


In [5]:
# Load the CSV file
# The location can be overridden through the LANDMARKS_CSV environment
# variable (the .env file loaded earlier is one place to set it), so the
# notebook is not tied to one machine. When the variable is not set, the
# original absolute path is used unchanged.
file_path = os.getenv(
    "LANDMARKS_CSV",
    "C:/Users/Ignacio/IronHackCodes/gitHStuff/FinalProj/processed_landmarks_final.csv",
)
df_landmarks = pd.read_csv(file_path)

In [7]:
# Enable progress_apply on pandas objects, with a labelled progress bar
tqdm.pandas(desc="Processing Summaries")

# Run every entry of the "Summary" column through the cleaner and store the
# results in a new "Cleaned Summary" column
df_landmarks["Cleaned Summary"] = (
    df_landmarks["Summary"]
    .progress_apply(clean_summary_with_openai)
)

Processing Summaries: 100%|██████████| 520/520 [18:32<00:00,  2.14s/it]


In [9]:
# Write the frame (now carrying the "Cleaned Summary" column) to a new CSV,
# leaving the index out of the file
cleaned_file_path = "processed_landmarks_final_cleaned.csv"
df_landmarks.to_csv(path_or_buf=cleaned_file_path, index=False)
print(f"Cleaning completed! Cleaned CSV saved at: {cleaned_file_path}")

Cleaning completed! Cleaned CSV saved at: processed_landmarks_final_cleaned.csv
