In [None]:
# Importing necessary libraries
import pandas as pd
import re


## Reading the ORES Predictions CSV File into a Pandas DataFrame
This segment reads a CSV file, ores_predictions.csv, into a Pandas DataFrame for further processing and analysis.


In [None]:

# Load the ORES predictions from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/ores_predictions.csv')


## Defining a List of US States for Data Cleaning
This part defines a list of US states, for use in data cleaning and filtering operations. The list includes the names of all 50 states in the United States.

In [None]:

# Names of the 50 US states, in alphabetical order (one row per initial letter group)
us_states = [
    "Alabama", "Alaska", "Arizona", "Arkansas",
    "California", "Colorado", "Connecticut",
    "Delaware", "Florida", "Georgia", "Hawaii",
    "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky", "Louisiana",
    "Maine", "Maryland", "Massachusetts", "Michigan",
    "Minnesota", "Mississippi", "Missouri", "Montana",
    "Nebraska", "Nevada", "New Hampshire", "New Jersey",
    "New Mexico", "New York", "North Carolina", "North Dakota",
    "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania", "Rhode Island",
    "South Carolina", "South Dakota",
    "Tennessee", "Texas",
    "Utah", "Vermont", "Virginia",
    "Washington", "West Virginia", "Wisconsin", "Wyoming",
]





## Creating a Function to Clean Titles and Identify US States
This section presents a Python function named clean_title. The function takes a title as input, removes leading and trailing whitespace, splits the title on commas, and checks whether the last comma-separated part (after title-casing) is the name of a US state. If it is, the function returns the last two parts joined as "City, State" (or just the state name when the title has no comma); otherwise, it returns None.

In [None]:
# Function to clean the title and check for the presence of a US state
def clean_title(title):
    """Reduce a title to its trailing "City, State" part if it ends in a US state.

    The title is split on commas and the last part is title-cased and looked
    up in ``us_states``.

    Parameters
    ----------
    title : str
        Raw title, e.g. ``"Springfield, Illinois"``. Any non-string value
        (such as the NaN pandas uses for an empty CSV cell) is treated as
        "no match".

    Returns
    -------
    str or None
        The last two comma-separated parts joined with ``", "`` when the last
        part is a US state (just the state name if the title has no comma);
        otherwise ``None``.
    """
    # Missing titles arrive as NaN (a float), which has no .strip(); treat any
    # non-string as a non-match instead of raising inside DataFrame.apply.
    if not isinstance(title, str):
        return None

    # Strip each part so whitespace before a comma ("City , State") does not
    # leak into the result.
    parts = [part.strip() for part in re.split(r',\s*', title.strip())]

    # .title() makes the lookup tolerant of case, e.g. "new york" -> "New York".
    if parts[-1].title() in us_states:
        return ", ".join(parts[-2:])
    return None

## Applying Title Cleaning Function to DataFrame Columns
This block applies the clean_title function to every value in the 'title' column of the DataFrame and overwrites the column with the results. Titles that end in a US state are reduced to their city and state names; all other titles become None.

In [None]:
# Run clean_title over every title, then overwrite the column with the results
cleaned_titles = df['title'].apply(clean_title)
df['title'] = cleaned_titles

## Filtering Out Non-US State Entries from the DataFrame
This section filters out rows from the DataFrame whose title did not end with a US state. Those rows have None in the 'title' column after cleaning, so dropping rows with missing 'title' values removes them.

In [None]:
# Keep only the rows whose 'title' value is not missing
df = df[df['title'].notna()]

## Saving Cleaned Data to a New CSV File
This part saves the cleaned DataFrame, which now contains only city and state information, to a new CSV file named 'cleaned_data.csv'. The index column is excluded from the saved data.

In [None]:
# Write the cleaned DataFrame to a new CSV file, leaving out the index column
df.to_csv(path_or_buf='../data/cleaned_data.csv', index=False)
