In [102]:
# Importing necessary libraries
import pandas as pd
import re
import csv

## Reading the ORES Predictions CSV File into a Pandas DataFrame
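This segment reads the CSV file ores_predictions.csv into a Pandas DataFrame for further processing and analysis. Because the raw file has no header row, the column names `title`, `rev_id`, and `prediction` are first written, together with the data, to a new file, ores_predictions_with_headers.csv, which is then loaded.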


In [103]:



# Column names for the header-less ORES predictions file
headers = ["title", "rev_id", "prediction"]

# Read every raw row. The encoding is explicit because titles contain
# non-ASCII characters (e.g. "Utqiaġvik") and pandas later decodes the
# rewritten file as UTF-8; newline='' is what the csv module expects so that
# newlines embedded in quoted fields are interpreted correctly.
with open('../data/ores_predictions.csv', 'r', newline='', encoding='utf-8') as file:
    csv_reader = csv.reader(file)
    data_rows = list(csv_reader)

# Write the header row followed by the data to a NEW file
# (the original ores_predictions.csv is left untouched)
with open('../data/ores_predictions_with_headers.csv', 'w', newline='', encoding='utf-8') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(headers)  # Header row first
    csv_writer.writerows(data_rows)  # Then the existing data rows

# Load the header-augmented CSV into a DataFrame
df = pd.read_csv('../data/ores_predictions_with_headers.csv', encoding='utf-8')

In [104]:
# Print a summary of the loaded frame: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22157 entries, 0 to 22156
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       22157 non-null  object
 1   rev_id      22157 non-null  int64 
 2   prediction  22157 non-null  object
dtypes: int64(1), object(2)
memory usage: 519.4+ KB


## Defining a List of US States for Data Cleaning
This part defines a list of US states, for use in data cleaning and filtering operations. The list includes the names of all 50 states in the United States.

In [105]:

# Names of the 50 US states, in alphabetical order (one line per group of initials)
us_states = [
    "Alabama", "Alaska", "Arizona", "Arkansas",
    "California", "Colorado", "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky",
    "Louisiana",
    "Maine", "Maryland", "Massachusetts", "Michigan",
    "Minnesota", "Mississippi", "Missouri", "Montana",
    "Nebraska", "Nevada", "New Hampshire", "New Jersey",
    "New Mexico", "New York", "North Carolina", "North Dakota",
    "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina", "South Dakota",
    "Tennessee", "Texas",
    "Utah",
    "Vermont", "Virginia",
    "Washington", "West Virginia", "Wisconsin", "Wyoming",
]





## Creating a Function to Clean Titles and Identify US States
This section presents a Python function named clean_title. The function takes a title as input, removes leading and trailing whitespace, splits the title on commas, and checks whether the last comma-separated part is a US state. If it is, the function returns the last two parts joined as "place, state"; otherwise, it returns None.

In [106]:
# Function to clean the title and check for the presence of a US state
def clean_title(title, states=None):
    """Return the trailing "<place>, <state>" part of a title, or None.

    The title is stripped and split on commas. If the last comma-separated
    part, title-cased, is one of the state names, the last two parts are
    re-joined with ", " and returned (a title with no comma that is itself a
    state name is returned unchanged). Otherwise None is returned.

    Args:
        title: Article title. Non-string values (e.g. None, or the float NaN
            pandas uses for a missing cell) are treated as having no state.
        states: Optional collection of state names to match against.
            Defaults to the module-level ``us_states`` list.

    Returns:
        The cleaned "<place>, <state>" string, or None when the title does
        not end in a state name.
    """
    # Missing cells are not strings and have no .strip(); report "no state"
    # instead of raising AttributeError in the middle of a DataFrame.apply.
    if not isinstance(title, str):
        return None
    if states is None:
        states = us_states

    parts = re.split(r',\s*', title.strip())
    # .title() lets differently-cased state names (e.g. "new york") still match
    if parts[-1].title() in states:
        return ", ".join(parts[-2:]).strip()
    return None

## Applying Title Cleaning Function to DataFrame Columns
This block applies the clean_title function to the 'title' column of the DataFrame and stores the result in a new 'cleaned_title' column. The original 'title' column is left unchanged; 'cleaned_title' holds only the city and state names, or None when the title does not end in a US state.

In [107]:
# Run clean_title over every title and keep the result in a new 'cleaned_title' column
cleaned_titles = df['title'].apply(clean_title)
df['cleaned_title'] = cleaned_titles

## Getting rows having titles without US states
Printing the rows that don't have a US state in their title.

In [108]:
# Collect the rows whose title did not end in a US state (cleaned_title is missing)
has_no_state = df['cleaned_title'].isna()
removed_rows = df[has_no_state]  # Candidates for removal

# Show the candidate rows
print("The following rows can be removed, as they dont have state in their name:")
print(removed_rows)

The following rows can be removed, as they dont have state in their name:
                                          title      rev_id prediction   
1062                                  Utqiaġvik  1134590975       Stub  \
1901                                Los Angeles  1180364321          B   
2029                                  San Diego  1180363657         GA   
2032                              San Francisco  1180315260          B   
2144                  2020 United States census  1179894823          B   
2145                  2010 United States census  1179393707         FA   
2211                                     Denver  1180303115          B   
2419                  2020 United States census  1179894823          B   
2513                     County (United States)  1178988667         GA   
2756                                      Miami  1180007550          B   
2925                     County (United States)  1178988667         GA   
2953                                  

# Filtering out the rows that are not cities
Now, after getting the rows which don't have a state name in them, we can see that a few valid rows are returned, such as "New York City", "St. Louis", and "Riverview, St. Louis". We have to keep these rows, as they are relevant, and remove the rest.

In [109]:
# Hand-curated titles that have no trailing state name but must still be kept.
# (The list previously ended with a stray "" entry, which would have
# whitelisted a blank title; it has been removed.)
known_cities = [
    "Los Angeles", "San Diego", "San Francisco", "Denver", "Miami", "Atlanta",
    "New Orleans", "Eastwood, Syracuse", "Boston", "Nantucket", "Hyde Park, Boston",
    "Detroit", "Minneapolis", "Philadelphia", "Echols County", "Riverview, St. Louis",
    "Wailua, Kauai", "Pittsburgh", "Oklahoma City", "Salt Lake City", "Seattle",
    "Milwaukee", "Indianapolis", "St. Louis", "Las Vegas", "New York City",
]

# Rows to drop: no state was found in the title AND the title is not a known place
to_be_removed = df[df['cleaned_title'].isnull() & ~df['title'].isin(known_cities)]
to_be_removed

Unnamed: 0,title,rev_id,prediction,cleaned_title
1062,Utqiaġvik,1134590975,Stub,
2144,2020 United States census,1179894823,B,
2145,2010 United States census,1179393707,FA,
2419,2020 United States census,1179894823,B,
2513,County (United States),1178988667,GA,
2925,County (United States),1178988667,GA,
5680,County (United States),1178988667,GA,
13430,Population,1179591354,C,
13431,Square mile,1179747974,Start,
14027,Population,1179591354,C,


In [110]:
# Summarize the rows flagged for removal (row count, columns, non-null counts)
to_be_removed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 34 entries, 1062 to 22054
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title          34 non-null     object
 1   rev_id         34 non-null     int64 
 2   prediction     34 non-null     object
 3   cleaned_title  0 non-null      object
dtypes: int64(1), object(3)
memory usage: 1.3+ KB


## Filtering Out Non-US State Entries from the DataFrame
This section removes the rows identified above from the DataFrame, i.e. rows whose title neither ends in a US state nor appears in the list of known cities. These entries presumably do not correspond to places in the United States. The helper 'cleaned_title' column we created is dropped at the same time.

In [111]:
# Drop every row whose title was flagged for removal, then discard the helper column
flagged_titles = to_be_removed['title']
keep_mask = ~df['title'].isin(flagged_titles)
df = df.loc[keep_mask].drop(columns='cleaned_title')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22123 entries, 0 to 22156
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       22123 non-null  object
 1   rev_id      22123 non-null  int64 
 2   prediction  22123 non-null  object
dtypes: int64(1), object(2)
memory usage: 691.3+ KB


## Saving Cleaned Data to a New CSV File
This part saves the cleaned DataFrame, which now contains only the rows retained as US cities or counties (with the columns title, rev_id, and prediction), to a new CSV file named 'cleaned_data.csv'. The index column is excluded from the saved data.

In [112]:
# Write the cleaned DataFrame to disk, leaving out the index column
cleaned_data_path = '../data/cleaned_data.csv'
df.to_csv(cleaned_data_path, index=False)
