### Test the function that cleans and un-splits the city names in the scraped rental-listings CSV data:

In [1]:
# imports-- file processing
import os
import glob

# data analysis libraries & SQL libraries
import numpy as np
import pandas as pd
# SQL ODBC for API connection between Python & SQL Server
import pyodbc
import sqlalchemy as sa

In [2]:
def recursively_import_all_CSV_and_concat_to_single_df(parent_direc, fn_regex=r'*.csv'):
    """Recursively load every matching CSV file under a directory into one DataFrame.

    Parameters
    ----------
    parent_direc : str or path-like
        Root directory; it and all of its sub-directories are searched.
    fn_regex : str, default '*.csv'
        File-name pattern. Despite the parameter name this is a glob
        (shell-style wildcard) pattern, not a regular expression.

    Returns
    -------
    pandas.DataFrame
        The rows of all matching files stacked on top of each other, with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file under ``parent_direc`` matches ``fn_regex``.
    """
    # os.path.join keeps the search pattern OS independent; '**' together with
    # recursive=True makes glob descend into every sub-directory.
    search_pattern = os.path.join(parent_direc, '**', fn_regex)
    # glob yields files in an OS-dependent order; sorting makes the row order
    # of the combined DataFrame reproducible between runs and machines.
    csv_paths = sorted(glob.iglob(search_pattern, recursive=True))
    if not csv_paths:
        # pd.concat would fail with the opaque "No objects to concatenate";
        # raise the same exception type with a message that names the cause.
        raise ValueError(f"No files matching {fn_regex!r} were found under {parent_direc!r}")
    return pd.concat((pd.read_csv(csv_path) for csv_path in csv_paths), ignore_index=True)

## Import Dataset
# Root folder that holds the scraped SF Bay Area rental-listing CSV files.
# NB: inside a raw string (r"...") a single backslash is already literal, so the
# separators must not be doubled as well -- use raw strings OR escaped
# backslashes, not both (doing both puts two backslashes into the path).
scraped_data_path = r"D:\Coding and Code projects\Python\craigslist_data_proj\CraigslistWebScraper\scraped_data\sfbay"

# load every CSV below that folder into one DataFrame and summarise its columns
df = recursively_import_all_CSV_and_concat_to_single_df(scraped_data_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18645 entries, 0 to 18644
Data columns (total 48 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   listing_urls             18645 non-null  object 
 1   ids                      17236 non-null  float64
 2   sqft                     13209 non-null  float64
 3   cities                   17219 non-null  object 
 4   prices                   17227 non-null  object 
 5   bedrooms                 17184 non-null  float64
 6   bathrooms                17184 non-null  object 
 7   attr_vars                17220 non-null  object 
 8   listing_descrip          17220 non-null  object 
 9   date_of_webcrawler       17236 non-null  object 
 10  kitchen                  17220 non-null  float64
 11  date_posted              17220 non-null  object 
 12  region                   18645 non-null  object 
 13  sub_region               18645 non-null  object 
 14  cats_OK               

In [17]:
# Work on an independent copy: the raw frame stays untouched, so the cleaning
# cells can be re-run without re-reading the CSV files.
df1 = df.copy(deep=True)

In [21]:
def clean_split_city_names(df, address_critera: list, neighborhood_criteria:list, split_city_delimiters: list, incorrect_city_names:dict, cities_not_in_region:dict, cities_that_need_extra_cleaning:dict):
    """Return a cleaned copy of ``df``; the DataFrame passed in is not modified.

    The 'cities' column is cleaned in this order:
      a.) Keep only the text after the last address token (e.g. 'Rd', 'Blvd') and
          delete neighborhood tokens (e.g. 'Downtown'); then strip leading whitespace.
      b.) Un-split city names that contain a delimiter (e.g. ',' or '/') by keeping
          the part before the first delimiter.
      c.) Replace abbreviated or misspelled city names, and blank out cities that
          are not in the region (e.g. 'Redding').
      d.) Delete every run of digits.
      e.) Drop rows whose city is null or an empty string after the steps above.
      f.) Strip surrounding whitespace, so the same city is not treated as
          different entities by pandas, Python, or SQL.
      g.) Capitalize each whitespace-separated word (an apostrophe does not start
          a new word, so "mariner's" becomes "Mariner's").
      h.) Replace names that were damaged by the substring removal in step a.)
          (e.g. '. Helena' -> 'St. Helena').

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'cities' column of strings (nulls allowed).
    address_critera : list of str
        Address tokens for step a.). An empty list skips the address split.
    neighborhood_criteria : list of str
        Neighborhood tokens for step a.). An empty list skips the removal.
    split_city_delimiters : list of str
        Delimiters for step b.). An empty list skips the un-splitting.
    incorrect_city_names : dict
        Exact whole-value replacements for step c.).
    cities_not_in_region : dict
        Exact whole-value replacements for step c.); map a city to '' to drop its rows.
    cities_that_need_extra_cleaning : dict
        Exact whole-value replacements for step h.), applied after capitalization.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with their original index labels.

    Notes
    -----
    The entries of each list argument are joined with '|' and used as one search
    pattern. The neighborhood pattern is always a regular expression, and pandas
    treats a split pattern longer than one character as a regular expression as
    well, so regex metacharacters in the entries are live: the '.' in 'Ca.'
    matches any character. Matching is case-sensitive and not anchored to word
    boundaries.
    """
    df = df.copy()  # never write to the caller's DataFrame
    cities = df['cities']

    # a.) address & neighborhood removal. An empty criteria list would give the
    # pattern '', which splits between every character, so such steps are skipped.
    if address_critera:
        cities = cities.str.split('|'.join(address_critera)).str[-1]
    if neighborhood_criteria:
        # regex=True must be explicit: since pandas 2.0 str.replace defaults to a
        # literal match, which would never find the pipe-joined pattern.
        cities = cities.str.replace('|'.join(neighborhood_criteria), '', case=True, regex=True)
    cities = cities.str.lstrip()

    # b.) un-split city names: keep the part before the first delimiter
    if split_city_delimiters:
        cities = cities.str.split('|'.join(split_city_delimiters)).str[0]

    # c.) exact-match fixes for abbreviated/misspelled names, then blank out
    # cities that are not located in the region
    for mapping in (incorrect_city_names, cities_not_in_region):
        if mapping:
            cities = cities.replace(mapping)

    # d.) remove digits (raw string, so '\d' is not an invalid escape sequence)
    cities = cities.str.replace(r'\d+', '', regex=True)

    # e.) & f.) strip whitespace, then drop rows whose city is null or empty
    cities = cities.str.strip()
    keep = cities.notna() & cities.ne('')
    df = df.loc[keep].copy()
    cities = cities.loc[keep]

    # g.) capitalize every whitespace-separated word; unlike str.title() this
    # leaves the letter after an apostrophe in lower case
    cities = cities.map(lambda name: ' '.join(word.capitalize() for word in name.split()))

    # h.) repair names damaged by the substring removal in step a.)
    if cities_that_need_extra_cleaning:
        cities = cities.replace(cities_that_need_extra_cleaning)

    df['cities'] = cities
    return df

# Address / street-type tokens to strip from the city names. clean_split_city_names()
# splits each value on these tokens and keeps only the text after the last match.
address_criteria = ['Boulevard', 'Blvd', 'Road', 'Rd', 'Avenue', 'Ave', 'Street', 'St', 'Drive', 'Dr', 'Real', 'E Hillsdale Blvd'] 
# Extraneous neighborhood / county / state tokens (e.g. 'Downtown') that are deleted from the city names.
# NOTE(review): the entries are joined with '|' into a single search pattern, so the '.' in 'Ca.'
# presumably acts as a regex wildcard (matching e.g. 'Cam' in 'Campbell') -- that would explain
# fix-ups such as 'Pbell' -> 'Campbell' further below; confirm before changing either list.
neighborhood_criteria = ['Downtown', 'Central/Downtown', 'North', 'California', 'Ca.', 'Bay Area', 'St. Helena', 'St', 'nyon', 
'Jack London Square', 'Walking Distance To', 'El Camino', 'Mendocino County', 'San Mateo County', 'Alameda County', 'Rio Nido Nr', 'Mission Elementary', 
'Napa County', 'Golden Gate', 'Jennings', 'South Lake Tahoe', 'Tahoe Paradise', 'Kingswood Estates', 'South Bay', 'Skyline', 'San Antonio Tx', 
'East Bay', 'Morton Dr'] 

# Delimiters that separate several names inside one value; only the part before the first delimiter is kept:
split_city_delimiters =  [',', '/']
# Abbreviated & misspelled city names mapped to their correct spelling (exact, whole-value matches).
# A value of '' blanks the name, and rows left with an empty city are dropped by the cleaning function:
incorrect_city_names = {'Rohnert Pk':'Rohnert Park', 'Hillsborough Ca': 'Hillsborough', 'South Sf': 'South San Francisco', 'Ca':'', 'East San Jose':'San Jose', 'Vallejo Ca':'Vallejo', 'Westgate On Saratoga .':'San Jose', 'Bodega':'Bodega Bay', 'Briarwood At Central Park':'Fremont', 'Campbell Ca':'Campbell', 'Almaden':'San Jose', '.':'', 'East Foothills':'San Jose', 'Lake County':'', 'Redwood Shores':'Redwood City'}

# Cities that are not located in the sfbay region; each maps to '' so that its rows get dropped:
cities_not_in_region = {'Ketchum':'', 'Baypoinr':'', 'Quito': '', 'Redding':'', 'Bend' :''}

# City names that end up misspelled after the street and neighborhood substrings were removed
# (e.g. '. Helena' once 'St' is stripped); applied last, after the names have been capitalized:
cities_that_need_extra_cleaning = {'. Helena': 'St. Helena', '. Helena Deer Park': 'St. Helena', 'San Los':'San Carlos', 'Tro Valley':'Castro Valley', 'Rohnert Pk':'Rohnert Park',
'Pbell':'Campbell', 'Pbell Ca':'Campbell', 'American Yon':'American Canyon'}

# Clean the city names. NB: df1 is rebound to the cleaned result, so re-running this cell
# feeds already-cleaned data back into the function.
df1 = clean_split_city_names(df1, address_criteria, neighborhood_criteria, split_city_delimiters, incorrect_city_names, cities_not_in_region, cities_that_need_extra_cleaning)
# Sanity check: show the last 10 entries of value_counts(), where leftover oddities are easiest to spot
print(f"Sanity check--after cleaning the city names, let's examine some of the cleaned data: {df1.cities.value_counts().tail(10)}")

Sanity check--after cleaning the city names, let's examine some of the cleaned data: Green Valley        1
San Pablo           1
Livermore           1
Mariner's Island    1
Pleasanton          1
Martinez            1
Portola Valley      1
American Canyon     1
Orinda              1
Discovery Bay       1
Name: cities, dtype: int64




In [18]:
# Show the notebook's current working directory; relative file paths (such as a
# bare CSV file name) are resolved against it.
# `os` is also imported in the first cell, so this re-import is redundant but harmless.
import os
os.getcwd()

'd:\\Coding and Code projects\\Python\\craigslist_data_proj\\CraigslistWebScraper\\Rentals'

In [23]:
# Export the cleaned id / city pairs to a CSV file in the current working directory.
# index=False keeps the DataFrame's row labels out of the file; otherwise they are
# written as an unnamed first column that comes back as 'Unnamed: 0' when the file is re-read.
df1[['ids', 'cities']].to_csv('cleaned_cities_demo.csv', index=False)

In [16]:
# Spot-check a single listing: select the row(s) of the raw data whose id equals 7408703227
df[df['ids'].eq(7408703227)]

Unnamed: 0.1,listing_urls,ids,sqft,cities,prices,bedrooms,bathrooms,attr_vars,listing_descrip,date_of_webcrawler,...,is_furnished,attached_garage,detached_garage,carport,off_street_parking,no_parking,EV_charging,air_condition,no_smoking,Unnamed: 0
3523,https://sfbay.craigslist.org/eby/apa/d/concord...,7408703000.0,,American Canyon,2887,1.0,1,air conditioning\nflooring: wood\nfurnished\na...,1 bedroom available in 2 story house in americ...,2021-11-28,...,1,0,0,0,1,0,0,1,1,
