In [1]:
# imports-- file processing
import datetime
import os
import glob

import datetime

# data analysis libraries & SQL libraries
import numpy as np
import pandas as pd

In [23]:
# import scraped  sfc data--ie, San Francisco rental listings data
def recursively_import_all_CSV_and_concat_to_single_df(parent_direc, fn_regex=r'*.csv'):
    """Recursively search parent directory, and look up all CSV files.
    Then, import all CSV files to a single Pandas' df using pd.concat()"""
    path =  parent_direc # specify parent path of directories containing the scraped rental listings CSV data -- NB: use raw text--as in r'path...', or can we use the double-back slashes to escape back-slashes??
    df_concat = pd.concat((pd.read_csv(file) for file in glob.iglob(
        os.path.join(path, '**', fn_regex), 
        recursive=True)), ignore_index=True)  # os.path.join helps ensure this concatenation is OS independent
    return df_concat


# 
sf_scraped_direc = r'D:\Coding and Code projects\Python\craigslist_data_proj\CraigslistWebScraper\scraped_data\sfbay\sfc'

sf_data = recursively_import_all_CSV_and_concat_to_single_df(sf_scraped_direc)
sf_data.info() # sanity check



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 758 entries, 0 to 757
Data columns (total 47 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   listing_urls             758 non-null    object 
 1   ids                      751 non-null    float64
 2   sqft                     488 non-null    float64
 3   cities                   745 non-null    object 
 4   prices                   751 non-null    object 
 5   bedrooms                 745 non-null    float64
 6   bathrooms                745 non-null    object 
 7   attr_vars                745 non-null    object 
 8   listing_descrip          745 non-null    object 
 9   date_of_webcrawler       751 non-null    object 
 10  kitchen                  745 non-null    float64
 11  date_posted              745 non-null    object 
 12  region                   758 non-null    object 
 13  sub_region               758 non-null    object 
 14  cats_OK                  7

In [17]:
sf_data['cities'].value_counts()

Downtown / Civic / Van Ness    87
Soma / South Beach             86
Mission District               59
Lower Nob Hill                 32
Sunset / Parkside              31
                               ..
Oakland, Ca                     1
Mission                         1
Sunnyside/Tahoe City            1
South Lake Tahoe                1
San Francico, Inner Sunset      1
Name: cities, Length: 62, dtype: int64

In [18]:
sf_data['cities'].str.split('/', expand = False).str[0]   # split cities based on presence of the forward-slash delimiter (ie: '/'), and then parse only the 1st such element. This way, we will have only the primary (first) city listed for each 'split' city name. 


0                 Soma 
1         Lower Pac Hts
2      Mission District
3               Sunset 
4          Lower Haight
             ...       
753                 NaN
754                 NaN
755                 NaN
756                 NaN
757                 NaN
Name: cities, Length: 758, dtype: object

In [32]:
# clean SF data--change neighborhood names to simply SF city name
def clean_city_names_for_sf(list_of_sf_neighborhoods, df):
    """In sfbay craiglist listings,
    the name of the 'small' span HTML object
    --which contains the city names--shows the 
    names of SF neighborhoods instead of the city itself.
    
    Since we are only interested in city-level data,
    not within-city data such as city neighborhoods,
    we will replace any SF neighborhood names
    with simply the name of the city (ie, San Francisco).
    
    Make a case-insensitive search, since we only care 
    about basic spellings of neighborhood names. Denote
    any non-SF data as empty strings, and then remove 
    these records from dataset since we only want SF data."""

    sf_neighborhoods = list_of_sf_neighborhoods # input list of all neighborhoods in SF
    # 1.) remove records that are missing city names from dataset
    filtered_df = df.dropna(subset = ['cities'], how='any') # remove any records that are missing city names
    #2.) split cities that are listed as having multiple city names with a slash delimiter (ie, '/'), and parse the first city name of each such row
    filtered_df['cities'] = filtered_df['cities'].str.split('/', expand = False).str[0]   # split cities based on presence of the forward-slash delimiter (ie: '/'), and then parse only the 1st such element. This way, we will have only the primary (first) city listed for each 'split' city name. 
    
    # 3.) use str.contains() to look up any neighborhoods that are located within SF, and use np.where() to replace the neighborhood names with simply the city name (ie, 'San Francisco'). NB: if no SF neighborhoods are found, then impute row as null using np.null.  
    filtered_df['cities'] = pd.np.where(filtered_df['cities'].str.contains('|'.join(sf_neighborhoods), case=False), "San Francisco", '') # assign city name for any rows contain sf neighborhood names for the cities col
    # 4.) Remove all of the rows with cities imputed as empty strings--ie, because they are not actually located within San Francisco
    filtered_df_final = filtered_df[filtered_df['cities'].str.strip().astype(bool)]
    return filtered_df_final


# specify list of all sf neighborhoods
sf_neighborhoods = [
    'Anza Vista', 'Ashbury Heights', 'Ashbury Hts', 'Alamo Square'
    'Balboa Hollow', 'Balboa Terrace', 'Bayview', 'Belden Place', 'Bernal Heights',
    'Lower Pac Hts', 'Mission District', 'Lower Nob Hill', 'Downtown',
    'Buena Vista', 'Butchertown (Old and New)','Castro', 'Cathedral Hill', 'Cayuga Terrace',
    'China Basin','Chinatown', 'Civic Center','Clarendon Heights', 'Cole Valley',
    'Corona Heights', 'Cow Hollow','Crocker-Amazon','Design District','Diamond Heights', 'Diamond Hts', 'Dogpatch',
    'Dolores Heights', 'Duboce Triangle',' Embarcadero', 'Eureka Valley',
    'Excelsior', 'Fillmore', 'Financial District', "Fisherman's Wharf", 'Forest Hill', 'Forest Knolls', 'Glen Park', 'Golden Gate Heights', 
    'Haight', 'Hayes Valley', 'Hunters Point', 'India Basin', 'Ingleside', 'Inner Sunset', 'Jackson Square', 'Japantown',
    'Jordan Park', 'Laguna Honda', 'Lake Street', 'Lakeside', 'Lakeshore', 'Laurel Heights', 'Lincoln Manor', 'Little Hollywood', 'Little Russia',
    'Little Saigon', 'Lone Mountain', ' Lower Haight', 'Lower Pacific Heights', 'Lower Pac Hts', 'Lower Nob Hill', ' Marina', 'Merced Heights', 'Merced Manor', 'Midtown Terrace',
    'Mid-Market', 'Miraloma Park', 'Mission Bay', 'Mission', 'Mission Dolores', 'Mission Terrace', 'Monterey Heights', 'Mount Davidson',
    'nob hill', 'Noe Valley', 'nopa', 'North Beach', 'Panhandle', 'Oceanview','Outer Mission', ' Outer Sunset', 'Pacific Heights', 'Parkmerced',
    'Parkside', 'Parnassus', 'Polk Gulch', 'Portola', 'Portola Place', 'portola district'
    'Potrero Hill', 'Presidio', 'Presidio Heights', 'Richmond', 'Richmond District', 'Rincon Hill', 'Russian Hill', 'Saint Francis Wood', 'Sea Cliff', 
    'Sherwood Forest', 'South Beach', 'Silver Terrace', 'South End', 'South of Market', 'soma', 'Sunnydale', 'Sunnyside', 'Sunset', 
    'Telegraph Hill', 'Tenderloin', 'Treasure Island', 'Twin Peaks','Union Square', 'University Mound',
    'USF', 'Upper Market', 'Visitacion Valley', 'Vista del Mar', 'West Portal', 'Western Addition',
    'Westwood Highlands', 'Westwood Park', 'Yerba Buena', 'SFSU', 'CCSF', 'Fort Mason', 'Laurel Hts', 'UCSF', 'San Francisco'
    'Turk St', 'Showplace Square'
    ]


# re-label all SF neighborhoods as city name of SF, and remove all non-SF data:
sf_data = clean_city_names_for_sf(sf_neighborhoods, sf_data)


print(f"Sanity check--there should now only be city names of 'San Francisco':\n{sf_data['cities'].value_counts()}") # sanity check

Sanity check--there should now only be city names of 'San Francisco': San Francisco    669
Name: cities, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(self, *args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Replace the existing 'sfc' Nov 19, 2021 CSV file since we scraped these 'sfc' data prior to having this function ready for deployment:


In [43]:
# replace existing CSV file since we scraped these 'sfc' data prior to having this function ready for deployment:

direc_for_csv = r'D:/Coding and Code projects/Python/craigslist_data_proj/craigslist_webscraper/scraped_data/sfbay/sfc/craigslist_rental_sfbay_sfc_11_19_2021.csv'

sf_data.to_csv(direc_for_csv, index=False) # export
# "D:\\Coding and Code projects\\Python\\craigslist_data_proj\\craigslist_webscraper\\scraped_dat\\sfbay\\sfc\\craigslist_rental_sfbay_sfc_11_19_2021.csv")

# sanity check
sf_data_cleaned = recursively_import_all_CSV_and_concat_to_single_df(sf_scraped_direc)

sf_data_cleaned.cities.value_counts()

San Francisco    669
Name: cities, dtype: int64

## Next, clean the East Bay & South Bay data we've scraped prior to creating this data cleaning functions:

### Let's start with South Bay data:

In [2]:
# import South Bay data--Oct 2021 & mid-Nov 2021:
import pandas as pd
sby_Oct, sby_Nov = pd.read_csv('D:\Coding and Code projects\Python\craigslist_data_proj\craigslist_webscraper\scraped_data\sfbay\sby\craigslist_rental_sfbay_sby_10_28_2021.csv'), pd.read_csv('D:\Coding and Code projects\Python\craigslist_data_proj\craigslist_webscraper\scraped_data\sfbay\sby\craigslist_rental_sfbay_sby_11_12_2021.csv')

# sanity check
sby_Oct.head(), sby_Nov.head()

(                                        listing_urls           ids    sqft  \
 0  https://sfbay.craigslist.org/sby/apa/d/sunnyva...  7.400518e+09   300.0   
 1  https://sfbay.craigslist.org/sby/apa/d/palo-al...  7.400511e+09  1196.0   
 2  https://sfbay.craigslist.org/sby/apa/d/sunnyva...  7.400505e+09   571.0   
 3  https://sfbay.craigslist.org/sby/apa/d/san-jos...  7.400508e+09     NaN   
 4  https://sfbay.craigslist.org/sby/apa/d/mountai...  7.400506e+09   750.0   
 
            cities prices  bedrooms bathrooms  \
 0       Sunnyvale  2,100       0.0         1   
 1  San Jose South  5,583       3.0         2   
 2       Sunnyvale  2,665       0.0         1   
 3  San Jose North  1,000       1.0    shared   
 4   Mountain View  2,812       1.0         1   
 
                                            attr_vars  \
 0  air conditioning\ncats are OK - purrr\ndogs ar...   
 1  EV charging\nair conditioning\ncats are OK - p...   
 2  EV charging\nair conditioning\ncats are OK - p...   


In [6]:
# clean SJ or other city name data
def clean_given_city_names_data(city_name, list_of_neighborhood_names, df):
    """In the craigslist listings, the name of the 'small' span HTML object
    --which contains the city names--shows various names for regions of 
    cities such as San Jose, such as 'San Jose downtown', 'San Jose South', etc.

    We will use str contains to look up the given city substring (e.g., 'San Jose'),
    and replace these with simply the name of the city itself: ie, 'San Jose'."""
    # 1.) remove records that are missing city names from dataset
    filtered_df = df.dropna(subset = ['cities'], how='any') # remove any records that are missing city names
    #2.) split cities that are listed as having multiple city names with a slash delimiter (ie, '/'), and parse the first city name of each such row
    filtered_df['cities'] = filtered_df['cities'].str.split('/', expand = False).str[0]   # split cities based on presence of the forward-slash delimiter (ie: '/'), and then parse only the 1st such element. This way, we will have only the primary (first) city listed for each 'split' city name. 

    # 3.) use str.contains() to look up city names with given neighborhood names. Replace these with simply the city name of 'San Jose'
    filtered_df['cities'] =  pd.np.where(filtered_df['cities'].str.contains('|'.join(list_of_neighborhood_names), case=False), city_name, filtered_df['cities']) # assign city name for any rows containing given city neighborhood names, else simply leave row unchanged.     
    return filtered_df


# specify list of neighborhoods for South Bay data that we need to clean:
sj_neighborhoods = ['San Jose', 'Cambrian' 'East Foothills', 'Evergreen']

santa_clara_neighborhoods = ['Santa Clara', 'Willow Glen']

# clean San Jose data-- for Oct & Nov data, respectively 
sby_Oct, sby_Nov = clean_given_city_names_data('San Jose', sj_neighborhoods, sby_Oct), clean_given_city_names_data('San Jose', sj_neighborhoods, sby_Nov)

# clean Santa Clara data
sby_Oct, sby_Nov = clean_given_city_names_data('Santa Clara', santa_clara_neighborhoods, sby_Oct), clean_given_city_names_data('Santa Clara', santa_clara_neighborhoods, sby_Nov)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [10]:
sby_Oct.cities.value_counts()

San Jose             469
Sunnyvale            166
Cupertino            139
Santa Clara          114
Mountain View         84
Campbell              44
Milpitas              18
Los Gatos             16
Morgan Hill           14
Hollister              6
Gilroy                 5
Saratoga               3
El Sobrante, Ca        2
Fremont                1
Hercules               1
San Juan Bautista      1
East Foothills         1
Name: cities, dtype: int64

### Replace existing CSV scraped files with cleaned variants:

In [12]:
# replace existing CSV files for South Bay data:
# Oct data
sby_Oct.to_csv('D:\Coding and Code projects\Python\craigslist_data_proj\craigslist_webscraper\scraped_data\sfbay\sby\craigslist_rental_sfbay_sby_10_28_2021.csv')
 # Nov data
sby_Nov.to_csv('D:\Coding and Code projects\Python\craigslist_data_proj\craigslist_webscraper\scraped_data\sfbay\sby\craigslist_rental_sfbay_sby_11_12_2021.csv')


# 

### Clean East Bay data:

In [13]:
# import East Bay data
import pandas as pd
eby_Oct = pd.read_csv('D:\Coding and Code projects\Python\craigslist_data_proj\craigslist_webscraper\scraped_data\sfbay\eby\craigslist_rental_sfbay_eby_10_29_2021.csv')

eby_early_Nov = pd.read_csv('D:\Coding and Code projects\Python\craigslist_data_proj\craigslist_webscraper\scraped_data\sfbay\eby\craigslist_rental_sfbay_eby_11_08_2021.csv')

eby_late_Nov = pd.read_csv('D:\Coding and Code projects\Python\craigslist_data_proj\craigslist_webscraper\scraped_data\sfbay\eby\craigslist_rental_sfbay_eby_11_28_2021.csv')

# sanity checks
eby_Oct.head(),  eby_early_Nov.head(), eby_late_Nov.head()


(                                        listing_urls           ids   sqft  \
 0  https://sfbay.craigslist.org/eby/apa/d/berkele...  7.401006e+09    NaN   
 1  https://sfbay.craigslist.org/eby/apa/d/san-ram...  7.401008e+09  898.0   
 2  https://sfbay.craigslist.org/eby/apa/d/fremont...  7.401007e+09  615.0   
 3  https://sfbay.craigslist.org/eby/apa/d/pleasan...  7.400999e+09    NaN   
 4  https://sfbay.craigslist.org/eby/apa/d/oakland...  7.397958e+09  800.0   
 
                             cities prices  bedrooms bathrooms  \
 0                         Berkeley  4,450       3.0         1   
 1             Danville / San Ramon  2,676       1.0         1   
 2    Fremont / Union City / Newark  2,013       1.0         1   
 3  Dublin / Pleasanton / Livermore  2,850       0.0         1   
 4     Oakland Lake Merritt / Grand  2,200       2.0         1   
 
                                            attr_vars  \
 0  cats are OK - purrr\nflooring: wood\nfurnished...   
 1  EV charging\na

In [14]:
# clean East Bay data:

# specify lists of city & neighborhood names to look up: 
oakland_neighborhoods = ['Oakland', 'Lake Merritt']

alameda_neighborhoods = ['Alameda']

hayward_neighborhoods = ['Hayward']

richmond_neighborhoods = ['Richmond']

# clean Oakland data:
eby_Oct,  eby_early_Nov, eby_late_Nov = clean_given_city_names_data('Oakland', oakland_neighborhoods, eby_Oct), clean_given_city_names_data('Oakland', oakland_neighborhoods, eby_early_Nov), clean_given_city_names_data('Oakland', oakland_neighborhoods, eby_late_Nov)

# clean Alameda
eby_Oct,  eby_early_Nov, eby_late_Nov = clean_given_city_names_data('Alameda', alameda_neighborhoods, eby_Oct), clean_given_city_names_data('Alameda', alameda_neighborhoods, eby_early_Nov), clean_given_city_names_data('Alameda', alameda_neighborhoods, eby_late_Nov)


# clean Richmond
eby_Oct,  eby_early_Nov, eby_late_Nov = clean_given_city_names_data('Richmond', richmond_neighborhoods, eby_Oct), clean_given_city_names_data('Richmond', richmond_neighborhoods, eby_early_Nov), clean_given_city_names_data('Richmond', richmond_neighborhoods, eby_late_Nov)

# clean Hayward
eby_Oct,  eby_early_Nov, eby_late_Nov = clean_given_city_names_data('Hayward ', hayward_neighborhoods, eby_Oct), clean_given_city_names_data('Hayward ', hayward_neighborhoods, eby_early_Nov), clean_given_city_names_data('Hayward ', hayward_neighborhoods, eby_late_Nov)

# sanity check
eby_Oct.cities.value_counts()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Oakland                                261
Dublin                                 159
Concord                                123
Hayward                                110
Berkeley                               101
Fremont                                 93
Alameda                                 84
Pittsburg                               80
San Leandro                             55
Walnut Creek                            51
Hercules, Pinole, San Pablo, El Sob     44
Danville                                36
Lafayette                               28
Vallejo                                 19
Emeryville                              18
Fairfield                               18
Berkeley North                          16
Albany                                  15
Richmond                                15
Brentwood                                9
South Lake Tahoe                         5
Albany                                   4
Vallejo                                  3
Briarwood A

## Replace CSV files with cleaned data


In [19]:
def to_csv_no_index(path_and_csv_file, df):
    return df.to_csv(path_and_csv_file, index=False)

# eby_Oct
to_csv_no_index('D:\Coding and Code projects\Python\craigslist_data_proj\craigslist_webscraper\scraped_data\sfbay\eby\craigslist_rental_sfbay_eby_10_29_2021.csv', eby_Oct)
eby_Oct.to_csv('D:\Coding and Code projects\Python\craigslist_data_proj\craigslist_webscraper\scraped_data\sfbay\eby\craigslist_rental_sfbay_eby_10_29_2021.csv', index=False)

# ealy Nov
to_csv_no_index('D:\Coding and Code projects\Python\craigslist_data_proj\craigslist_webscraper\scraped_data\sfbay\eby\craigslist_rental_sfbay_eby_11_08_2021.csv', eby_early_Nov)
# late Nov
to_csv_no_index('D:\Coding and Code projects\Python\craigslist_data_proj\craigslist_webscraper\scraped_data\sfbay\eby\craigslist_rental_sfbay_eby_11_28_2021.csv', eby_late_Nov)