## In Jan 2022, I added 2 new cols that the webcrawler will scrape: namely, flat & land housing/rental property types. 

## However, I accidentally commented out the 'duplex' housing type, and we need to re-enter this col type to the CSV files that contain Jan 1-Jan 20, 2022 data.

## How should we accomplish this?

### 1.) Create a function that recursively imports each CSV file from a given subregion directory/path as separate dataframes stored within a dictionary of dataframes.

### -- This way, we can import separate dictionaries of dataframes, and each df within each dictionary comprises a specific CSV file (ie, data scraped for a given subregion on a given date).

### 2.) 2ndly, we need to add in the 'duplex' col, but we need to ensure the col is located in the correct position.

### 3.) Finally, export each df from the dictionary of dfs back to the original CSV files. 

## 1.) Import each CSV file from a given subregion as separate dfs in a dict of dataframes

In [1]:
## library imports
# imports-- file processing
import os
import glob

# data analysis libraries
import numpy as np
import pandas as pd

# for datetime manipulation and filtering
import datetime
datetime.datetime.strptime


<function datetime.strptime>

In [2]:
## Import data function
def import_csvs_from_path_as_separate_dfs(path):
    """Load every CSV file directly inside ``path`` into its own DataFrame.

    Only the top level of ``path`` is scanned (no recursion into
    subdirectories); an entry is picked up when its name ends with ``.csv``.

    Args:
        path: directory that holds the CSV files.

    Returns:
        dict mapping each file name (without the ``.csv`` extension) to the
        DataFrame that ``pd.read_csv`` returns for that file. The dict is
        empty when the directory contains no CSV files.
    """
    # Build full paths rather than calling os.chdir(): changing the
    # process-wide working directory would silently affect every relative
    # path used later in the session.
    csv_names = [name for name in os.listdir(path) if name.endswith('.csv')]

    # Key each DataFrame by its file name minus the extension.
    return {
        os.path.splitext(name)[0]: pd.read_csv(os.path.join(path, name))
        for name in csv_names
    }



## 1.) Import each CSV file from each subregion as separate dictionaries of dfs: 

In [3]:
# NB: let's start by importing South Bay data:

# 1st, specify path of South Bay data.
# Raw string: the backslashes are Windows path separators, not escape sequences.
sby_data_path = r"D:\Coding and Code projects\Python\craigslist_data_proj\CraigslistWebScraper\scraped_data\sfbay\sby"


# then, import all sby CSV files as separate DataFrames contained within a dictionary of DataFrames
sby_dict = import_csvs_from_path_as_separate_dfs(sby_data_path)  # import sby data as a dict of dfs

# sanity check
print(f"South Bay data: {sby_dict}")



South Bay data: {'craigslist_rental_sfbay_sby_01_05_2022':                                           listing_urls           ids    sqft  \
0    https://sfbay.craigslist.org/sby/apa/d/san-jos...  7.428833e+09     NaN   
1    https://sfbay.craigslist.org/sby/apa/d/san-jos...  7.428836e+09     NaN   
2    https://sfbay.craigslist.org/sby/apa/d/san-jos...  7.428833e+09     NaN   
3    https://sfbay.craigslist.org/sby/apa/d/saratog...  7.426714e+09  1650.0   
4    https://sfbay.craigslist.org/sby/apa/d/san-jos...  7.428813e+09  1100.0   
..                                                 ...           ...     ...   
689  https://sfbay.craigslist.org/sby/apa/d/san-jos...  7.426201e+09     NaN   
690  https://sfbay.craigslist.org/sby/apa/d/san-jos...  7.426196e+09     NaN   
691  https://sfbay.craigslist.org/sby/apa/d/san-jos...  7.426194e+09     NaN   
692  https://sfbay.craigslist.org/sby/apa/d/san-jos...  7.426190e+09     NaN   
693  https://sfbay.craigslist.org/sby/apa/d/san-jos...  7.426

In [4]:
# import North Bay data
# specify path
# Raw string: the backslashes are Windows path separators, not escape sequences
# (in particular, "\n" in "\nby" must not become a newline).
nby_data_path = r"D:\Coding and Code projects\Python\craigslist_data_proj\CraigslistWebScraper\scraped_data\sfbay\nby"


# then, import all CSV files as separate DataFrames contained within a dictionary of DataFrames
nby_dict = import_csvs_from_path_as_separate_dfs(nby_data_path)  # import nby data as a dict of dfs


In [5]:
# import SF data
# specify path
# Raw string: the backslashes are Windows path separators, not escape sequences.
sfc_data_path = r"D:\Coding and Code projects\Python\craigslist_data_proj\CraigslistWebScraper\scraped_data\sfbay\sfc"


# then, import all CSV files as separate DataFrames contained within a dictionary of DataFrames
sfc_dict = import_csvs_from_path_as_separate_dfs(sfc_data_path)  # import sfc data as a dict of dfs

In [6]:
# import Peninsula data
# specify path
# Raw string: the backslashes are Windows path separators, not escape sequences.
pen_data_path = r"D:\Coding and Code projects\Python\craigslist_data_proj\CraigslistWebScraper\scraped_data\sfbay\pen"


# then, import all CSV files as separate DataFrames contained within a dictionary of DataFrames
pen_dict = import_csvs_from_path_as_separate_dfs(pen_data_path)  # import pen data as a dict of dfs


In [7]:
# import East Bay data
# specify path
# Raw string: the backslashes are Windows path separators, not escape sequences.
eby_data_path = r"D:\Coding and Code projects\Python\craigslist_data_proj\CraigslistWebScraper\scraped_data\sfbay\eby"


# then, import all CSV files as separate DataFrames contained within a dictionary of DataFrames
eby_dict = import_csvs_from_path_as_separate_dfs(eby_data_path)  # import eby data as a dict of dfs


In [8]:
# import Santa Cruz data
# specify path
# Raw string: the backslashes are Windows path separators, not escape sequences.
scz_data_path = r"D:\Coding and Code projects\Python\craigslist_data_proj\CraigslistWebScraper\scraped_data\sfbay\scz"


# then, import all CSV files as separate DataFrames contained within a dictionary of DataFrames
scz_dict = import_csvs_from_path_as_separate_dfs(scz_data_path)  # import scz data as a dict of dfs


## 2.) Next, we need to add the 'duplex' housing type col.

## a.) but we need to ensure it's in the correct position:
### --namely, 'duplex' should be located in *between* the 'single_fam' & 'is_furnished' columns!!

### Look up index location of 'single_fam' col since we need this to be a contiguous col as we add 'duplex': 

In [9]:
# --NB: ensure that we place the new col at the correct index location
# We can look up index location of given col name via the .columns and .get_loc(column_name) methods:

# look up index location of 'single_fam' col
def look_up_index_loc_of_col(df, col):
    """Look up where the column named ``col`` sits among ``df``'s columns.

    Returns whatever ``df.columns.get_loc(col)`` yields (a 0-based integer
    position when the column name is unique).
    """
    column_labels = df.columns
    return column_labels.get_loc(col)

# The column layout is taken to be the same for every df in every subregion
# dict, so one representative df is enough to locate 'single_fam'.
# Here the Jan 5th South Bay df serves as that representative.
index_for_single_fam = look_up_index_loc_of_col(
    df=sby_dict['craigslist_rental_sfbay_sby_01_05_2022'],
    col='single_fam',
)

print(f"The index location of the 'single_fam' col--which is uniform across all of the dfs and will be contiguous to the new 'duplex' col is:\n{index_for_single_fam}")

The index location of the 'single_fam' col--which is uniform across all of the dfs and will be contiguous to the new 'duplex' col is:
36


### 2 a) ii) Data cleaning--remove dataframes that * already* contain the 'duplex' col (see below for more details) 

## NB!: The 'duplex' col is *already* located in several dataframes (so apparently, the mistake in commenting out the 'duplex' col happened only after these files were scraped):

### a) the Jan 2nd North Bay (ie, nby) file, b.) Jan 2nd Peninsula (ie, pen) file; c.) Jan 3rd East Bay file; & d) Jan 3rd SF  (sfc) file

### However, the other files do *not* have the 'duplex' col--ie, Jan 9th & Jan 17th for the North Bay data.

### Also, observe that none of the South Bay (sby) or Santa Cruz (scz) files include the duplex col. This means I can freely add the 'duplex' col to each of the dfs in these *2* dictionaries of dataframes.

### But this means we need to more carefully manage the dataframes from all of the other dictionaries of dataframes!!


### Namely: we need to remove each of the 4 aforementioned dfs from their respective subregion dictionaries of dataframes. We can use the Python del keyword to remove a key from a dictionary:


## Remove the dataframes --ie, which already have the duplex cols--from the relevant dictionaries: 

In [10]:
# Drop the Jan 2nd North Bay df from the dict (raises KeyError if it is not there)
nby_dict.pop('craigslist_rental_sfbay_nby_01_02_2022')

# sanity check-- the Jan 2nd df should no longer be listed
print(f"North Bay data after removing df from dictionary:\n{nby_dict}")

North Bay data after removing df from dictionary:
{'craigslist_rental_sfbay_nby_01_09_2022':                                           listing_urls         ids    sqft  \
0    https://sfbay.craigslist.org/nby/apa/d/napa-yo...  7427249064   400.0   
1    https://sfbay.craigslist.org/nby/apa/d/san-raf...  7426349094   920.0   
2    https://sfbay.craigslist.org/nby/apa/d/santa-r...  7430776818     NaN   
3    https://sfbay.craigslist.org/nby/apa/d/ross-bd...  7427839766     NaN   
4    https://sfbay.craigslist.org/nby/apa/d/mill-va...  7427185241   380.0   
..                                                 ...         ...     ...   
205  https://sfbay.craigslist.org/nby/apa/d/fairfax...  7418393626     NaN   
206  https://sfbay.craigslist.org/nby/apa/d/occiden...  7427780846  2850.0   
207  https://sfbay.craigslist.org/nby/apa/d/eldridg...  7427732992     NaN   
208  https://sfbay.craigslist.org/nby/apa/d/santa-r...  7427652267     NaN   
209  https://sfbay.craigslist.org/nby/apa/d/caspa

In [11]:
# Drop the Jan 2nd Peninsula df and the Jan 3rd East Bay df from their dicts.
# A single del statement handles both targets, left to right.
del (
    pen_dict['craigslist_rental_sfbay_pen_01_02_2022'],
    eby_dict['craigslist_rental_sfbay_eby_01_03_2022'],
)




In [12]:
# Remove the Jan 3rd SF df from sfc_dict; per the notes above, that file already
# contains the 'duplex' col. `del` raises KeyError if the key is absent
# (e.g. when this cell is run a second time).
del sfc_dict['craigslist_rental_sfbay_sfc_01_03_2022'] # remove the Jan 3rd SF df key


### NB: Now that we've removed any dfs that already include the duplex col, we can proceed with the data transformations:

## 2. b) Add 'duplex' as a new col to each df:

In [13]:
# create indicator var using numpy and Pandas' str.contains() based on scraped rental listing attributes and descriptions
def indicator_vars_from_scraped_data(df, col_to_parse, attr_substr):
    """Return a 0/1 indicator Series flagging rows whose text contains a pattern.

    Args:
        df: DataFrame holding the scraped listing data.
        col_to_parse: name of the string column to search.
        attr_substr: pattern handed to ``Series.str.contains`` (which treats
            it as a regular expression by default).

    Returns:
        Series indexed like ``df``: 1 where ``df[col_to_parse]`` contains
        ``attr_substr``, otherwise 0. Rows with missing text count as 0.
    """
    # na=False: without it, missing text yields NaN, which np.where treats as
    # truthy and would wrongly flag the row as 1.
    has_attr = df[col_to_parse].str.contains(attr_substr, na=False)

    # Reuse df's index so that assigning the result back onto df lines up
    # row-for-row even when df does not carry a default 0..n-1 index.
    return pd.Series(np.where(has_attr, 1, 0), index=df.index)


# Run indicator_vars_from_scraped_data() over every df held in a dictionary of dfs:
def apply_func_to_dict_of_dfs_and_create_col(dict):
    """Add a 'duplex' indicator col to each dataframe in a dictionary of dataframes.

    Each df is modified in place: its 'duplex' col is set from a search of
    its 'attr_vars' col for the text 'duplex'. The same dictionary object is
    returned.
    """
    for frame in dict.values():
        frame['duplex'] = indicator_vars_from_scraped_data(frame, 'attr_vars', 'duplex')
    return dict


### Create and add the 'duplex' housing type indicator var to every subregion dictionary of dfs.
# Processing order: South Bay, North Bay, Peninsula, East Bay, SF, Santa Cruz.
sby_dict, nby_dict, pen_dict, eby_dict, sfc_dict, scz_dict = [
    apply_func_to_dict_of_dfs_and_create_col(region_dict)
    for region_dict in (sby_dict, nby_dict, pen_dict, eby_dict, sfc_dict, scz_dict)
]

## 2.) c.) Next, we need to move the location of the 'duplex' col to match the new webcrawler specifications:



In [14]:
def move_col_loc_for_df_dict(dict, col_name='duplex', new_index=None, *, verbose=True):
    """Move one column to a given position in every dataframe of a dictionary.

    Each df is modified in place: the column is popped and re-inserted at the
    target position. The same dictionary object is returned.

    Args:
        dict: dictionary whose values are the dataframes to modify. (The name
            shadows the builtin; it is kept so existing callers are unaffected.)
        col_name: name of the column to move. Defaults to 'duplex'.
        new_index: 0-based position to move the column to. When None, the
            module-level ``index_for_duplex`` is used, as before.
        verbose: when True (the default), print each popped column.

    Returns:
        The same dictionary that was passed in.

    Raises:
        KeyError: if a dataframe has no column named ``col_name``.
    """
    for key in dict:
        # Resolve the fallback lazily so the global is only required when a
        # df is actually processed without an explicit position.
        target_index = index_for_duplex if new_index is None else new_index

        col = dict[key].pop(col_name)  # removes the col from the df and returns its data
        if verbose:
            print(f"Col object:\n{col}")
        dict[key].insert(target_index, col_name, col)  # re-insert at the wanted position
    return dict


## 1.) Before applying the func, work out the index location the 'duplex' col must move to.
### --NB: 'duplex' has to sit directly after 'single_fam', so its target location is the 'single_fam' index plus 1.

# target location for 'duplex' = one position past 'single_fam'
index_for_duplex = index_for_single_fam + 1

# sanity check on the intended index location for duplex:
print(f"Index location where we want to move 'duplex' col to:\n{index_for_duplex}")

Index location where we want to move 'duplex' col to:
37


In [71]:
# Display the Jan 5th South Bay entry of sby_dict.
# NOTE(review): the output recorded for this cell is a single 'duplex' Series rather
# than a whole dataframe, so it looks like it came from a different version of this cell -- confirm.
sby_dict['craigslist_rental_sfbay_sby_01_05_2022']

0      0
1      0
2      0
3      0
4      0
      ..
689    0
690    0
691    0
692    0
693    0
Name: duplex, Length: 694, dtype: int32

In [15]:
## 2.) Now, move the 'duplex' col within every subregion dictionary of dataframes.
# Processing order: South Bay, North Bay, Peninsula, East Bay, SF, Santa Cruz.
sby_dict, nby_dict, pen_dict, eby_dict, sfc_dict, scz_dict = [
    move_col_loc_for_df_dict(region_dict)
    for region_dict in (sby_dict, nby_dict, pen_dict, eby_dict, sfc_dict, scz_dict)
]

# sanity check: position 37 (0-based) is the 38th column
print(f"The 38th column in this South Bay df should be the new 'duplex' col:\n{sby_dict['craigslist_rental_sfbay_sby_01_05_2022'].iloc[:, 37]}")


Col object:
0      0
1      0
2      0
3      0
4      0
      ..
689    0
690    0
691    0
692    0
693    0
Name: duplex, Length: 694, dtype: int32
Col object:
0      0
1      0
2      0
3      0
4      0
      ..
784    0
785    0
786    0
787    0
788    0
Name: duplex, Length: 789, dtype: int32
Col object:
0      0
1      1
2      0
3      0
4      0
      ..
793    0
794    0
795    0
796    0
797    0
Name: duplex, Length: 798, dtype: int32
Col object:
0      0
1      0
2      0
3      1
4      0
      ..
205    0
206    0
207    0
208    0
209    0
Name: duplex, Length: 210, dtype: int32
Col object:
0      0
1      0
2      0
3      0
4      0
      ..
220    0
221    0
222    0
223    0
224    0
Name: duplex, Length: 225, dtype: int32
Col object:
0      0
1      0
2      0
3      0
4      0
      ..
378    0
379    0
380    0
381    0
382    0
Name: duplex, Length: 383, dtype: int32
Col object:
0      0
1      0
2      0
3      0
4      0
      ..
391    1
392    1
393    1
3

In [23]:
# NOTE(review): .iloc is 0-based, so position 38 is the 39th column, not the 38th as the message says.
# The output recorded for this cell shows a second column named 'duplex' holding None values --
# presumably a leftover duplicate column from an earlier run; confirm before exporting.
print(f"The 38th column in this South Bay df should be the new 'duplex' col:\n{sby_dict['craigslist_rental_sfbay_sby_01_05_2022'].iloc[:, 38]}")


The 38th column in this South Bay df should be the new 'duplex' col:
0      None
1      None
2      None
3      None
4      None
       ... 
689    None
690    None
691    None
692    None
693    None
Name: duplex, Length: 694, dtype: object


In [18]:
# .iloc is 0-based, so position 37 is the 38th column (one past 'single_fam' at index 36).
print(f"The 38th column in this South Bay df should be the new 'duplex' col:\n{sby_dict['craigslist_rental_sfbay_sby_01_05_2022'].iloc[:, 37]}")


The 37th column in this South Bay df should be the new 'duplex' col:
0      0
1      0
2      0
3      0
4      0
      ..
689    0
690    0
691    0
692    0
693    0
Name: duplex, Length: 694, dtype: int32


## 3.) Finally, export each respective df (ie, for each subregion) to over-write the original csv files!

In [28]:
def overwrite_csvs_with_cleaned_data_dfs(df_dict, path):
    """Write every dataframe in ``df_dict`` to ``<path>/<key>.csv``.

    Any existing file of the same name is overwritten. The row index is not
    written out.

    Args:
        df_dict: dictionary mapping a file name (without extension) to the
            dataframe to export.
        path: directory the CSV files are written to.
    """
    for name, frame in df_dict.items():
        # os.path.join picks the separator for the current OS; a hard-coded
        # "\\" only produces a valid path on Windows.
        frame.to_csv(os.path.join(path, f"{name}.csv"), index=False)


# Santa Cruz
# Write back to the directory the data was imported from: scz_data_path holds the
# same location, so reusing it keeps the import and export paths from drifting apart.
overwrite_csvs_with_cleaned_data_dfs(scz_dict, scz_data_path)

In [29]:

# Write each subregion's updated dfs back to the directory they were imported from.
# The *_data_path variables hold the same locations, so reusing them keeps the
# import and export paths from drifting apart.

# South Bay
overwrite_csvs_with_cleaned_data_dfs(sby_dict, sby_data_path)

# North Bay data
overwrite_csvs_with_cleaned_data_dfs(nby_dict, nby_data_path)

# Peninsula
overwrite_csvs_with_cleaned_data_dfs(pen_dict, pen_data_path)

# East Bay
overwrite_csvs_with_cleaned_data_dfs(eby_dict, eby_data_path)

# SF
overwrite_csvs_with_cleaned_data_dfs(sfc_dict, sfc_data_path)
