## Part 2 in the pandas df to SQL Server data pipeline

### In Part 2, we assume that we've already run the Part 1 notebook (or equivalent scripts)

##### Ie: the rental table has already been created within the craigslist database, and we have successfully inserted at least some data into the rental table!

##### If so, then we can check for the last date of inserted date--ie, MAX() of date_possted--and then filter the scraped data > the MAX()date in the SQL table, clean the data, and then insert the new data into the table

In [1]:
# imports-- file processing & json libraries
import os
import glob
import json

# data analysis libraries & SQL libraries
import numpy as np
import pandas as pd
# SQL ODBC for API connection between Python & SQL Server
import pyodbc
import sqlalchemy as sa

### Import all scraped data:

In [2]:
def recursively_import_all_CSV_and_concat_to_single_df(parent_direc, fn_regex=r'*.csv'):
    """Recursively search parent directory, and look up all CSV files.
    Then, import all CSV files to a single Pandas' df using pd.concat()"""
    path =  parent_direc # specify parent path of directories containing the scraped rental listings CSV data -- NB: use raw text--as in r'path...', or can we use the double-back slashes to escape back-slashes??
    df_concat = pd.concat((pd.read_csv(file) for file in glob.iglob(
        os.path.join(path, '**', fn_regex), 
        recursive=True)), ignore_index=True)  # os.path.join helps ensure this concatenation is OS independent
    return df_concat

## Import Dataset
# import all scraped SF bay area rental listings data
scraped_data_path = r"D:\\Coding and Code projects\\Python\\craigslist_data_proj\\CraigslistWebScraper\\scraped_data\\sfbay"

clist_rental = recursively_import_all_CSV_and_concat_to_single_df(scraped_data_path)
clist_rental.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18645 entries, 0 to 18644
Data columns (total 48 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   listing_urls             18645 non-null  object 
 1   ids                      17236 non-null  float64
 2   sqft                     13209 non-null  float64
 3   cities                   17219 non-null  object 
 4   prices                   17227 non-null  object 
 5   bedrooms                 17184 non-null  float64
 6   bathrooms                17184 non-null  object 
 7   attr_vars                17220 non-null  object 
 8   listing_descrip          17220 non-null  object 
 9   date_of_webcrawler       17236 non-null  object 
 10  kitchen                  17220 non-null  float64
 11  date_posted              17220 non-null  object 
 12  region                   18645 non-null  object 
 13  sub_region               18645 non-null  object 
 14  cats_OK               

### Determine last date (ie, MAX()) of the data stored in the rental table:

In [3]:
# Perform SQL query on the date_posted col to determine the most recent date of data stored in the table  
class SQL_Database:
    def __init__(self, path_for_SQL_config):

        with open(path_for_SQL_config,'r') as fh:
            config = json.load(fh)

        self.driver = config['driver']
        self.server = config['server']
        self.database = config['database']
        self.username = config['username']
        self.password = config['password']

        print(self.database)

    def determine_latest_date(self, sql_query):
        """Insert scraped Craigslist rental listings data (ie, the Pandas' dataframe)
        to SQL Server database 'rentals' table"""

        conn = pyodbc.connect(
        f'DRIVER={self.driver};'
        f'SERVER={self.server};'
        f'DATABASE={self.database};'
        f'UID={self.username};'
        f'PWD={self.password};'
        'Trusted_Connection=yes;'
        )

        # initialize cursor so we can execute SQL code
        cursor = conn.cursor() 

        # specify SQL query
        sql_query = sql_query 

        # perform query, and convert query results to Pandas' df
        max_date = pd.read_sql(sql_query, conn)

        conn.commit()

        cursor.close()
        conn.close()

        ## sanity check:
        
        print(f"Latest date of scraped data inserted into the SQL table:\n{max_date}")

        return max_date

# specify path to json file containing SQL configuration/username data
sql_config_path = "D:\\Coding and Code projects\\Python\\craigslist_data_proj\\CraigslistWebScraper\\SQL_config\\config.json" 

SQL_db = SQL_Database(sql_config_path)  # NB: be sure to pass in path to the json SQL configuration file so we can load in the needed username, password, and configuration data to be able to access the SQL database

# specify query to select the latest date based on date_posted:
query = "SELECT MAX(date_posted) AS latest_date FROM rental;"

latest_date = SQL_db.determine_latest_date(query)

craigslist
Latest date of scraped data inserted into the SQL table:
          latest_date
0 2021-12-14 03:57:00


In [4]:
## next, convert this latest_date to a string value, so we can use this to filter the scraped dataframe dataset
def datetime_col_to_str_of_datetime(df, datetime_col):
    """Given datetime col from pandas' DataFrame,
    transform to a string of the datetime value."""
    return df[datetime_col].head(1).astype(str).reset_index().loc[0, datetime_col] 

# specify name of df and datetime col:
df, dt_col = latest_date, 'latest_date' 
#apply function using the 2 arguments shown above
latest_date_str = datetime_col_to_str_of_datetime(df, dt_col)
# sanity check
print(f"The latest date among the scraped data stored in the SQL table is:\n{latest_date_str}")


The latest date among the scraped data stored in the SQL table is:
2021-12-14 03:57:00


In [5]:
# filter the dataframe > MAX() of latest_date stored in SQL rental table
# %%
# filter dataset since specific date
def filter_df_since_specified_date(df, target_date):
    """Filter the imported scraped dataset
    to all data since user-specified date.
    NB: specify date in YYYY-MM-DD format"""
    df = df.loc[df.date_posted > target_date]
    return df

# get all data since the latest stored data from SQL table (via the query on MAX(posted_date)) 
clist_rental_subset = filter_df_since_specified_date(clist_rental, latest_date_str)

# sanity check
print(f"The newest scraped data not stored in the SQL table is:\n{clist_rental_subset.date_posted}")


The newest scraped data not stored in the SQL table is:
5076     2021-12-16 23:56
5078     2021-12-16 23:02
5079     2021-12-16 22:39
5080     2021-12-16 22:23
5081     2021-12-16 21:59
               ...       
18460    2021-12-14 09:42
18464    2021-12-14 09:07
18465    2021-12-14 08:41
18467    2021-12-14 07:28
18469    2021-12-14 06:51
Name: date_posted, Length: 1120, dtype: object


### Perform all data cleaning and wrangling features, as in the Part 1 notebook

In [6]:
def remove_price_city_kitch_and_id_nulls(df):
    """Remove rows that do not have price, city name, kitchen, or listing ID data, as these are essential variables in this rental listings dataset."""
    return df.dropna(subset=['prices', 'ids', 'kitchen', 'cities'])

clist_rental = remove_price_city_kitch_and_id_nulls(clist_rental)

# sanity check
print(f"Remaining price, city name, kitchen, & listing id nulls: \n{clist_rental[['prices', 'cities', 'kitchen', 'ids']].isnull().sum()}")


Remaining price, city name, kitchen, & listing id nulls: 
prices     0
cities     0
kitchen    0
ids        0
dtype: int64


In [14]:
def clean_split_city_names(df, address_critera: list, neighborhood_criteria:list, split_city_delimiters: list):
    """Clean city names data in several ways:
    a.) Remove extraneous address & neighborhood data placed in the city names HTML object, such as 'Rd', 'Blvd', or 'Downtown'.
    b.) Unsplit city names data that are split via ',' & '/' delimiters.
    c.) Replace abbreviated or mispelled city names.
    d.) Remove any digits/integers within the city names data--ie, by using a '\d+' regex as the argument of str.replace() and replace it with empty strings.
    e.) Remove any city names records thast are left with merely empty strings (ie, the other steps removed all data for that given cities record).
    f.) Remove any whitespace to avoid the same city names from being treated as different entities by Pandas, Python, or SQL. 
    g.) Also, use str.capwords() to capitalize words (ie, excluding apostrophes)."""
    addr_criteria = address_critera
    # Join pipe ('|') symbols to address list so we can str.split() on any one of these criteria:
    addr_criteria = '|'.join(addr_criteria)
    # specify extraneous neighborhood criteria we should also remove from col
    nbhood_criteria = neighborhood_criteria # remove nieghborhood names as well as state abbreviation (shown on website as 'Ca') that is shown without the usual comma delimiter!
    nbhood_criteria = '|'.join(nbhood_criteria) # join pipe symbols so we can perform 'or' condition splitting on each element separated by pipes 
    # b.) specify delimiters we need to refer to un-split city names:
    split_city_delimiters = split_city_delimiters
    split_city_delim_criteria = '|'.join(split_city_delimiters) # join pipes to delimiters so we can use str.split() based on multiple 'or' criteria simultaneously
    # clean city names data by removing address delimiters, and unsplitting city names based on ',' & '\' delimiters
    df['cities'] =  df['cities'].str.split(addr_criteria).str[-1].str.replace(nbhood_criteria, '', case=True).str.lstrip()
    df['cities'] = df['cities'].str.split(split_city_delim_criteria).str[0] #unsplit city names
    # c.) replace specific abbreviated or mispelled city names:
    df = df.replace({'cities':{'Rohnert Pk':'Rohnert Park', 'Hillsborough Ca': 'Hillsborough', 'South Sf': 
    'South San Francisco', 'East San Jose':'San Jose', 'Vallejo Ca':'Vallejo', 'Westgate On Saratoga .':'San Jose',
    'Bodega':'Bodega Bay', 'Briarwood At Central Park':'Fremont', 'Campbell Ca': 'Campbell', 
    'Almaden':'San Jose', 'Ca':np.nan}}) 
    # d.) Remove digits/integers from cities column records:
    df['cities'] = df['cities'].str.replace('\d+', '')  # remove any digits by using '/d+' regex to look up digits, and then replace with empty string
    # e.) Remove any rows that have empty strings or null values for cities col (having performed the various data filtering and cleaning above)
    df = df[df['cities'].str.strip().astype(bool)] # remove rows with empty strings (ie, '') for cities col 
    df = df.dropna(subset=['cities']) # remove any remaining 'cities' null records
    # f.) Remove whitespace
    df['cities'] = df['cities'].str.strip() 
    # g.) capitalize the city names using str.capwords() 
    df['cities'] = df['cities'].str.split().apply(lambda x: [val.capitalize() for val in x]).str.join(' ')
    return df

# specify various address and street name that we need to remove from the city names 
address_criteria = ['Boulevard', 'Blvd', 'Road', 'Rd', 'Avenue', 'Ave', 'Street', 'St', ' Dr' 'Drive', 'Real', 'E Hillsdale Blvd'] 
# specify various extraneous neighborhood names such as 'Downtown' 
neighborhood_criteria = ['Downtown', 'Central/Downtown', 'North', 'California', 'Bay Area', 'St. Helena', 'St', 'nyon', 'Jack London Square', 'Walking Distance To', 'El Camino', 'Mendocino County', 'San Mateo County', 'Alameda County', 'Rio Nido Nr', 'Mission Elementary', 'Napa County', 'Golden Gate', 'Jennings', 'South Lake Tahoe', 'Tahoe Paradise', 'Kingswood Estates', 'South Bay', 'Skyline', 'San Antonio Tx', 'East Bay', 'Morton Dr']
# specify what delimiters we want to search for to unsplit the split city names data:
split_city_delimiters =  [',', '/']

# clean city names data:
clist_rental = clean_split_city_names(clist_rental, neighborhood_criteria, address_criteria, split_city_delimiters)
# sanity check
clist_rental.cities.value_counts().tail(10)




Pleasanton          1
Antioch             1
Mariner's Island    1
Benicia             1
Rockridge           1
Portola Valley      1
Livermore           1
Discovery Bay       1
Bloomsdale          1
Green Valley        1
Name: cities, dtype: int64

In [None]:
def transform_cols_to_indicators(df, list_of_cols):
    """ Transform relevant attribute columns to numeric, and specify NaNs for any missing or non-numeric data."""
    df[list_of_cols] = df[list_of_cols].astype('uint8', errors='ignore') # convert any missing data to NaN 
    print(f"Sanity check: The data types of {list_of_cols} are now: \n{df[list_of_cols].dtypes}") # sanity check on columns' data types
    return df

# specify a list of cols to convert to numeric -- # since there are many cols we want to transform to indicator variables, it's easier to simply drop the few cols that comprise str (aka, object) data 
cols_to_indicators = clist_rental.drop(columns =['ids', 'listing_urls', 'region', 'sub_region', 'cities', 'attr_vars', 'listing_descrip', 'sqft', 'prices', 'bedrooms', 'bathrooms', 'date_posted', 'date_of_webcrawler']) 
cols_to_indicators_lis = list(cols_to_indicators.columns)
cols_to_indicators = [] # free space

clist_rental = transform_cols_to_indicators(clist_rental, cols_to_indicators_lis)

cols_to_indicators_lis = [] # free space


In [None]:
# also, transform kitchen var separately, since this tends to otherwise convert to float:
clist_rental = transform_cols_to_indicators(clist_rental, 'kitchen')


In [None]:
# convert sqft to object, so that we can more easily load it into the SQL Server table
def transform_cols_to_object(df, col_to_transform):
    return df[col_to_transform].astype('object')

clist_rental['sqft'] = transform_cols_to_object(clist_rental, 'sqft')
#sanity check
clist_rental['sqft'].dtype


In [16]:
# next, remove any bathroom or bedroom nulls:
def remove_bedroom_and_br_nulls(df):
    return df.dropna(subset=['bedrooms', 'bathrooms'])

clist_rental = remove_bedroom_and_br_nulls(clist_rental)

# sanity check
print(f"Remaining bedroom & bathroom nulls: \n{clist_rental[['bedrooms', 'bathrooms']].isnull().sum()}")


Remaining bedroom & bathroom nulls: 
bedrooms     0
bathrooms    0
dtype: int64


In [12]:
# next, remove all remaining null values with Python 'None' value 
def replace_nulls_with_None_val(df):
    df = df.fillna(np.nan).replace([np.nan], [None]) # 
    return df

clist_rental = replace_nulls_with_None_val(clist_rental)

# sanity check
clist_rental.sqft

0          None
1         898.0
2         615.0
3          None
4         800.0
          ...  
18176     450.0
18177     980.0
18178    1150.0
18179    2500.0
18180    1945.0
Name: sqft, Length: 16472, dtype: object

In [14]:
# re: # of bathrooms data, transform any records containing 'shared' or 'split' to 1
# Why?: Because we can assume that any rental units comprising a 'shared' bathroom is essentially 1 bathroom
def transform_shared_and_split_to_ones(df, col_to_transform):
    """Transform any records (from given col) containing the string values of 'shared' or 'split' to a value of 1."""
    # transform col to object, so we can use Python str methods to transform the data
    df[col_to_transform] = df[col_to_transform].astype('object') 
    bedroom_replace_criteria = ['shared', 'split']
    bedroom_replace_criteria = '|'.join(bedroom_replace_criteria) # join pipe symbols so we can use str.replace() on multiple 'or' conditions simultaneously 
    return df[col_to_transform].str.replace(bedroom_replace_criteria,'1')

# clean bathrooms data by replacing the 'split' and 'shared' string values:
clist_rental['bathrooms'] = transform_shared_and_split_to_ones(clist_rental, 'bathrooms')

#sanity check
print(f"Sanity check: \n{clist_rental['bathrooms'].value_counts()}")


Sanity check: 
1      9880
2      3677
1.5     638
2.5     402
3       221
3.5      53
4        20
5        11
9+        7
7         4
5.5       3
Name: bathrooms, dtype: int64


In [18]:
# replace any ambiguous # of bathrooms data--such as '9+' with empty strings (ie, essentially nulls) 
def replace_ambiguous_data_plus_signs_with_empty_str(df, col_to_transform):
    """Replace ambiguous rows of data (ie, any containing a plus sign) for bathrooms col with empty strings"""
    return df[col_to_transform].str.replace(r'\+', '')  # use str.replace() to use a regex to search for plus signs, and in effect remove these by replacing them with empty strings 

clist_rental['bathrooms']  = replace_ambiguous_data_plus_signs_with_empty_str(clist_rental, 'bathrooms')
# sanity check
print(f"New value counts for bathrooms data--having cleaned ambiguous records: \n{clist_rental['bathrooms'].value_counts()}")

New value counts for bathrooms data--having cleaned ambiguous records: 
1      9880
2      3677
1.5     638
2.5     402
3       221
3.5      53
4        20
5        11
9         7
7         4
5.5       3
Name: bathrooms, dtype: int64


In [19]:
# transform bathrooms data to float
# Why float?: Because some listings specify half bathrooms--e.g., 1.5 denotes one-and-half bathrooms. Re: ids, integer data type not store the entire id value due to maximum (byte) storage constraints. 
def transform_cols_to_float(df, col_to_transform):
    return df[col_to_transform].astype('float')

# convert bathrooms to float:
clist_rental['bathrooms'] = transform_cols_to_float(clist_rental, 'bathrooms')    

# convert ids to float:
clist_rental['ids'] = transform_cols_to_float(clist_rental, 'ids')    

#sanity check
print(f"Sanity check on data type of ids & bathrooms data: {clist_rental[['bathrooms', 'ids']].info()}")


<class 'pandas.core.frame.DataFrame'>
Int64Index: 14916 entries, 0 to 18180
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   bathrooms  14916 non-null  float64
 1   ids        14916 non-null  float64
dtypes: float64(2)
memory usage: 349.6 KB
Sanity check on data type of ids & bathrooms data: None


In [None]:
def transform_cols_to_int(df, list_of_cols_to_num):
    """ Transform relevant attribute columns to numeric.
    NB: Since the scraped 'prices' data can contain commas, we need to use str.replace(',','') to remove them before converting to numeric."""
    df['prices'] = pd.to_numeric(df['prices'].str.replace(",",""), errors='coerce') # remove commas from prices data
    df[list_of_cols_to_num] = df[list_of_cols_to_num].astype('int')
    print(f"Sanity check: The data types of {list_of_cols_to_num} are now: \n{df[list_of_cols_to_num].dtypes}") # sanity check on columns' data types
    return df

# specify a list of cols to convert to integer
cols_to_int = clist_rental[['prices', 'bedrooms']]
cols_to_int_lis = list(cols_to_int.columns)  # convert relevant cols to list of col names

cols_to_int = [] # free space

clist_rental = transform_cols_to_int(clist_rental, cols_to_int_lis)

In [None]:
def transform_cols_to_datetime(df, col_to_convert):
    """Transform relevant column(s) to datetime using pd.to_datetime() method, and use infer_datetime_format=True to enable allow for datetime conversion using differing formats (ie, date_posted has a somewhat more precise format). """
    return pd.to_datetime(df[col_to_convert], infer_datetime_format=True)

# apply transformations to datetime for the 2 relevant cols:
clist_rental['date_of_webcrawler'] =  transform_cols_to_datetime(clist_rental,'date_of_webcrawler')
clist_rental['date_posted'] = transform_cols_to_datetime(clist_rental,'date_posted')

#sanity check
clist_rental[['date_posted', 'date_of_webcrawler']].head()


In [None]:
def deduplicate_df(df):
    """Remove duplicate rows based on listing ids"""
    return df.drop_duplicates(keep='first', subset = ['ids'])

clist_rental = deduplicate_df(clist_rental)

# sanity check -- 
clist_duplicate_ids_check = clist_rental[clist_rental.duplicated("ids", keep= False)]
print(f"There should be no remaining duplicate listing ids (ie, 0 rows): \n{clist_duplicate_ids_check.shape[0]}")  # check that number of duplicate rows is false (ie, wrt duplicate listing ids)

# free memory
clist_duplicate_ids_check = [] 

In [23]:
def remove_col_with_given_starting_name(df, col_starting_name):
    """Remove each column from df that has a given starting name substring."""
    return df.loc[:, ~df.columns.str.startswith(col_starting_name)] 

# remove 'Unnamed' columns, which might be imported errouneously via pd.read_csv()
clist_rental = remove_col_with_given_starting_name(clist_rental, 'Unnamed')

# remove listing_urls column since we do not want to store these data into the SQL Server table-- why?: a.) because listing urls are not relevent to rental prices and b.) the listing urls quickly become invalid or dead links, so we have no need to refer back to them at this stage in the webscraping project.
clist_rental = remove_col_with_given_starting_name(clist_rental, 'listing_urls')

# remove listing_descrip column since we do not want to store these data into the SQL Server table-- why?: a.) because listing urls are not relevent to rental prices and b.) the listing urls quickly become invalid or dead links, so we have no need to refer back to them at this stage in the webscraping project.
clist_rental = remove_col_with_given_starting_name(clist_rental, 'listing_descrip')


# sanity check
print(f"Sanity check--the remaining columns in the dataset are:\n{clist_rental.columns}")

Sanity check--the remaining columns in the dataset are:
 Index(['ids', 'sqft', 'cities', 'prices', 'bedrooms', 'bathrooms', 'attr_vars',
       'date_of_webcrawler', 'kitchen', 'date_posted', 'region', 'sub_region',
       'cats_OK', 'dogs_OK', 'wheelchair_accessible', 'laundry_in_bldg',
       'no_laundry', 'washer_and_dryer', 'washer_and_dryer_hookup',
       'laundry_on_site', 'full_kitchen', 'dishwasher', 'refrigerator', 'oven',
       'flooring_carpet', 'flooring_wood', 'flooring_tile',
       'flooring_hardwood', 'flooring_other', 'apt_type', 'in_law_apt_type',
       'condo_type', 'townhouse_type', 'cottage_or_cabin_type',
       'single_fam_type', 'duplex_type', 'is_furnished', 'attached_garage',
       'detached_garage', 'carport', 'off_street_parking', 'no_parking',
       'EV_charging', 'air_condition', 'no_smoking'],
      dtype='object')


### Insert the new, cleaned data into SQL rental table:

In [None]:
class SQL_Database:
    def __init__(self, path_for_SQL_config):

        with open(path_for_SQL_config,'r') as fh:
            config = json.load(fh)

        self.driver = config['driver']
        self.server = config['server']
        self.database = config['database']
        self.username = config['username']
        self.password = config['password']

        print(self.database)

    def insert_df_to_SQL_ETL(self, df):
        """Insert scraped Craigslist rental listings data (ie, the Pandas' dataframe)
        to SQL Server database 'rental' table"""

        # establish connection to SQL Server database-specify login credentials:
        conn = pyodbc.connect(
        f'DRIVER={self.driver};'
        f'SERVER={self.server};'
        f'DATABASE={self.database};'
        f'UID={self.username};'
        f'PWD={self.password};'
        'Trusted_Connection=yes;'
        )

        # initialize cursor so we can execute SQL code
        cursor = conn.cursor() 

        cursor.fast_executemany = True  # speed up data ingesting by reducing the numbers of calls to server for inserts

        # convert all variables from dataframe to str to avoid following SQL Server pyodbc error: 'ProgrammingError: ('Invalid parameter type.  param-index=2 param-type=function', 'HY105')'
        df = df.astype(str) # convert all df variables to str for ease of loading data into SQl Server table
        
        # insert scraped data from df to SQL table-- iterate over each row of each df col via .itertuples() method

        # # # NB: since there are 45 cols we will insert, we will need 45 '?' char marks  
        # q_mark_list = ['?']*45

        # # # unpack list as string, and join() commas to each '?' char
        # q_mark_str = ','.join(q_mark_list)
        
        # Get the number of needed '?' placeholders by looking up the # of cols (ie, len()) of the dataframe), and use .join() to have each question mark seprated by commas:
        q_mark_str = ','.join('?'*len(clist_rental.columns))  

        # specify INSERT INTO SQL statement--iterate over each row in df, and insert into SQL database:
        for row in df.itertuples():  # iterate over each row from df
            cursor.execute(f"""INSERT INTO rental (listing_id, sqft, city, price, bedrooms, bathrooms, attr_vars,
            date_of_webcrawler, kitchen, date_posted, region, sub_region, cats_OK, dogs_OK, wheelchair_accessible,laundry_in_bldg, no_laundry, 
            washer_and_dryer, washer_and_dryer_hookup, laundry_on_site, full_kitchen, dishwasher, refrigerator,
            oven,flooring_carpet, flooring_wood, flooring_tile, flooring_hardwood, flooring_other,apt_type, in_law_apt_type, condo_type, townhouse_type, cottage_or_cabin_type, single_fam_type, duplex_type, is_furnished, attached_garage,
            detached_garage, carport, off_street_parking, no_parking, EV_charging, air_condition, no_smoking) 
            VALUES ({q_mark_str})""",
            (row.ids,
            row.sqft,
            row.cities,
            row.prices, 
            row.bedrooms,
            row.bathrooms,
            row.attr_vars, 
            row.date_of_webcrawler,
            row.kitchen,
            row.date_posted,
            row.region,
            row.sub_region,
            row.cats_OK,
            row.dogs_OK,
            row.wheelchair_accessible,
            row.laundry_in_bldg, 
            row.no_laundry,
            row.washer_and_dryer,
            row.washer_and_dryer_hookup,
            row.laundry_on_site,
            row.full_kitchen,
            row.dishwasher,
            row.refrigerator,
            row.oven,
            row.flooring_carpet,
            row.flooring_wood,
            row.flooring_tile,
            row.flooring_hardwood,
            row.flooring_other,
            row.apt_type, 
            row.in_law_apt_type,
            row.condo_type,
            row.townhouse_type,
            row.cottage_or_cabin_type,
            row.single_fam_type, 
            row.duplex_type,
            row.is_furnished,
            row.attached_garage,
            row.detached_garage,
            row.carport,
            row.off_street_parking,
            row.no_parking,
            row.EV_charging,
            row.air_condition,
            row.no_smoking)
            )
            
        # save and commit changes to database
        conn.commit()

        # # sanity check-- ensure some data has been inserted into new SQL table
        sql_table_count_records = conn.execute("""SELECT COUNT(*) FROM rental;""").fetchall()
        print(f"The number of records stored in the SQL table is: {sql_table_count_records[0]}")     
        
        sql_query_for_record_samples = conn.execute("""SELECT TOP 3 * FROM rental;""").fetchall() # check out several of the records
        print(f"\nA few of the inserted records are: {sql_query_for_record_samples}")

        cursor.close()
        conn.close()

# specify path to json file containing SQL configuration/username data
sql_config_path = "D:\\Coding and Code projects\\Python\\craigslist_data_proj\\CraigslistWebScraper\\SQL_config\\config.json" 

SQL_db = SQL_Database(sql_config_path)  # NB: be sure to pass in path to the json SQL configuration file so we can load in the needed username, password, and configuration data to be able to access the SQL database
# Ingest data from pandas' dataframe to SQL server--data pipeline: 
SQL_db.insert_df_to_SQL_ETL(clist_rental)
