In [None]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import os
import seaborn as sns

In [116]:
leases = pd.read_csv('datafest-data-read-only/Leases.csv')
priceANDavailability = pd.read_csv('datafest-data-read-only/Price_and_Availability_Data.csv')
majorMarket = pd.read_csv('datafest-data-read-only/Major_Market_Occupancy_Data-revised.csv')
unemployment = pd.read_csv('datafest-data-read-only/Unemployment.csv')
main_without_majorMarketInfo2018 = pd.merge(leases, priceANDavailability, on=['year', 'quarter', 'market'], how='inner')
main_with_majorMarketInfo2020 = pd.merge(main_without_majorMarketInfo2018, majorMarket, on=['year', 'quarter', 'market'], how='inner')

columns_to_drop = [
    "building_name", "building_id", "address", "zip", 
    "direct_available_space", "direct_availability_proportion", 
    "direct_internal_class_rent", "direct_overall_rent", 
    "sublet_available_space", "sublet_availability_proportion", 
    "sublet_internal_class_rent", "sublet_overall_rent", "company_name", "internal_market_cluster"
    
]

# Keep only columns that are present
existing_cols = [col for col in columns_to_drop if col in main_without_majorMarketInfo2018.columns]

# Drop safely
main_without_majorMarketInfo2018_drop = main_without_majorMarketInfo2018.drop(existing_cols, axis=1)
main_without_majorMarketInfo2018_drop.info()

main_without_majorMarketInfo2018_drop.drop(columns = "internal_industry").dropna().info()

x = 'internal_class_x'
y = 'internal_class_y'
diff_count = (main_without_majorMarketInfo2018_drop[x] != main_without_majorMarketInfo2018_drop[y]).sum()
print(f"Number of differing values of {x}: {diff_count}")

cols_to_drop = [col for col in main_without_majorMarketInfo2018_drop.columns if col.endswith('_y')]
main_without_majorMarketInfo2018_drop.drop(columns=cols_to_drop, inplace=True)
main_without_majorMarketInfo2018_drop.rename(
    columns={col: col[:-2] for col in main_without_majorMarketInfo2018.columns if col.endswith('_x')},
    inplace=True
)
main_without_majorMarketInfo2018_drop = main_without_majorMarketInfo2018_drop[main_without_majorMarketInfo2018_drop['leasedSF'] >= 10000]
main_without_majorMarketInfo2018_drop.info()


columns_to_drop = [
    "building_name", "building_id", "address", "zip", 
    "direct_available_space", "direct_availability_proportion", 
    "direct_internal_class_rent", "direct_overall_rent", 
    "sublet_available_space", "sublet_availability_proportion", 
    "sublet_internal_class_rent", "sublet_overall_rent", "company_name", "internal_market_cluster", "costarID"
    
]

# Keep only columns that are present
existing_cols = [col for col in columns_to_drop if col in main_with_majorMarketInfo2020.columns]

# Drop safely
main_with_majorMarketInfo2020_drop = main_without_majorMarketInfo2018.drop(existing_cols, axis=1)
cols_to_drop = [col for col in main_with_majorMarketInfo2020_drop.columns if col.endswith('_y')]
main_with_majorMarketInfo2020_drop.drop(columns=cols_to_drop, inplace=True)
main_with_majorMarketInfo2020_drop.rename(
    columns={col: col[:-2] for col in main_with_majorMarketInfo2020_drop.columns if col.endswith('_x')},
    inplace=True
)
main_with_majorMarketInfo2020_drop = main_with_majorMarketInfo2020_drop[main_with_majorMarketInfo2020_drop['leasedSF'] >= 10000]
main_with_majorMarketInfo2020_drop.info()

main_with_majorMarketInfo2020_drop = main_with_majorMarketInfo2020_drop.dropna()
main_with_majorMarketInfo2020_drop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291088 entries, 0 to 291087
Data columns (total 44 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   year                              291088 non-null  int64  
 1   quarter                           291088 non-null  object 
 2   monthsigned                       289722 non-null  float64
 3   market                            291088 non-null  object 
 4   region                            291088 non-null  object 
 5   city                              291088 non-null  object 
 6   state                             291088 non-null  object 
 7   internal_submarket                291088 non-null  object 
 8   internal_class_x                  291082 non-null  object 
 9   leasedSF                          291088 non-null  float64
 10  internal_industry                 41176 non-null   object 
 11  transaction_type                  290340 non-null  o

In [120]:
from tqdm import tqdm
import requests
import time

# === CONFIG ===
API_KEY = "AIzaSyDq2ZdVgLozW_Ev6cxsqno4Q5i5_FNY0Ko"  # Replace with your actual key
INPUT_FILE = main_with_majorMarketInfo2020_drop   # Replace with your input file
OUTPUT_FILE = "geocoded_output.csv"
BATCH_SIZE = 5000  # Adjust based on API quota/speed
SLEEP_TIME = 0.002  # 200ms del0ay between requests

# === FUNCTIONS ===
def geocode_address(address):
    if pd.isna(address) or address.strip() == "":
        return None, None
    try:
        url = f"https://maps.googleapis.com/maps/api/geocode/json?address={requests.utils.quote(address)}&key={API_KEY}"
        response = requests.get(url)
        data = response.json()
        if data["status"] == "OK":
            location = data["results"][0]["geometry"]["location"]
            return location["lat"], location["lng"]
        else:
            return None, None
    except Exception as e:
        return None, None

# === MAIN ===
df = main_with_majorMarketInfo2020_drop
df["latitude"] = None
df["longitude"] = None

for i in tqdm(range(len(df))):
    if pd.notna(df.loc[i, "latitude"]) and pd.notna(df.loc[i, "longitude"]):
        continue  # already filled

    address = df.loc[i, "address"]
    lat, lng = geocode_address(address)
    df.loc[i, "latitude"] = lat
    df.loc[i, "longitude"] = lng

    time.sleep(SLEEP_TIME)  # avoid rate limit

    if i % BATCH_SIZE == 0 and i > 0:
        df.to_csv(OUTPUT_FILE, index=False)  # save progress

# Final save
df.to_csv(OUTPUT_FILE, index=False)
print("Done. Results saved to:", OUTPUT_FILE)


  0%|                                                                              | 0/18408 [00:00<?, ?it/s]


KeyError: 0