In [1]:
import json
import os
from datetime import datetime

import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split

from satellite import Satellite

In [2]:
# Shared Satellite client; later cells call s.address_to_coordinates and s.bing_pipeline on it.
s = Satellite()

# Data Cleaning

In [3]:
buildings = pd.read_csv("More Atlanta Supply.csv")

## Setting up different coordinate options

# Default Coords: "lat,lon" string built from the Latitude / Longitude columns.
buildings['coords'] = [f"{row['Latitude']},{row['Longitude']}" for ii, row in buildings.iterrows()]


# Address Coords
def _geocode_or_none(address):
    """Geocode one address with the shared Satellite client; return None if the lookup fails."""
    try:
        return s.address_to_coordinates(address)
    except Exception as exc:  # best-effort: one bad address must not abort the whole pass
        print(f"Geocoding failed for {address!r}: {exc}")
        return None


buildings['address_coords'] = [_geocode_or_none(address) for address in buildings['Address']]


# Mean Coords
def _midpoint(coords, address_coords):
    """Average two "lat,lon" strings component-wise (rounded to 8 decimals).

    Returns None when the address lookup failed or either string is not a
    parseable "lat,lon" pair, instead of raising.
    """
    if not isinstance(address_coords, str):
        # Geocoding stored None for this row, so there is nothing to average with.
        return None
    try:
        c1, c2 = coords.split(",")
        ac1, ac2 = address_coords.split(",")
        mc1 = round((float(c1) + float(ac1)) / 2, 8)
        mc2 = round((float(c2) + float(ac2)) / 2, 8)
    except ValueError:
        return None
    return f"{mc1},{mc2}"


buildings['mid_coords'] = [
    _midpoint(coords, address_coords)
    for coords, address_coords in zip(buildings['coords'], buildings['address_coords'])
]

In [140]:
def convert_df_date(date):
    """Convert a ``YYYY.Q`` year-quarter value into the date of that quarter's first day.

    Args:
        date: A value whose ``str()`` looks like ``"2024.2"`` (string or float).

    Returns:
        A zero-padded ``"YYYY-MM-01"`` string (Q1 -> 01, Q2 -> 04, Q3 -> 07,
        Q4 -> 10), or ``date`` unchanged when it is not a year-quarter value
        (missing values, already-converted strings, quarters outside 1-4).
    """
    try:
        year_text, quarter_text = str(date).split(".")
        year = int(year_text)
        quarter = int(quarter_text)
    except ValueError:
        # Wrong number of "."-separated parts, or non-numeric parts.
        return date

    if not 1 <= quarter <= 4:
        return date

    # First month of the quarter. Zero-padding keeps the result a valid date
    # that also sorts correctly when compared as a string.
    month = (quarter - 1) * 3 + 1
    return f"{year:04d}-{month:02d}-01"

def convert_bing_date(date):
    """Reformat a date string of the form ``"14 May 2021 GMT"`` as ``"2021-05-14"``.

    The input must match ``%d %b %Y %Z``; ``datetime.strptime`` raises
    ``ValueError`` otherwise.
    """
    parsed = datetime.strptime(date, "%d %b %Y %Z")
    return f"{parsed:%Y-%m-%d}"
    
# Rewrite both year-quarter columns as dates using convert_df_date.
for quarter_column in ('EstimatedCompletionYearQuarter', 'YearQuarterGroundBroken'):
    buildings[quarter_column] = buildings[quarter_column].apply(convert_df_date)

# buildings.to_csv("buildings_w_coords.csv", index=False)

In [4]:
# Load the buildings table from buildings_w_coords.csv (the file named in the commented-out to_csv call above).
buildings = pd.read_csv("buildings_w_coords.csv")

# Saving Images

In [5]:
def processDf(df, zoomLevel, suffix=None, coords_field='coords', save_metadata=False):
    """Download one aerial image per row of ``df`` into ``Images/<PropertyID>/``.

    Args:
        df: DataFrame with a ``PropertyID`` column and the column named by
            ``coords_field``.
        zoomLevel: Zoom level forwarded to ``s.bing_pipeline``.
        suffix: Text appended to the ``base`` / ``metadata`` file names.
            ``None`` means no suffix.
        coords_field: Column holding the coordinates passed to the pipeline.
        save_metadata: When True, also dump the pipeline metadata as JSON next
            to the image.

    Returns:
        A list of ``(PropertyID, error message)`` tuples, one per row that
        could not be processed.
    """
    # Without this, the default suffix would be formatted into "baseNone.jpg".
    suffix = "" if suffix is None else suffix
    failed = []
    for ii, row in df.iterrows():
        try:
            propertyId = row['PropertyID']
            property_dir = f"Images/{propertyId}"
            if not os.path.isdir(property_dir):
                print(f"Creating directory for {propertyId}")
                # exist_ok guards against a repeated PropertyID or a concurrent run;
                # makedirs also creates "Images" itself when it is missing.
                os.makedirs(property_dir, exist_ok=True)

            print(propertyId)
            coords = row[coords_field]
            try:
                image, metadata = s.bing_pipeline(coords=coords, imagerySet="Aerial", zoomLevel=zoomLevel)
                rgb_image = image.convert('RGB')
                print("Saving Image...")
                rgb_image.save(f'{property_dir}/base{suffix}.jpg')
                if save_metadata:
                    print("Saving Metadata...")
                    with open(f'{property_dir}/metadata{suffix}.json', 'w') as metadata_file:
                        json.dump(metadata, metadata_file)
            except Exception as e:
                print("Failed for reason: ", e)
                failed.append((propertyId, str(e)))
        except Exception as e:
            # Row-level problems (missing column, unwritable directory, ...):
            # report them to the caller as well instead of only printing.
            print("Failed for unknown reason")
            print(row)
            failed.append((row.get('PropertyID'), str(e)))

    return failed

In [None]:
### Default Coords
## Base = 18 zoom
# Download every property at zoom 18 and 17, once from the default coords and
# once from the geocoded address coords. Each call's failure list is kept,
# keyed by the file suffix it used, so failed downloads can be inspected or retried.
download_failures = {}
for zoom in (18, 17):
    download_failures[f"_{zoom}_base"] = processDf(
        buildings, zoom, suffix=f"_{zoom}_base", save_metadata=True
    )
    download_failures[f"_{zoom}_address"] = processDf(
        buildings, zoom, coords_field='address_coords', suffix=f"_{zoom}_address", save_metadata=True
    )

download_failures


# Testing

In [22]:
def load_files(propertyId, images=False, metadata=False):
    """List (or load) the files stored under ``Images/<propertyId>``.

    Hidden files (names starting with ".") are always ignored.

    Args:
        propertyId: Name of the sub-directory of ``Images`` to read.
        images: When True, return only the ``.jpg`` file names.
        metadata: When True (and ``images`` is False), return one
            ``{file name: parsed JSON}`` dict per ``.json`` file.

    Returns:
        A list of file names, or a list of single-entry dicts for ``metadata``.
    """
    folder = f"Images/{propertyId}"
    visible = [name for name in os.listdir(folder) if not name.startswith(".")]

    if images:
        return [name for name in visible if name.endswith(".jpg")]

    if metadata:
        loaded = []
        for name in visible:
            if not name.endswith(".json"):
                continue
            # Context manager so each handle is closed straight after parsing.
            with open(f"{folder}/{name}", "r") as handle:
                loaded.append({name: json.load(handle)})
        return loaded

    return visible

In [129]:
import base64
import requests
from dotenv import load_dotenv, find_dotenv

# Load the .env file located by find_dotenv, then read the OpenAI key from the environment.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)
openai_api_key = os.environ.get("OPENAI_API_KEY")


def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def label_image(image_path, model="gpt-4-turbo", timeout=60):
    """Ask the OpenAI chat-completions API to classify a satellite image into stages 1-5.

    Args:
        image_path: Path of the JPEG to classify.
        model: Model name sent in the request payload.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The model's reply with surrounding whitespace removed. The prompt asks
        for just the category number, so this is normally "1" ... "5".

    Raises:
        RuntimeError: If no API key was loaded into ``openai_api_key``.
        requests.RequestException: On timeouts, connection errors, or a
            non-2xx HTTP status.
    """
    if not openai_api_key:
        raise RuntimeError("OPENAI_API_KEY is not set; cannot label images")

    categories = """
    1:
    - Undeveloped land
    - Free of structures
    - No sign of land having been cleared or bulldozed
    - Vegetation or sand might appear to be unmoved

    2:
    - Ground Broken
    - Free of structures
    - Discernable movement of land with possible temporary roads
    - May be construction vehicles present
    - Vegetation cleared

    3:
    - Concrete Pad:
    - Land contains a large flat surface of poured concrete
    - The land around that pad is still unfinished surface usually dirt and free of vegetation
    - May be surrounded by construction vehicles
    
    4:
    - Framing Going up:
    - Contains many of the same elements of concrete pad
    - Instead of a large flat surface you begin to see more variation in pixel color as the complexity of
    the structure begins to take shape
    - Walls may be casting shadows depending on the time of day of the satellite photo.
    - There may be a roof present but there is no paved parking lot present.
    
    5:
    - Near completion or completed:
    - A polished appearance.
    - Structure has a roof (often white but not always
    - Freshly paved parking lot. May have vehicles in the lot or trailers (containers) backed up along at
    least one side of the building.
    - May have attractive plantings to demarcate portions of the parking lot.

"""
    # Named output_format rather than format so the builtin is not shadowed.
    output_format = "Reply with just the number of the correct category. For example, if the correct category is 1, reply with 1. No other text is allowed"

    message = f"""
    This image is a satellite image view of an address. Please label the image based on the following categories:
    The hardest categories to define accurately are 2, 3, and 4. 1 and 5 are usually easy to define.
    Spend extra time on 2, 3, and 4. Attention to detail is critical, follow the category information provided as close as possible.

    {categories}

    Follow this output:
    {output_format}
    """
    base64_image = encode_image(image_path)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}"
    }

    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": message
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                            "detail": "low"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 300
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,  # without a timeout a stalled connection blocks the loop forever
    )
    # Turn API errors (401, 429, 5xx, ...) into a clear HTTP error instead of
    # an opaque KeyError on the missing 'choices' field.
    response.raise_for_status()
    label = response.json()['choices'][0]['message']['content']
    # Callers compare the label against strings such as "5"; strip stray whitespace.
    return label.strip()

# Looking for images after ground broke

In [None]:
def _date_key(date_text):
    """Turn "YYYY-M-D" (zero-padded or not) into a (year, month, day) int tuple.

    Comparing these tuples orders dates numerically. Comparing the raw strings
    does not, because the ground-broken dates are not always zero-padded
    ("2023-10-01" sorts before "2023-3-01" as text).
    """
    return tuple(int(part) for part in str(date_text).split("-"))


imgs_after_ground_broken = []
for ii, row in buildings.iterrows():
    groundBrokenDate = row['YearQuarterGroundBroken']
    try:
        ground_broken_key = _date_key(groundBrokenDate)
    except ValueError as exc:
        # Missing or unconverted ground-broken date: nothing to compare against.
        print("Failed for ", row['PropertyID'])
        print(f"Unusable ground broken date {groundBrokenDate!r}: {exc}")
        print("\n")
        continue

    files = load_files(row['PropertyID'], metadata=True)
    for file in files:
        try:
            img_date = list(file.values())[0]['resourceSets'][0]['resources'][0]['vintageEnd']
            img_date = convert_bing_date(img_date)
            if _date_key(img_date) > ground_broken_key:
                print("Image is after ground broken date")
                print(img_date, groundBrokenDate)
                print(row['PropertyID'])
                print(file)
                imgs_after_ground_broken.append((row['PropertyID'], list(file.keys())[0]))
        except (KeyError, IndexError, TypeError, ValueError) as exc:
            # Metadata without the expected structure, or an unparseable date.
            print("Failed for ", row['PropertyID'])
            print(file)
            print(exc)
            print("\n")

after_ground_broke = pd.DataFrame(imgs_after_ground_broken, columns = ['propertyID', 'metadata_file'])


def metadata_to_img_file(metadata_file):
    """Derive the image file name that corresponds to a metadata file name.

    Takes everything before the first ".json", swaps "metadata" for "base",
    and appends ".jpg" (e.g. "metadata_18_base.json" -> "base_18_base.jpg").
    """
    stem = metadata_file.partition(".json")[0]
    return stem.replace("metadata", "base") + ".jpg"

# Pair every metadata file with the image file saved alongside it.
after_ground_broke['image_file'] = after_ground_broke['metadata_file'].map(metadata_to_img_file)

In [None]:
# Label each post-ground-breaking image and store the answer in the 'label' column.
for idx, candidate in after_ground_broke.iterrows():
    property_id = candidate['propertyID']
    image_name = candidate['image_file']
    label = label_image(f"Images/{property_id}/{image_name}")
    print(property_id, image_name, label)
    after_ground_broke.loc[idx, 'label'] = label

In [134]:
# Checking the results
from PIL import Image
# Open every image that was labelled "5" for a visual spot check.
completed_rows = after_ground_broke[after_ground_broke['label'] == "5"]
for _, completed in completed_rows.iterrows():
    file_path = f"Images/{completed['propertyID']}/{completed['image_file']}"
    Image.open(file_path).show()

### Good results from OpenAI

# Tagging every image

In [144]:
## Set up a dataframe to be tagged

# The four image files the download step writes per property (zoom 18/17 x address/base).
base_names = [
    "base_18_address.jpg",
    "base_18_base.jpg",
    "base_17_address.jpg",
    "base_17_base.jpg"
]

# One (propertyID, image path) row per property and expected file.
image_list = [
    (propertyID, f"Images/{propertyID}/{bn}")
    for propertyID in buildings['PropertyID'].unique()
    for bn in base_names
]

image_df = pd.DataFrame(image_list, columns=['propertyID', 'image_file'])
image_df.head(3)

Unnamed: 0,propertyID,image_file
0,1652888862,Images/1652888862/base_18_address.jpg
1,1652888862,Images/1652888862/base_18_base.jpg
2,1652888862,Images/1652888862/base_17_address.jpg


In [145]:
# Start every image with an empty-string label; the labelling loop selects rows where label == "".
image_df['label'] = ""

In [152]:
# Label every image that has no label yet. Progress is checkpointed to
# image_df.csv every CHECKPOINT_EVERY processed images, and once more when the
# loop ends or is interrupted, so rate limits or crashes lose little work.
CHECKPOINT_EVERY = 100

pending = image_df[image_df['label'] == ""]
try:
    for processed, (ii, row) in enumerate(pending.iterrows(), start=1):
        try:
            label = label_image(row['image_file'])
            print(row['propertyID'], row['image_file'], label)
            image_df.loc[ii, 'label'] = label
        except Exception as exc:
            # Mark the row so it can be filtered out or retried later, and keep going.
            print("Failed for ", row['propertyID'], row['image_file'])
            print("Reason: ", exc)
            image_df.loc[ii, 'label'] = "Failed"

        # Count processed rows rather than testing the index label, so the
        # checkpoint cadence does not depend on which rows are still unlabelled.
        if processed % CHECKPOINT_EVERY == 0:
            image_df.to_csv("image_df.csv", index=False)
finally:
    # Always persist the tail of the run, including after a manual interrupt.
    image_df.to_csv("image_df.csv", index=False)

1652918829 Images/1652918829/base_18_base.jpg 1
1652918829 Images/1652918829/base_17_address.jpg 5
1652918829 Images/1652918829/base_17_base.jpg 2
1652887799 Images/1652887799/base_18_address.jpg 2
1652887799 Images/1652887799/base_18_base.jpg 2
1652887799 Images/1652887799/base_17_address.jpg 2
1652887799 Images/1652887799/base_17_base.jpg 2
1652915194 Images/1652915194/base_18_address.jpg 1
1652915194 Images/1652915194/base_18_base.jpg 1
1652915194 Images/1652915194/base_17_address.jpg 2
1652915194 Images/1652915194/base_17_base.jpg 1
1652901405 Images/1652901405/base_18_address.jpg 3
1652901405 Images/1652901405/base_18_base.jpg 3
1652901405 Images/1652901405/base_17_address.jpg 2
1652901405 Images/1652901405/base_17_base.jpg 4
1652891217 Images/1652891217/base_18_address.jpg 2
1652891217 Images/1652891217/base_18_base.jpg 2
1652891217 Images/1652891217/base_17_address.jpg 2
1652891217 Images/1652891217/base_17_base.jpg 2
1652895806 Images/1652895806/base_18_address.jpg 1
1652895806

# Data Augmentation

In [154]:
# Keep only the rows whose labelling request did not end in "Failed".
labelled_ok = image_df['label'].ne("Failed")
image_df = image_df[labelled_ok]

In [157]:
from augmentor import Augmentor

# One Augmentor instance; the augmentation loop calls aug.augment_image on it.
aug = Augmentor()

In [167]:
# Augment every labelled image 20 times. An image whose "_1.jpg" variant is
# already on disk is treated as done and skipped, so the cell can be re-run.
new_files = []
for _, source in image_df.iterrows():
    source_path = source['image_file']
    first_variant = f"{source_path.split('.jpg')[0]}_1.jpg"
    if os.path.isfile(first_variant):
        print("Already ran on this image")
        continue

    resp = aug.augment_image(source_path, num_augentations=20, label=source['label'])
    new_files = new_files + resp

In [176]:
## Add all new files to the image_df (if not already there)
# NOTE(review): assumes the augmentor names its outputs <image>_0.jpg ... <image>_19.jpg.
# Confirm against Augmentor.augment_image (the previous cell checks for _1.jpg).
N_AUGMENTATIONS = 20

# Set membership keeps the "already listed?" check O(1) per candidate and also
# prevents adding the same path twice within this pass.
known_files = set(image_df['image_file'])

new_files = []
for ii, row in image_df.iterrows():
    # aug_idx is deliberately distinct from the outer row index ii.
    for aug_idx in range(N_AUGMENTATIONS):
        new_file = row['image_file'].replace(".jpg", f"_{aug_idx}.jpg")
        if new_file not in known_files:
            known_files.add(new_file)
            new_files.append({'propertyID': row['propertyID'], 'image_file': new_file, 'label': row['label']})


# Explicit columns keep the frame well-formed even when nothing new was found.
files_to_add = pd.DataFrame(new_files, columns=['propertyID', 'image_file', 'label'])
# ignore_index gives the combined frame unique row labels.
image_df = pd.concat([image_df, files_to_add], axis = 0, ignore_index=True)

image_df.to_csv("image_df.csv", index=False)

In [18]:
# Reload the image/label table from image_df.csv.
image_df = pd.read_csv("image_df.csv")

In [39]:
# Seed for sampling and splitting so the cell gives the same result on every run.
SEED = 42

image_df = pd.read_csv("image_df.csv")
# Keep only rows whose image exists on disk; copy so the label assignment below
# writes to an independent frame rather than a slice.
image_df = image_df[image_df['image_file'].apply(os.path.isfile)].copy()
image_df['label'] = image_df['label'].apply(int).apply(str)

# Balance the classes: up to sample_size rows per label. Capping at the class
# size avoids the ValueError .sample raises when a class has fewer rows.
sample_size = 1000
samples = []
for label in ['1', '2', '3', '4', '5']:
    class_rows = image_df[image_df['label'] == label]
    samples.append(class_rows.sample(min(sample_size, len(class_rows)), random_state=SEED))
data = pd.concat(samples, axis=0)


# Split by property, not by image, so all images of one property (including its
# augmented copies) land on the same side of the split.
# train_test_split comes from the notebook's top import cell.
pid_train, pid_test = train_test_split(
    data['propertyID'].unique(), train_size=0.8, test_size=0.2, shuffle=True, random_state=SEED
)
train = data[data['propertyID'].isin(pid_train)]
test = data[data['propertyID'].isin(pid_test)]

## Training a model

In [None]:
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet import preprocess_input
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam


SEED = 42
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 32
# Pin the label -> output-index mapping so output index i always means stage i + 1,
# even if a label happens to be absent from a split.
CLASS_NAMES = ['1', '2', '3', '4', '5']

image_df = pd.read_csv("image_df.csv")
image_df = image_df[image_df['image_file'].apply(os.path.isfile)].copy()
image_df['label'] = image_df['label'].apply(int).apply(str)

# Split by property so every image of a property stays on one side of the split.
# The frames are built from image_df (the table loaded above), so this cell does
# not depend on a `data` variable left over from another cell.
pid_train, pid_test = train_test_split(
    image_df['propertyID'].unique(), train_size=0.8, test_size=0.2, shuffle=True, random_state=SEED
)
train = image_df[image_df['propertyID'].isin(pid_train)]
test = image_df[image_df['propertyID'].isin(pid_test)]


# Normalise with ResNet's own preprocess_input, which is also what the
# notebook's inference helper applies to images before predicting.
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    validation_split=0.2,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True
)

# Same normalisation and validation_split, but no augmentation: validation and
# test images should be scored as they are.
eval_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    validation_split=0.2
)

flow_kwargs = dict(
    x_col='image_file',
    y_col='label',
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    classes=CLASS_NAMES
)

train_generator = train_datagen.flow_from_dataframe(
    dataframe=train,
    subset='training',
    seed=SEED,
    **flow_kwargs
)

val_generator = eval_datagen.flow_from_dataframe(
    dataframe=train,
    subset='validation',
    shuffle=False,
    **flow_kwargs
)

# Load the pre-trained model
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Add a new classification layer
x = base_model.output
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(len(CLASS_NAMES), activation='softmax')(x)
model = tf.keras.Model(inputs=base_model.input, outputs=x)

# Freeze the layers of the pre-trained model
for layer in base_model.layers:
    layer.trainable = False

# Compile the model (`learning_rate` replaces the deprecated `lr` argument)
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
epochs = 10
history = model.fit(
    train_generator,
    steps_per_epoch=train_generator.samples // train_generator.batch_size,
    validation_data=val_generator,
    validation_steps=val_generator.samples // val_generator.batch_size,
    epochs=epochs
)

# Evaluate the model
loss, accuracy = model.evaluate(val_generator)
print(f'Validation loss: {loss:.4f}')
print(f'Validation accuracy: {accuracy:.4f}')

# Score the held-out properties as well; they were never seen during training.
if len(test) > 0:
    test_generator = ImageDataGenerator(preprocessing_function=preprocess_input).flow_from_dataframe(
        dataframe=test,
        shuffle=False,
        **flow_kwargs
    )
    test_loss, test_accuracy = model.evaluate(test_generator)
    print(f'Test loss: {test_loss:.4f}')
    print(f'Test accuracy: {test_accuracy:.4f}')

# Save the fine-tuned model
model.save('fine_tuned_model.h5')

In [104]:
from PIL import Image
import numpy as np
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.resnet import preprocess_input
from tensorflow.keras.models import Model

def preprocess_for_resnet(pil_image, target_size=(224, 224)):
    """Turn a PIL image into a single-image batch ready for a ResNet model.

    Args:
        pil_image: PIL image in any mode; it is converted to RGB first.
        target_size: (width, height) the image is resized to.

    Returns:
        Array with a leading batch axis of length 1, normalised with the Keras
        ResNet ``preprocess_input``. The model it is fed to must have been
        trained on inputs normalised the same way.
    """
    # The model expects exactly three channels. Palette, greyscale or RGBA
    # images would otherwise produce an array with the wrong channel count.
    rgb_image = pil_image.convert('RGB')
    resized_image = rgb_image.resize(target_size)
    image_array = img_to_array(resized_image)
    preprocessed_image = preprocess_input(image_array)
    image_batch = np.expand_dims(preprocessed_image, axis=0)
    return image_batch

def predict_with_resnet(pil_image, model):
    """Run ``model.predict`` on one PIL image after ``preprocess_for_resnet``.

    Returns whatever ``model.predict`` returns for the one-image batch.
    """
    return model.predict(preprocess_for_resnet(pil_image))


# Predictions

In [46]:
# Reload the model saved in fine_tuned_model.h5 and print its layer summary.
new_model = tf.keras.models.load_model('fine_tuned_model.h5')

new_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 conv1_pad (ZeroPadding2D)   (None, 230, 230, 3)          0         ['input_2[0][0]']             
                                                                                                  
 conv1_conv (Conv2D)         (None, 112, 112, 64)         9472      ['conv1_pad[0][0]']           
                                                                                                  
 conv1_bn (BatchNormalizati  (None, 112, 112, 64)         256       ['conv1_conv[0][0]']          
 on)                                                                                        

In [3]:
# Reload the buildings table and keep only the rows in the Atlanta market.
buildings = pd.read_csv("buildings_w_coords.csv")
in_atlanta = buildings['MarketCode'] == "ATLANT"
atl = buildings[in_atlanta]

In [None]:
## Getting data and predictions

s = Satellite()

# Fetch one zoom-17 aerial image per Atlanta row, keyed by PropertyID
# (a repeated PropertyID keeps the image fetched last).
img_data = {}
for _, prop in atl.iterrows():
    aerial, _metadata = s.bing_pipeline(prop['coords'], imagerySet="Aerial", zoomLevel=17)
    img_data[prop['PropertyID']] = aerial

# Predicted stage = position of the highest score in the model output, counted from 1.
preds = []
for property_id, aerial in img_data.items():
    scores = predict_with_resnet(aerial, new_model)[0]
    best_position = list(scores).index(max(scores))
    preds.append((property_id, best_position + 1))

stage_df = pd.DataFrame(preds, columns=['PropertyID', 'predicted_stage'])
atl = atl.merge(stage_df, how='left', on='PropertyID')
atl['predicted_stage'] = atl['predicted_stage'].astype(int)

In [13]:
## Calculating quarters left

# Quarter the projection starts from, in "YYYY.Q" form.
current_quarter = "2024.2"

# Upper bound on Available_sf -> quarters a full build takes.
# Entries must stay in ascending order of the bound; the first bound that is
# not below the building's size wins.
qtc_mapping = {
    100000: 2,
    300000: 3,
    600000: 4,
    1000000: 5,
    9999999: 6
}

# Predicted construction stage -> fraction of the build still remaining.
stage_perc_left = {
    1: 1,
    2: 0.75,
    3: 0.5,
    4: 0.25,
    5: 0
}


def _quarters_to_complete(available_sf):
    """Return the full-build duration (in quarters) for a building of this size.

    Sizes above the largest bound fall back to the longest duration instead of
    reusing whatever the previous row produced. A missing size (NaN) matches
    the first bucket, as before.
    """
    for upper_sf, quarters in qtc_mapping.items():
        if not (upper_sf < available_sf):
            return quarters
    return max(qtc_mapping.values())


def _add_quarters(year_quarter, quarters_ahead):
    """Advance a "YYYY.Q" string by ``quarters_ahead`` quarters."""
    year, quarter = map(int, year_quarter.split('.'))
    total_quarters = quarter + quarters_ahead - 1
    new_year = year + (total_quarters // 4)
    new_quarter = (total_quarters % 4) + 1
    return f"{new_year}.{new_quarter}"


for ii, row in atl.iterrows():
    qtc = _quarters_to_complete(row['Available_sf'])
    perc_left = stage_perc_left[int(row['predicted_stage'])]
    # NOTE(review): round() uses banker's rounding (0.5 -> 0, 1.5 -> 2, 2.5 -> 2);
    # confirm this is the intended way to turn fractional quarters into whole ones.
    quarters_left = round(qtc*perc_left)

    atl.loc[ii, 'PredictedCompletionTime'] = _add_quarters(current_quarter, quarters_left)

In [50]:
## Updating dat file

DAT_PATH = 'Atlanta_supply_dat.xlsx - Atlanta_dat.csv'
atl_dat = pd.read_csv(DAT_PATH)

# Sum only Size_sf per predicted completion quarter; summing the whole frame
# would also try to aggregate the text columns.
projections = atl.groupby('PredictedCompletionTime')['Size_sf'].sum().reset_index()

# Running total of completed square footage, in the order groupby returns the quarters.
cumulative_completions = projections['Size_sf'].cumsum()

projections['MarketCode'] = "ATLANT"
projections['MarketName'] = "Atlanta"

projections['YearQuarter'] = projections['PredictedCompletionTime']
# Stock grows from the last recorded stock by everything completed so far.
projections['Stock_sf'] = cumulative_completions + atl_dat["Stock_sf"].iloc[-1]
projections['Completions_sf'] = projections['Size_sf']
# Whatever has not been completed by a given quarter is still under construction.
projections["Stock_Under_Construction_sf"] = projections['Size_sf'].sum() - cumulative_completions
projections['under_construction_as_a_percentage_of_Stock'] = projections['Stock_Under_Construction_sf'] / projections['Stock_sf']

# NOTE(review): this overwrites the file it read from, so running the cell twice
# appends the projections twice. Consider writing to a separate output file.
updated_dat = pd.concat(
    [atl_dat, projections.drop(['PredictedCompletionTime', 'Size_sf'], axis=1)],
    axis=0,
    ignore_index=True
)
# index=False stops the row index being written as an extra unnamed column.
updated_dat.to_csv(DAT_PATH, index=False)