# Import

In [None]:
# Install into the running kernel's environment; pinned to the version this notebook was run with.
%pip install -q googlemaps==4.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting googlemaps
  Downloading googlemaps-4.6.0.tar.gz (31 kB)
Building wheels for collected packages: googlemaps
  Building wheel for googlemaps (setup.py) ... [?25l[?25hdone
  Created wheel for googlemaps: filename=googlemaps-4.6.0-py3-none-any.whl size=38554 sha256=71ae91838fa9a8df8ec58f5cc57f1f12f3fef0472d63f31f1cf10df0d1738319
  Stored in directory: /root/.cache/pip/wheels/80/db/c0/6d958585fa97b20e250bf437acf7e6e715b4809c2dd4e55367
Successfully built googlemaps
Installing collected packages: googlemaps
Successfully installed googlemaps-4.6.0


In [None]:
import datetime
import os
from multiprocessing import Process

import geopy.distance
import googlemaps
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive/; the dataset is read from under this
# mount point further down.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Raw shelter records, read from the mounted Google Drive.
animalDF = pd.read_excel("/content/drive/My Drive/animal_shelter/data/ampa_wmt_rto_hackathon_july-22_data.xlsx")

# The Google Maps API key must never live in the notebook source: anyone with
# read access to the file (or its history) could use it. It is read from the
# environment instead. NOTE(review): the key that used to be hardcoded here
# should be treated as leaked and revoked.
_gmaps_api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not _gmaps_api_key:
    raise RuntimeError(
        "GOOGLE_MAPS_API_KEY is not set; put the key in the environment "
        "(for example with getpass in a scratch cell) before running this cell."
    )
gmaps = googlemaps.Client(key=_gmaps_api_key)

# Multiprocess Cleaning + Weather Features

In [None]:
# Smoke test: geocode one known address and display the "location" entry of
# the first match (the same entry getLngLat reads "lng"/"lat" from).
geocode_result = gmaps.geocode('1600 Amphitheatre Parkway, Mountain View, CA')
geocode_result[0]["geometry"]["location"]

In [None]:
def getLngLat(address):
    """Geocode ``address`` with the module-level ``gmaps`` client.

    Returns:
        ``(lng, lat)`` taken from the first geocoding match, or
        ``(None, None)`` when the lookup raises or yields no match. Failures
        are reported on stdout and never propagated.
    """
    try:
        location = gmaps.geocode(address)[0]["geometry"]["location"]
        coords = (location["lng"], location["lat"])
    except Exception as err:
        # Best effort: any failure (request error, empty result, unexpected
        # payload) is logged and mapped to "no coordinates".
        print("address extract error|", "Address：", address, "|", err)
        coords = (None, None)
    return coords


getLngLat('1600 Amphitheatre Parkway, Mountain View, CA')

In [None]:
def process_lnglat(name, df, loc_df):
    """Geocode, measure and enrich one partition of records, then save it as CSV.

    Steps, in order:
      1. Re-geocode ``found_address`` and ``outcome_address`` with
         ``getLngLat`` and, when a lookup succeeds, overwrite the matching
         ``found_lng``/``found_lat`` or ``outcome_lng``/``outcome_lat`` pair.
      2. Store the geodesic distance in miles between the two points in a
         ``distance`` column; values above the configured maximum are replaced
         by the rounded mean of the remaining values.
      3. Copy ``temp``, ``humidity``, ``winddir`` and ``visibility`` from the
         shelter's weather table for the row's ``intake_date`` (``-1`` when no
         matching weather row exists).
      4. Drop rows whose ``temp`` is ``-1`` and write ``{name}.csv``.

    Args:
        name: Output path without the ``.csv`` extension; also used as the
            progress-bar label.
        df: Records to process. Must contain ``found_address``,
            ``outcome_address``, ``found_lng``, ``found_lat``,
            ``outcome_lng``, ``outcome_lat``, ``shelter_id`` and
            ``intake_date``. The passed frame is not modified.
        loc_df: Mapping ``shelter_id -> weather DataFrame``; each weather
            frame is filtered on its ``datetime`` column using
            ``str(intake_date)``.

    Returns:
        None. The result is written to ``{name}.csv``.
    """
    distance_error_value = 99999          # marks rows whose distance could not be computed
    maxColsDict = {'distance': 10000}     # column -> largest value considered plausible
    weather_attrs = ['temp', 'humidity', 'winddir', 'visibility']

    # Work on a re-indexed copy: the row loops below use positions as labels,
    # and the caller's frame (often a slice of a larger one) stays untouched.
    df = df.reset_index(drop=True)

    # 1.1 correct coordinates
    for row in tqdm(range(df.shape[0]), desc=f'{name}:'):
        for address_col in ['found_address', 'outcome_address']:
            address = df.loc[row, address_col]
            if isinstance(address, str):
                # Drop the last space-separated token before geocoding.
                # NOTE(review): presumably a trailing zip/country token - confirm.
                address = " ".join(address.split(' ')[:-1])

            lng, lat = getLngLat(address)

            if lng is not None and lat is not None:
                # 'found_address' -> found_lng/found_lat,
                # 'outcome_address' -> outcome_lng/outcome_lat.
                # (Both used to be written to the found_* columns.)
                side = address_col.split('_')[0]
                df.loc[row, f'{side}_lng'] = lng
                df.loc[row, f'{side}_lat'] = lat

    # 1.2 get distance
    distances = []
    for row in range(df.shape[0]):
        coords_1 = (df.loc[row, "found_lat"], df.loc[row, "found_lng"])
        coords_2 = (df.loc[row, "outcome_lat"], df.loc[row, "outcome_lng"])
        try:
            distance = geopy.distance.geodesic(coords_1, coords_2).miles
        except Exception as e:
            print(f'distance cal error, because:{e}')
            distance = distance_error_value
        distances.append(distance)
    df["distance"] = distances

    # If a column's value exceeds its configured maximum, replace it with the
    # rounded mean of the in-range values. Skipped when nothing is out of range
    # or when there is no in-range value to average (round(NaN) would raise).
    for col, max_val in maxColsDict.items():
        too_large = df[col] > max_val
        if too_large.any():
            replacement = df.loc[~too_large, col].mean()
            if pd.notna(replacement):
                df.loc[too_large, col] = round(replacement)

    # 1.3 get weather info
    for row in range(df.shape[0]):
        loc, date = df.loc[row, ['shelter_id', 'intake_date']]

        this_loc_df = loc_df[loc]
        meta_info = this_loc_df[this_loc_df['datetime'] == str(date)]

        for attr in weather_attrs:
            try:
                df.loc[row, attr] = meta_info[attr].tolist()[0]
            except Exception as e:
                df.loc[row, attr] = -1
                print(f'no this {date} info, with  {e}')

    # An empty partition never enters the loop above; create the weather
    # columns anyway so every partition file has the same header.
    for attr in weather_attrs:
        if attr not in df.columns:
            df[attr] = -1

    # Keep only rows for which weather data was found.
    df = df[df['temp'] != -1]
    df.to_csv(f'{name}.csv', index=False)

In [None]:
def concat_data(save_name, process_num, prefix='process_', remove_sub=True):
    """Merge the per-process CSV parts into a single output file.

    Reads ``{prefix}0.csv`` ... ``{prefix}{process_num - 1}.csv``, stacks them
    row-wise keeping only the columns common to all parts, and prints the
    running shape after each part is appended.

    Args:
        save_name: Output path; written as Excel when it ends with ``xlsx``,
            otherwise as CSV. The index is not written.
        process_num: Number of part files to merge.
        prefix: Common path prefix of the part files.
        remove_sub: Delete the part files once the merged file is saved.
    """
    part_paths = [prefix + str(idx) + '.csv' for idx in range(process_num)]

    merged = pd.read_csv(prefix + '0.csv')
    for part_path in part_paths[1:]:
        part = pd.read_csv(part_path)
        merged = pd.concat((merged, part), axis=0, join='inner')
        print(merged.shape)

    save = merged.to_excel if save_name.endswith('xlsx') else merged.to_csv
    save(save_name, index=False)

    if not remove_sub:
        return
    for part_path in part_paths:
        os.remove(part_path)

In [None]:
if __name__ == '__main__':
    # Driver: clean the raw records, load the per-shelter weather tables, fan
    # the rows out to worker processes (geocoding + distance + weather), and
    # merge the per-worker CSV files into clean_data.csv.

    # Drop incomplete rows. The explicit copy lets the column assignments
    # below modify an independent frame instead of a view of animalDF.
    infos = animalDF.dropna().copy()

    # Normalise addresses: strip stray whitespace from every space-separated token.
    for col in ['found_address', 'outcome_address']:
        infos[col] = infos[col].apply(
            lambda x: " ".join(item.strip() for item in x.split(' ')) if type(x) == str else x)

    # Keep only rows whose found coordinates are valid longitudes / latitudes.
    infos = infos[(infos.found_lng < 180) & (infos.found_lng > -180) &
                  (infos.found_lat < 90) & (infos.found_lat > -90)].copy()


    def date(para):
        """Turn an integer day count since 1899-12-30 into a date; pass other values through."""
        # bool is a subclass of int and must not be treated as a day count.
        if isinstance(para, (int, np.integer)) and not isinstance(para, bool):
            return (pd.to_datetime('1899-12-30') + pd.Timedelta(days=int(para))).date()
        return para


    # Restrict to the valid time range. The filtered frame used to be stored
    # in `df` and then ignored, so out-of-range rows were still geocoded;
    # `df` stays bound for any later cell that reads it.
    infos['intake_date'] = infos['intake_date'].apply(date)
    df = infos[infos.intake_date <= datetime.datetime.strptime('2021-12-31', '%Y-%m-%d').date()]
    infos = df

    # Load one weather table per shelter present in the data.
    locs = set(infos['shelter_id'].tolist())
    loc_df = {}
    for loc in locs:
        loc_df[loc] = pd.read_csv(f'weather_infos/{loc}.csv')

    print(f'main process（{os.getpid()}）start...')
    prefix = 'process_'
    process_num = 16
    process_list = []
    for i in range(process_num):
        # Positional striding. `.loc[i::n]` would resolve the start as a
        # *label* and the step by position; with the gaps left by the filters
        # above, partitions could then overlap or skip rows.
        part_info = infos.iloc[i::process_num, :]
        p = Process(target=process_lnglat, args=(prefix + f'{i}', part_info, loc_df))
        process_list.append(p)

    for p in process_list:
        p.start()

    for p in process_list:
        p.join()

    # A worker that died leaves no CSV behind; report that directly instead of
    # failing later on a missing file while merging.
    failed = [p.name for p in process_list if p.exitcode != 0]
    if failed:
        raise RuntimeError(f'worker process(es) failed: {failed}')

    concat_data('clean_data.csv', process_num, prefix=prefix)