# Imports

In [1]:
import pandas as pd
from pyproj import Transformer
import geopandas as gpd
from shapely.geometry import Point

## Reading in Data and Initial Overview

In [2]:
# Load the raw 311 service-call export.
# The row count is printed below instead of being noted in a comment,
# so the number shown always matches the file that was actually read.
df = pd.read_csv('allservicecalls.csv')

# Starting row count; later cells subtract from it to report how many rows were dropped
df_original = len(df)
print(f'original df: {df_original} rows')

original df: 564209 rows


In [3]:
# Summarize the raw data: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564209 entries, 0 to 564208
Data columns (total 17 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Category              564209 non-null  object 
 1   CASEID                564209 non-null  int64  
 2   OPENEDDATETIME        564209 non-null  object 
 3   SLA_Date              563955 non-null  object 
 4   CLOSEDDATETIME        535740 non-null  object 
 5   Late (Yes/No)         564209 non-null  object 
 6   Dept                  564209 non-null  object 
 7   REASONNAME            564209 non-null  object 
 8   TYPENAME              564209 non-null  object 
 9   CaseStatus            564209 non-null  object 
 10  SourceID              564209 non-null  object 
 11  OBJECTDESC            564209 non-null  object 
 12  Council District      564209 non-null  int64  
 13  XCOORD                564180 non-null  float64
 14  YCOORD                564180 non-null  float64
 15  

## Renaming Relevant Columns and Dropping the Others

In [4]:
# Keep only the columns used by the rest of the notebook
df = df[['OPENEDDATETIME',
         'TYPENAME',
         'OBJECTDESC',
         'XCOORD',
         'YCOORD']]

# Give the kept columns short snake_case names.
# Only columns selected above are listed: rename() silently ignores labels that
# are not present, so an entry for a dropped column would have no effect.
df = df.rename(columns={'OPENEDDATETIME': 'open_date',
                        'TYPENAME': 'type',
                        'OBJECTDESC': 'location',
                        'XCOORD': 'x_coord',
                        'YCOORD': 'y_coord'})

df.head()

Unnamed: 0,open_date,type,location,x_coord,y_coord
0,6/3/2018,Damaged Cart,"220 BROADWAY, San Antonio, 78205",2131743.0,13703551.0
1,2/2/2019,Animal Permits Request,"2033 FRIO CITY RD, San Antonio, 78226",2114624.0,13689978.0
2,2/6/2019,Animal Permits Request,"2018 FITCH ST, San Antonio, 78211",2116303.0,13682489.0
3,3/4/2019,Animal Permits Request,"117 LORETTA, San Antonio, 78210",2132578.0,13692347.0
4,3/13/2019,Animal Permits Request,"4032 E SOUTHCROSS, San Antonio, 78222",2154623.0,13684095.0


## Dropping Rows not Associated With Homelessness
* Only keeping rows with the following values in the type column:
    * Homeless Encampment
    * Homeless Outreach
    * Encampment Abatement
    * Sanitation_Encampment_Abatement
    * Sanitation_UF-Encampment Abatement
    * Sanitation_NA-Encampment Abatement

In [5]:
# Service-request types related to homelessness; rows of any other type are discarded
homeless = [
    'Homeless Encampment',
    'Homeless Outreach',
    'Encampment Abatement',
    'Sanitation_Encampment_Abatement',
    'Sanitation_UF-Encampment Abatement',
    'Sanitation_NA-Encampment Abatement',
]

# Boolean mask of the rows to keep, then a fresh 0..n-1 index for the survivors
is_homeless_type = df['type'].isin(homeless)
df = df.loc[is_homeless_type].reset_index(drop=True)

# Row count after the type filter (the ZIP-filter cell reuses it for its own report)
df_drop_type = len(df)

print(f'{df_original - df_drop_type} rows were dropped.')
print(f'{df_drop_type} rows remain.')

561242 rows were dropped.
2967 rows remain.


## Converting X_coord and Y_coord to Latitude and Longitude
* Coordinates stored as NAD 1983 State Plane Texas South Central FIPS 4204 Feet coordinate system
* Converting to Tableau-friendly latitude and longitude coordinates

In [6]:
# Projection from EPSG:2278 (Texas South Central, ft) to EPSG:4326 (WGS84).
# With always_xy=True the inputs are given as (x, y) and the results come back
# as (longitude, latitude), which is the order the unpacking below relies on.
transformer = Transformer.from_crs("EPSG:2278", "EPSG:4326", always_xy=True)

# Project the state-plane coordinates, then store the results as new columns
lon_values, lat_values = transformer.transform(df["x_coord"].values, df["y_coord"].values)
df["longitude"] = lon_values
df["latitude"] = lat_values

# The state-plane columns are no longer needed once latitude/longitude exist
df = df.drop(columns=['x_coord', 'y_coord'])

df.head(10)

Unnamed: 0,open_date,type,location,longitude,latitude
0,10/20/2023,Homeless Encampment,CULEBRA RD and N TRINITY ST,-98.516167,29.444219
1,11/2/2023,Homeless Encampment,"7811 S IH 35, SAN ANTONIO, 78224",-98.534868,29.351602
2,11/6/2023,Homeless Encampment,"10576 DREAMLAND, SAN ANTONIO",-98.541934,29.535004
3,11/28/2023,Homeless Encampment,"36th Street Park, 2542 NW 36TH ST, SAN ANTONI...",-98.569024,29.461083
4,12/4/2023,Homeless Encampment,POWHATAN DR and WILLIAM PENN,-98.556656,29.528313
5,12/6/2023,Homeless Encampment,"5002 WEST AVE, SAN ANTONIO, 78213",-98.525528,29.512414
6,12/16/2023,Homeless Encampment,"1703 FARWELL, SAN ANTONIO, 78213",-98.519163,29.507367
7,12/19/2023,Homeless Encampment,REDLAND RD and US HWY 281 N,-98.464573,29.621459
8,12/19/2023,Homeless Encampment,"Stinson Park, 8214 S FLORES ST, SAN ANTONIO, ...",-98.484157,29.343525
9,12/20/2023,Homeless Encampment,"10103 MOURSUND BLVD, SAN ANTONIO, 78221",-98.503728,29.317004


## Adding Zip Column
   * Get zips from latitude and longitude 

In [7]:
# build one point geometry per row (x = longitude, y = latitude);
# points_from_xy is vectorized, so no per-row Python loop is needed
geometry = gpd.points_from_xy(df['longitude'], df['latitude'])

# convert df to geodataframe
gdf_points = gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')

# load 2024 ZCTA shapefile into a geodataframe
gdf_zips = gpd.read_file('tl_2024_us_zcta520.shp')

# ensure both geoDataFrames use the same CRS
gdf_zips = gdf_zips.to_crs(gdf_points.crs)

# spatially join geodataframes adding zipcode info to gdf_points;
# how='left' keeps every point, so points inside no ZCTA polygon get a missing ZIP
gdf_joined = gpd.sjoin(gdf_points, gdf_zips, how='left', predicate='within')

# add ZIP code column from shapefile
gdf_joined['zips'] = gdf_joined['ZCTA5CE20']

# drop spatial join metadata
gdf_joined = gdf_joined.drop(columns=['index_right'])

# convert back to regular dataframe
df = pd.DataFrame(gdf_joined)

# keep only the analysis columns (drops geometry, location and the other shapefile attributes)
df = df[['open_date',
         'type',
         'latitude',
         'longitude',
         'zips']]

df.head(10)

Unnamed: 0,open_date,type,latitude,longitude,zips
0,10/20/2023,Homeless Encampment,29.444219,-98.516167,78201
1,11/2/2023,Homeless Encampment,29.351602,-98.534868,78224
2,11/6/2023,Homeless Encampment,29.535004,-98.541934,78230
3,11/28/2023,Homeless Encampment,29.461083,-98.569024,78228
4,12/4/2023,Homeless Encampment,29.528313,-98.556656,78230
5,12/6/2023,Homeless Encampment,29.512414,-98.525528,78213
6,12/16/2023,Homeless Encampment,29.507367,-98.519163,78213
7,12/19/2023,Homeless Encampment,29.621459,-98.464573,78259
8,12/19/2023,Homeless Encampment,29.343525,-98.484157,78214
9,12/20/2023,Homeless Encampment,29.317004,-98.503728,78221


## Dropping Rows with Zips not in San Antonio

In [8]:
# Row count going into the ZIP filter
print(len(df))

# ZIP codes treated as San Antonio, stored as strings and grouped by tens
sa_zips = [
    '78201', '78202', '78203', '78204', '78205', '78206', '78207', '78208', '78209',
    '78210', '78211', '78212', '78213', '78214', '78215', '78216', '78217', '78218', '78219',
    '78220', '78221', '78222', '78223', '78224', '78225', '78226', '78227', '78228', '78229',
    '78230', '78231', '78232', '78233', '78234', '78235', '78236', '78237', '78238', '78239',
    '78240', '78241', '78242', '78243', '78244', '78245', '78246', '78247', '78248', '78249',
    '78250', '78251', '78252', '78253', '78254', '78255', '78256', '78257', '78258', '78259',
    '78260', '78261', '78262', '78263', '78264', '78265', '78266', '78268', '78269',
    '78270', '78275', '78278', '78279',
    '78280', '78283', '78284', '78285', '78286', '78287', '78288', '78289',
    '78291', '78292', '78293', '78294', '78295', '78296', '78297', '78298', '78299',
]

# Keep only the rows whose ZIP appears in the list above
in_san_antonio = df['zips'].isin(sa_zips)
df = df[in_san_antonio]

# Report how many rows this filter removed, relative to the count after the type filter
df_drop_zip = len(df)
print(f'{df_drop_type - df_drop_zip} rows were dropped.')
print(f'{df_drop_zip} rows remain.')

2967
2963


Unnamed: 0,open_date,type,latitude,longitude,zips
0,10/20/2023,Homeless Encampment,29.444219,-98.516167,78201
1,11/2/2023,Homeless Encampment,29.351602,-98.534868,78224
2,11/6/2023,Homeless Encampment,29.535004,-98.541934,78230
3,11/28/2023,Homeless Encampment,29.461083,-98.569024,78228
4,12/4/2023,Homeless Encampment,29.528313,-98.556656,78230
...,...,...,...,...,...
2962,4/11/2025,Homeless Outreach,29.500309,-98.503722,78216
2963,4/11/2025,Homeless Encampment,29.342548,-98.516226,78221
2964,4/12/2025,Homeless Encampment,29.490789,-98.483043,78212
2965,4/12/2025,Homeless Encampment,29.519262,-98.597456,78240


## Checking For Nulls
* Data contains no null values after preparation

In [9]:
# Count missing values in each column of the prepared data
df.isnull().sum()

open_date    0
type         0
latitude     0
longitude    0
zips         0
dtype: int64

## Making Sure Data in Columns is the Desired Type

In [None]:
# casting open_date as datetime
# assign() builds a new frame instead of writing into the filtered slice left by
# the ZIP filter, which avoids pandas' SettingWithCopyWarning; the column keeps
# its position and name.
df = df.assign(open_date=pd.to_datetime(df['open_date']))

In [None]:
# Check the column dtypes after the datetime cast
df.info()

## Getting Time Stamp for Data
* Data is between October 20th, 2023 and April 12th, 2025

In [20]:
# Report the earliest and latest request dates in the prepared data
first_date = df['open_date'].min()
last_date = df['open_date'].max()
print(f'Dataframe contains data from {first_date} to {last_date}')

Dataframe contains data from 2023-10-20 00:00:00 to 2025-04-12 00:00:00


## Full Prep
* Data represents 311 service requests taken between October 20th, 2023 and April 12th, 2025
* The original dataframe contained 564209 rows
* 2963 remain after dropping rows for the following reasons
    * 561242 rows were dropped due to type column not being associated with homelessness
    * 4 were dropped due to zipcodes being outside of San Antonio
    


In [21]:
def get_preped_311_data(calls_csv='allservicecalls.csv',
                        zcta_shapefile='tl_2024_us_zcta520.shp',
                        output_xlsx='311_geopandas_prep.xlsx'):
    '''Prepare 311 data for project.

    Runs the full preparation pipeline in one call:
      1. read the raw service-call CSV and keep/rename the relevant columns
      2. keep only rows whose type is associated with homelessness
      3. convert EPSG:2278 x/y coordinates to longitude/latitude (EPSG:4326)
      4. look up each point's ZIP code through a spatial join with the ZCTA shapefile
      5. keep only rows whose ZIP code is in the San Antonio list
      6. cast open_date to datetime and write the result to an Excel file

    Parameters
    ----------
    calls_csv : str
        Path of the raw 311 service-call CSV.
    zcta_shapefile : str
        Path of the ZCTA shapefile; it must provide a 'ZCTA5CE20' column.
    output_xlsx : str
        Path of the Excel file the prepared data is written to.

    Returns
    -------
    pandas.DataFrame
        The prepared data with columns open_date, type, latitude, longitude
        and zips (the same frame that is written to output_xlsx).
    '''
    # read in original data
    df = pd.read_csv(calls_csv)

    # get and rename only relevant columns
    # (the rename map lists only selected columns; rename() ignores missing labels)
    df = df[['OPENEDDATETIME',
             'TYPENAME',
             'OBJECTDESC',
             'XCOORD',
             'YCOORD']]

    df = df.rename(columns={'OPENEDDATETIME': 'open_date',
                            'TYPENAME': 'type',
                            'OBJECTDESC': 'location',
                            'XCOORD': 'x_coord',
                            'YCOORD': 'y_coord'})

    # keep only rows with values in type column that are associated with homelessness
    homeless = ['Homeless Encampment',
                'Homeless Outreach',
                'Encampment Abatement',
                'Sanitation_Encampment_Abatement',
                'Sanitation_UF-Encampment Abatement',
                'Sanitation_NA-Encampment Abatement']

    df = df[df['type'].isin(homeless)].reset_index(drop=True)

    # convert coordinates to latitude and longitude
    # define transformer: from EPSG:2278 (Texas South Central, ft) to EPSG:4326 (WGS84)
    transformer = Transformer.from_crs("EPSG:2278", "EPSG:4326", always_xy=True)

    # create new columns using x and y _coord columns
    df["longitude"], df["latitude"] = transformer.transform(df["x_coord"].values, df["y_coord"].values)

    # drop original coordinates
    df = df.drop(columns=['x_coord', 'y_coord'])

    # get zipcodes
    # build one point per row (x = longitude, y = latitude) without a per-row Python loop
    geometry = gpd.points_from_xy(df['longitude'], df['latitude'])

    # convert df to geodataframe
    gdf_points = gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')

    # load ZCTA shapefile into a geodataframe
    gdf_zips = gpd.read_file(zcta_shapefile)

    # ensure both geoDataFrames use the same CRS
    gdf_zips = gdf_zips.to_crs(gdf_points.crs)

    # spatially join geodataframes adding zipcode info to gdf_points;
    # how='left' keeps every point, so points inside no ZCTA polygon get a missing ZIP
    gdf_joined = gpd.sjoin(gdf_points, gdf_zips, how='left', predicate='within')

    # add ZIP code column from shapefile
    gdf_joined['zips'] = gdf_joined['ZCTA5CE20']

    # convert back to a regular dataframe and keep only the analysis columns
    # (this also discards geometry, location and the spatial-join metadata)
    df = pd.DataFrame(gdf_joined)[['open_date',
                                   'type',
                                   'latitude',
                                   'longitude',
                                   'zips']]

    # drop rows with zipcodes not in San Antonio
    # list of zips in San Antonio
    sa_zips = ['78201', '78202', '78203', '78204', '78205',
               '78206', '78207', '78208', '78209', '78210',
               '78211', '78212', '78213', '78214', '78215',
               '78216', '78217', '78218', '78219', '78220',
               '78221', '78222', '78223', '78224', '78225',
               '78226', '78227', '78228', '78229', '78230',
               '78231', '78232', '78233', '78234', '78235',
               '78236', '78237', '78238', '78239', '78240',
               '78241', '78242', '78243', '78244', '78245',
               '78246', '78247', '78248', '78249', '78250',
               '78251', '78252', '78253', '78254', '78255',
               '78256', '78257', '78258', '78259', '78260',
               '78261', '78262', '78263', '78264', '78265',
               '78266', '78268', '78269', '78270', '78275',
               '78278', '78279', '78280', '78283', '78284',
               '78285', '78286', '78287', '78288', '78289',
               '78291', '78292', '78293', '78294', '78295',
               '78296', '78297', '78298', '78299']

    # .copy() makes the filtered frame independent, so the column assignment
    # below does not trigger pandas' SettingWithCopyWarning
    df = df[df.zips.isin(sa_zips)].copy()

    # casting open_date as datetime
    df['open_date'] = pd.to_datetime(df['open_date'])

    # persist the prepared data, then hand it back to the caller
    df.to_excel(output_xlsx)

    return df