# Imports

In [1]:
import pandas as pd
from pyproj import Transformer
import geopandas as gpd
from shapely.geometry import Point

import warnings
warnings.filterwarnings("ignore")

## Reading in Data and Initial Overview

In [2]:
# read the 311 service-request CSV (expected in the working directory) into a DataFrame
df = pd.read_csv('311_service_requests.csv')

In [3]:
# column names, non-null counts and dtypes of the freshly loaded data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564209 entries, 0 to 564208
Data columns (total 17 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Category              564209 non-null  object 
 1   CASEID                564209 non-null  int64  
 2   OPENEDDATETIME        564209 non-null  object 
 3   SLA_Date              563955 non-null  object 
 4   CLOSEDDATETIME        535740 non-null  object 
 5   Late (Yes/No)         564209 non-null  object 
 6   Dept                  564209 non-null  object 
 7   REASONNAME            564209 non-null  object 
 8   TYPENAME              564209 non-null  object 
 9   CaseStatus            564209 non-null  object 
 10  SourceID              564209 non-null  object 
 11  OBJECTDESC            564209 non-null  object 
 12  Council District      564209 non-null  int64  
 13  XCOORD                564180 non-null  float64
 14  YCOORD                564180 non-null  float64
 15  

In [4]:
# remember the starting row count so later cells can report how many rows were dropped
df_original = len(df)
print(f'Original Dataframe Length: {df_original} rows')

Original Dataframe Length: 564209 rows


## Renaming Relevant Columns and Dropping the Others

In [5]:
# Keep only the columns used downstream and give them short snake_case names.
# 'Council District' is not part of the selection, so it gets no rename entry
# (the previous mapping for it was silently ignored by pandas).
df = (
    df[['OPENEDDATETIME', 'TYPENAME', 'XCOORD', 'YCOORD']]
    .rename(columns={'OPENEDDATETIME': 'open_date',
                     'TYPENAME': 'type',
                     'XCOORD': 'x_coord',
                     'YCOORD': 'y_coord'})
)

df.head()

Unnamed: 0,open_date,type,x_coord,y_coord
0,6/3/2018,Damaged Cart,2131743.0,13703551.0
1,2/2/2019,Animal Permits Request,2114624.0,13689978.0
2,2/6/2019,Animal Permits Request,2116303.0,13682489.0
3,3/4/2019,Animal Permits Request,2132578.0,13692347.0
4,3/13/2019,Animal Permits Request,2154623.0,13684095.0


## Dropping Rows not Associated With Homelessness
* Only keeping rows with the following values in the type column:
    * Homeless Encampment
    * Homeless Outreach
    * Encampment Abatement
    * Sanitation_Encampment_Abatement
    * Sanitation_UF-Encampment Abatement
    * Sanitation_NA-Encampment Abatement

In [6]:
# request types that are associated with homelessness
homeless = [
    'Homeless Encampment',
    'Homeless Outreach',
    'Encampment Abatement',
    'Sanitation_Encampment_Abatement',
    'Sanitation_UF-Encampment Abatement',
    'Sanitation_NA-Encampment Abatement',
]

# keep only those request types and renumber the rows from 0
df = df.loc[df['type'].isin(homeless)].reset_index(drop=True)

# row count after the type filter, reused by the zip-code cell further down
df_drop_type = len(df)

print(f'{df_original - df_drop_type} rows were dropped')
print(f'{df_drop_type} rows remain')

561242 rows were dropped
2967 rows remain


## Converting X_coord and Y_coord to Latitude and Longitude
* Coordinates stored as NAD 1983 State Plane Texas South Central FIPS 4204 Feet coordinate system
* Converting to Tableau-friendly latitude and longitude coordinates

In [7]:
# Transformer from EPSG:2278 (Texas South Central, ft) to EPSG:4326 (WGS84).
# always_xy=True makes the output order (longitude, latitude).
transformer = Transformer.from_crs("EPSG:2278", "EPSG:4326", always_xy=True)

# project the state-plane x/y pairs and store the results as new columns
df["longitude"], df["latitude"] = transformer.transform(
    df["x_coord"].values,
    df["y_coord"].values,
)

# the state-plane coordinates are no longer needed
df = df.drop(['x_coord', 'y_coord'], axis=1)

df.head(10)

Unnamed: 0,open_date,type,longitude,latitude
0,10/20/2023,Homeless Encampment,-98.516167,29.444219
1,11/2/2023,Homeless Encampment,-98.534868,29.351602
2,11/6/2023,Homeless Encampment,-98.541934,29.535004
3,11/28/2023,Homeless Encampment,-98.569024,29.461083
4,12/4/2023,Homeless Encampment,-98.556656,29.528313
5,12/6/2023,Homeless Encampment,-98.525528,29.512414
6,12/16/2023,Homeless Encampment,-98.519163,29.507367
7,12/19/2023,Homeless Encampment,-98.464573,29.621459
8,12/19/2023,Homeless Encampment,-98.484157,29.343525
9,12/20/2023,Homeless Encampment,-98.503728,29.317004


## Adding Zip Code Column
   * Get zips from latitude and longitude 

In [8]:
# one shapely Point(longitude, latitude) per request
geometry = [Point(lon, lat) for lon, lat in zip(df['longitude'], df['latitude'])]

# wrap the requests in a GeoDataFrame using WGS84 coordinates
gdf_points = gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')

# load the 2024 ZCTA shapefile and reproject it to the CRS of the points
gdf_zips = gpd.read_file('tl_2024_us_zcta520.shp').to_crs(gdf_points.crs)

# spatially join: attach the ZCTA polygon each point falls within (left join keeps every request)
gdf_joined = gpd.sjoin(gdf_points, gdf_zips, how='left', predicate='within')

# expose the shapefile's ZCTA code under a friendlier name
gdf_joined['zip_code'] = gdf_joined['ZCTA5CE20']

# drop spatial join metadata
gdf_joined = gdf_joined.drop(columns=['index_right'])

# back to a plain DataFrame, keeping only the columns needed downstream
df = pd.DataFrame(gdf_joined)[['open_date',
                               'type',
                               'latitude',
                               'longitude',
                               'zip_code']]

df.head(10)

Unnamed: 0,open_date,type,latitude,longitude,zip_code
0,10/20/2023,Homeless Encampment,29.444219,-98.516167,78201
1,11/2/2023,Homeless Encampment,29.351602,-98.534868,78224
2,11/6/2023,Homeless Encampment,29.535004,-98.541934,78230
3,11/28/2023,Homeless Encampment,29.461083,-98.569024,78228
4,12/4/2023,Homeless Encampment,29.528313,-98.556656,78230
5,12/6/2023,Homeless Encampment,29.512414,-98.525528,78213
6,12/16/2023,Homeless Encampment,29.507367,-98.519163,78213
7,12/19/2023,Homeless Encampment,29.621459,-98.464573,78259
8,12/19/2023,Homeless Encampment,29.343525,-98.484157,78214
9,12/20/2023,Homeless Encampment,29.317004,-98.503728,78221


## Dropping Rows with Zips not in San Antonio
* 4 rows were dropped
* 2963 rows remain

In [9]:
# row count before the zip-code filter
print(len(df))

# zip codes treated as San Antonio; rows with any other zip are removed
sa_zips = ['78201', '78202', '78203', '78204', '78205',
           '78206', '78207', '78208', '78209', '78210',
           '78211', '78212', '78213', '78214', '78215',
           '78216', '78217', '78218', '78219', '78220',
           '78221', '78222', '78223', '78224', '78225',
           '78226', '78227', '78228', '78229', '78230',
           '78231', '78232', '78233', '78234', '78235',
           '78236', '78237', '78238', '78239', '78240',
           '78241', '78242', '78243', '78244', '78245',
           '78246', '78247', '78248', '78249', '78250',
           '78251', '78252', '78253', '78254', '78255',
           '78256', '78257', '78258', '78259', '78260',
           '78261', '78262', '78263', '78264', '78265',
           '78266', '78268', '78269', '78270', '78275',
           '78278', '78279', '78280', '78283', '78284',
           '78285', '78286', '78287', '78288', '78289',
           '78291', '78292', '78293', '78294', '78295',
           '78296', '78297', '78298', '78299']

df = df.loc[df['zip_code'].isin(sa_zips)]

# report how many rows the zip filter removed relative to the type-filtered count
df_drop_zip = len(df)
print(f'{df_drop_type - df_drop_zip} rows were dropped')
print(f'{df_drop_zip} rows remain')

2967
4 rows were dropped
2963 rows remain


## Checking For Nulls
* Data contains no null values after preparation

In [10]:
# count missing values per column
df.isnull().sum()

open_date    0
type         0
latitude     0
longitude    0
zip_code     0
dtype: int64

## Making Sure Data in Columns is the Desired Type

In [11]:
# Cast open_date to datetime.
# df is a boolean-filtered slice at this point, so assigning a column in place
# triggers pandas' SettingWithCopyWarning; assign() builds a new frame instead
# and keeps the column order unchanged.
df = df.assign(open_date=pd.to_datetime(df['open_date']))

In [12]:
# re-check the column dtypes after the datetime cast
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2963 entries, 0 to 2966
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   open_date  2963 non-null   datetime64[ns]
 1   type       2963 non-null   object        
 2   latitude   2963 non-null   float64       
 3   longitude  2963 non-null   float64       
 4   zip_code   2963 non-null   object        
dtypes: datetime64[ns](1), float64(2), object(2)
memory usage: 138.9+ KB


## Getting Time Stamp for Data
* Data is between October 20th, 2023 and April 12th, 2025

In [13]:
# report the earliest and latest open_date present in the data
print(f'Dataframe contains data from {df.open_date.min()} to {df.open_date.max()}')

Dataframe contains data from 2023-10-20 00:00:00 to 2025-04-12 00:00:00


## Full Prep
* Data represents 311 service requests taken between October 20th, 2023 and April 12th, 2025
* The original dataframe contained 564209 rows
* 2963 remain after dropping rows for the following reasons
    * 561242 rows were dropped due to type column not being associated with homelessness
    * 4 were dropped due to zipcodes being outside of San Antonio
    


In [14]:
# preview the first ten rows of the prepared data
df.head(10)

Unnamed: 0,open_date,type,latitude,longitude,zip_code
0,2023-10-20,Homeless Encampment,29.444219,-98.516167,78201
1,2023-11-02,Homeless Encampment,29.351602,-98.534868,78224
2,2023-11-06,Homeless Encampment,29.535004,-98.541934,78230
3,2023-11-28,Homeless Encampment,29.461083,-98.569024,78228
4,2023-12-04,Homeless Encampment,29.528313,-98.556656,78230
5,2023-12-06,Homeless Encampment,29.512414,-98.525528,78213
6,2023-12-16,Homeless Encampment,29.507367,-98.519163,78213
7,2023-12-19,Homeless Encampment,29.621459,-98.464573,78259
8,2023-12-19,Homeless Encampment,29.343525,-98.484157,78214
9,2023-12-20,Homeless Encampment,29.317004,-98.503728,78221


In [15]:
def get_preped_311_data(csv_path='311_service_requests.csv',
                        zcta_path='tl_2024_us_zcta520.shp',
                        out_path='requests_prepared.xlsx'):
    '''Prepare 311 data for project.

    Steps, in order:
      1. read the 311 service-request CSV and keep the open date, request
         type and x/y coordinate columns (renamed to snake_case)
      2. keep only request types associated with homelessness
      3. convert the EPSG:2278 x/y coordinates to EPSG:4326 longitude/latitude
      4. spatially join against the ZCTA shapefile to get a zip code per row
      5. keep only rows whose zip code is in the San Antonio list
      6. cast open_date to datetime
      7. write the result to an Excel file

    Parameters
    ----------
    csv_path : str
        Path of the 311 service-request CSV to read.
    zcta_path : str
        Path of the ZCTA shapefile used for the zip-code lookup.
    out_path : str
        Path of the Excel file to write.

    Returns
    -------
    pandas.DataFrame
        The prepared data with columns open_date, type, latitude,
        longitude and zip_code (the same frame that is written to out_path).
    '''
    # read in original data
    df = pd.read_csv(csv_path)

    # Get relevant columns and give them short names
    df = df[['OPENEDDATETIME', 'TYPENAME', 'XCOORD', 'YCOORD']]
    df = df.rename(columns={'OPENEDDATETIME': 'open_date',
                            'TYPENAME': 'type',
                            'XCOORD': 'x_coord',
                            'YCOORD': 'y_coord'})

    # keep only rows with values in type column that are associated with homelessness
    homeless = ['Homeless Encampment',
                'Homeless Outreach',
                'Encampment Abatement',
                'Sanitation_Encampment_Abatement',
                'Sanitation_UF-Encampment Abatement',
                'Sanitation_NA-Encampment Abatement']

    df = df[df['type'].isin(homeless)].reset_index(drop=True)

    # convert coordinates to latitude and longitude
    # define transformer: from EPSG:2278 (Texas South Central, ft) to EPSG:4326 (WGS84)
    # always_xy=True makes the output order (longitude, latitude)
    transformer = Transformer.from_crs("EPSG:2278", "EPSG:4326", always_xy=True)

    # create new columns using x and y _coord columns
    df["longitude"], df["latitude"] = transformer.transform(df["x_coord"].values, df["y_coord"].values)

    # drop original coordinates
    df = df.drop(columns=['x_coord', 'y_coord'])

    # get zipcodes
    # get list with Point(longitude, latitude) for data frame
    geometry = [Point(xy) for xy in zip(df['longitude'], df['latitude'])]

    # convert df to geodataframe
    gdf_points = gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')

    # load ZCTA shapefile into a geodataframe
    gdf_zips = gpd.read_file(zcta_path)

    # ensure both geoDataFrames use the same CRS
    gdf_zips = gdf_zips.to_crs(gdf_points.crs)

    # spatially join geodataframes adding zipcode info to gdf_points
    gdf_joined = gpd.sjoin(gdf_points, gdf_zips, how='left', predicate='within')

    # add ZIP code column from shapefile
    # (named zip_code so the filter below refers to the same column)
    gdf_joined['zip_code'] = gdf_joined['ZCTA5CE20']

    # drop spatial join metadata
    gdf_joined = gdf_joined.drop(columns=['index_right'])

    # convert back to regular dataframe
    df = pd.DataFrame(gdf_joined)

    # drop parsing columns
    df = df[['open_date',
             'type',
             'latitude',
             'longitude',
             'zip_code']]

    # drop rows with zipcodes not in San Antonio
    # list of zips in San Antonio
    sa_zips = ['78201', '78202', '78203', '78204', '78205',
               '78206', '78207', '78208', '78209', '78210',
               '78211', '78212', '78213', '78214', '78215',
               '78216', '78217', '78218', '78219', '78220',
               '78221', '78222', '78223', '78224', '78225',
               '78226', '78227', '78228', '78229', '78230',
               '78231', '78232', '78233', '78234', '78235',
               '78236', '78237', '78238', '78239', '78240',
               '78241', '78242', '78243', '78244', '78245',
               '78246', '78247', '78248', '78249', '78250',
               '78251', '78252', '78253', '78254', '78255',
               '78256', '78257', '78258', '78259', '78260',
               '78261', '78262', '78263', '78264', '78265',
               '78266', '78268', '78269', '78270', '78275',
               '78278', '78279', '78280', '78283', '78284',
               '78285', '78286', '78287', '78288', '78289',
               '78291', '78292', '78293', '78294', '78295',
               '78296', '78297', '78298', '78299']

    # .copy() so the column assignment below works on an independent frame
    # rather than on a filtered slice (avoids SettingWithCopyWarning)
    df = df[df['zip_code'].isin(sa_zips)].copy()

    # casting open_date as datetime
    df['open_date'] = pd.to_datetime(df['open_date'])

    # write the prepared data (the DataFrame index is written as the first column)
    df.to_excel(out_path)

    return df