# Imports

In [1]:
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

## Reading in Data and Initial Overview

In [2]:
# read the raw eviction case records from the Excel workbook in the working directory
df = pd.read_excel('eviction_cases.xlsx')

In [3]:
# inspect column names, dtypes and non-null counts of the raw data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16657 entries, 0 to 16656
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   JP PRECINCT            16657 non-null  object        
 1   CaseNumber             16657 non-null  object        
 2   CaseFileDate           16657 non-null  datetime64[ns]
 3   CORP                   16657 non-null  object        
 4   Plaintiff              16657 non-null  object        
 5   PERSON_ALIAS_ID        16657 non-null  int64         
 6   PlaintiffPhone         16657 non-null  object        
 7   PlaintiffAddress       16657 non-null  object        
 8   Pl_City                16657 non-null  object        
 9   Pl_State               16657 non-null  object        
 10  Pl_Zip                 16657 non-null  object        
 11  DefendantAddress       16028 non-null  object        
 12  Unit Number            8116 non-null   object        
 13  C

In [4]:
# remember the starting row count so later steps can report how many rows were removed
og_data = len(df)
print(f'Original Dataframe Length: {og_data}')

Original Dataframe Length: 16657


## Get and Rename Relevant Columns

In [5]:
# source column -> project name; only these columns are carried forward
column_names = {
    'CaseNumber': 'case_number',        # distinguishes unique cases
    'JUDGMENT_DT': 'judgement_date',    # date the case was ruled on
    'POSTAL_CD': 'zip_code',            # zip code
    'Disposition': 'disposition',       # result of the case
}

# select the relevant columns (in mapping order) and rename them in one pass
df = df[list(column_names)].rename(columns=column_names)

df.head(10)

Unnamed: 0,case_number,judgement_date,zip_code,disposition
0,11E2102330,2024-09-17,78221,Judgment for Plaintiff (OCA)
1,11E2301639,2024-05-01,78227,Agreed Judgments (OCA)
2,11E2303045,2024-01-11,78214,Dismissed
3,11E2303045,2024-01-11,78214,Dismissed
4,11E2303297,2024-02-08,78251,Default Judgments (OCA)
5,11E2303925,2024-01-11,78253-4301,Judgment for Plaintiff (OCA)
6,11E2304048,2024-01-04,78226,Judgment for Plaintiff (OCA)
7,11E2304072,2024-07-09,78221,Take Nothing Judgment (OCA)
8,11E2304114,2024-01-11,78221,Judgment for Plaintiff (OCA)
9,11E2304258,2024-06-18,78214,Judgment for Plaintiff (OCA)


## Dropping Rows with Zip Codes Outside of San Antonio
* 1427 rows were dropped
* 15230 rows remain

In [6]:
# keep only the first five characters of each zip code (e.g. '78253-4301' -> '78253');
# the vectorised string accessor replaces a per-row Python lambda, and assign() builds
# a new frame instead of mutating the existing one in place
df = df.assign(zip_code=df['zip_code'].astype(str).str[:5])

# list of zips in San Antonio
sa_zips = ['78201', '78202', '78203', '78204', '78205', 
           '78206', '78207', '78208', '78209', '78210', 
           '78211', '78212', '78213', '78214', '78215', 
           '78216', '78217', '78218', '78219', '78220', 
           '78221', '78222', '78223', '78224', '78225', 
           '78226', '78227', '78228', '78229', '78230', 
           '78231', '78232', '78233', '78234', '78235', 
           '78236', '78237', '78238', '78239', '78240', 
           '78241', '78242', '78243', '78244', '78245', 
           '78246', '78247', '78248', '78249', '78250', 
           '78251', '78252', '78253', '78254', '78255', 
           '78256', '78257', '78258', '78259', '78260', 
           '78261', '78262', '78263', '78264', '78265', 
           '78266', '78268', '78269', '78270', '78275', 
           '78278', '78279', '78280', '78283', '78284', 
           '78285', '78286', '78287', '78288', '78289', 
           '78291', '78292', '78293', '78294', '78295', 
           '78296', '78297', '78298', '78299']

# keep only rows that have a zip code in sa_zips
df = df[df['zip_code'].isin(sa_zips)]

# record the row count after the zip filter and report how many rows it removed
df_post_zip = len(df)

print(f'{og_data - df_post_zip} rows were dropped')
print(f'{df_post_zip} rows remain')

1427 rows were dropped
15230 rows remain


## Eliminating Duplicate Case Numbers
* Keeping row with the most recent judgement date for each case number
* 702 rows were dropped
* 14528 rows remain

In [7]:
# order rows newest judgement first, then keep the first row seen for each case number,
# so every case retains the row with its most recent judgement date
df = (
    df.sort_values(by='judgement_date', ascending=False)
      .drop_duplicates(subset='case_number', keep='first')
      .reset_index(drop=True)
)

# record the row count after de-duplication and report how many rows it removed
df_post_case = len(df)

print(f'{df_post_zip - df_post_case} rows were dropped')
print(f'{df_post_case} rows remain')

702 rows were dropped
14528 rows remain


## Dropping Cases not Likely to Result in Eviction
* **Rows were dropped if disposition indicated the case would not likely result in eviction**
* **Likely Eviction**
    * Default Judgments (OCA)
    * Judgment for Plaintiff (OCA)
* **Ambiguous or Likely No Eviction**
    * Appealed
    * Take Nothing Judgment (OCA)
    * Non-suited or Dismissed by Plaintiff (OCA)
    * Agreed Judgments (OCA)
    * Dismissed for Want of Prosecution (OCA)
    * Dismissed
    * Trial/Hearing by Judge/Hearing Officer (OCA)
    * Dismissed for Want of Jurisdiction (OCA)
* **Rows were dropped if disposition indicated the case would not likely result in eviction**
* 370 rows were dropped
* 14158 rows remain
    
    
    
    

In [8]:
# dispositions treated as likely to result in eviction; rows with any other value are removed
evict = ['Default Judgments (OCA)', 'Judgment for Plaintiff (OCA)']

likely_eviction = df['disposition'].isin(evict)
df = df.loc[likely_eviction]

# record the row count after the disposition filter and report how many rows it removed
df_post_evict = len(df)

print(f'{df_post_case - df_post_evict} rows were dropped')
print(f'{df_post_evict} rows remain')

370 rows were dropped
14158 rows remain


In [9]:
# count the rows for each disposition value still present
df.disposition.value_counts()

Default Judgments (OCA)         9239
Judgment for Plaintiff (OCA)    4919
Name: disposition, dtype: int64

## Making Sure Data in Columns is the Desired Type

In [10]:
# check dtypes and non-null counts of the four retained columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14158 entries, 0 to 14527
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   case_number     14158 non-null  object        
 1   judgement_date  14158 non-null  datetime64[ns]
 2   zip_code        14158 non-null  object        
 3   disposition     14158 non-null  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 553.0+ KB


## Getting Time Stamp for Data

In [11]:
# report the earliest and latest judgement dates in the dataframe
print(f'Dataframe contains data from {df.judgement_date.min()} to {df.judgement_date.max()}')

Dataframe contains data from 2024-01-02 00:00:00 to 2024-12-19 00:00:00


## Full Prep
* Data represents eviction cases between January 2nd, 2024 and December 19th, 2024
* The original dataframe contained 16,657 rows
* 1427 rows were dropped due to having zip codes outside of San Antonio
* 702 rows were dropped due to having duplicate case numbers
* 370 rows were dropped due to disposition column indicating the case would not likely result in an eviction
* 14,158 rows remain after preparation

In [12]:
# preview the first ten rows of the dataframe
df.head(10)

Unnamed: 0,case_number,judgement_date,zip_code,disposition
0,41E2407310,2024-12-19,78223,Judgment for Plaintiff (OCA)
1,21E2407657,2024-12-19,78229,Default Judgments (OCA)
2,21E2407655,2024-12-19,78229,Default Judgments (OCA)
3,21E2407654,2024-12-19,78229,Default Judgments (OCA)
4,21E2407650,2024-12-19,78251,Default Judgments (OCA)
5,21E2407649,2024-12-19,78229,Default Judgments (OCA)
6,21E2407648,2024-12-19,78229,Default Judgments (OCA)
7,21E2407647,2024-12-19,78229,Judgment for Plaintiff (OCA)
8,21E2407646,2024-12-19,78229,Judgment for Plaintiff (OCA)
9,21E2407645,2024-12-19,78228,Default Judgments (OCA)


In [13]:
def _clean_eviction_cases(raw):
    '''Reduce a raw eviction-case frame to likely evictions in San Antonio.

    Parameters
    ----------
    raw : pandas.DataFrame
        Must contain the columns 'CaseNumber', 'JUDGMENT_DT', 'POSTAL_CD'
        and 'Disposition'; any other columns are discarded.

    Returns
    -------
    pandas.DataFrame
        Columns case_number, judgement_date, zip_code and disposition, with
        at most one row per case number.
    '''
    # source column -> project name; only these columns are carried forward
    column_names = {
        'CaseNumber': 'case_number',        # distinguishes unique cases
        'JUDGMENT_DT': 'judgement_date',    # date the case was ruled on
        'POSTAL_CD': 'zip_code',            # zip code
        'Disposition': 'disposition',       # result of the case
    }

    # list of zips in San Antonio
    sa_zips = [
        '78201', '78202', '78203', '78204', '78205', '78206', '78207', '78208', '78209', '78210',
        '78211', '78212', '78213', '78214', '78215', '78216', '78217', '78218', '78219', '78220',
        '78221', '78222', '78223', '78224', '78225', '78226', '78227', '78228', '78229', '78230',
        '78231', '78232', '78233', '78234', '78235', '78236', '78237', '78238', '78239', '78240',
        '78241', '78242', '78243', '78244', '78245', '78246', '78247', '78248', '78249', '78250',
        '78251', '78252', '78253', '78254', '78255', '78256', '78257', '78258', '78259', '78260',
        '78261', '78262', '78263', '78264', '78265', '78266', '78268', '78269', '78270', '78275',
        '78278', '78279', '78280', '78283', '78284', '78285', '78286', '78287', '78288', '78289',
        '78291', '78292', '78293', '78294', '78295', '78296', '78297', '78298', '78299',
    ]

    # dispositions treated as likely to result in eviction
    evict = ['Default Judgments (OCA)',
             'Judgment for Plaintiff (OCA)']

    cases = raw[list(column_names)].rename(columns=column_names)

    # keep only the first five characters of each zip code (drops any ZIP+4 suffix);
    # the vectorised string accessor avoids a per-row Python lambda
    cases = cases.assign(zip_code=cases['zip_code'].astype(str).str[:5])

    # drop rows with zip codes not in San Antonio
    cases = cases[cases['zip_code'].isin(sa_zips)]

    # newest judgement first, then keep the first row per case number, so each
    # case retains the row with its most recent judgement date
    cases = (
        cases.sort_values(by='judgement_date', ascending=False)
             .drop_duplicates(subset='case_number', keep='first')
             .reset_index(drop=True)
    )

    # drop rows where disposition is not likely to result in eviction
    return cases[cases['disposition'].isin(evict)]


def get_prepared_eviction_data(input_path='eviction_cases.xlsx',
                               output_path='evictions_prepared.xlsx'):
    '''Prepare eviction data for project.

    Reads the raw case export, keeps the relevant columns, restricts the
    rows to San Antonio zip codes, keeps one row per case number (the most
    recent judgement) and keeps only dispositions likely to result in
    eviction. The result is written to an Excel file and also returned.

    Parameters
    ----------
    input_path : str, optional
        Excel workbook holding the unprepared data. Defaults to the path the
        function previously had hard-coded.
    output_path : str, optional
        Where the prepared data is written. Defaults to the path the
        function previously had hard-coded.

    Returns
    -------
    pandas.DataFrame
        The prepared data that was written to output_path.
    '''
    # read in unprepared data and apply the preparation steps
    df = _clean_eviction_cases(pd.read_excel(input_path))

    # the index is still written, exactly as before, so the layout of the
    # output file is unchanged for existing readers
    df.to_excel(output_path)

    return df