In [1]:
#### Import the libraries needed
import pandas as pd
import datetime as dt

In [2]:
# Mount Google Drive at /content/drive so the data files referenced below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Location of the cleaned crime CSV on the mounted Drive; later cells read it and write back to it.
crime_file = '/content/drive/MyDrive/Omdena/Houston Local Chapter/Weather&Crime/Data/Cleaned data/all_crime_features_2010_2023_w_nibrs_class.csv'

In [4]:
# Columns whose dtype is fixed up front instead of being inferred by pandas:
# the count is an integer, every code/identifier-like column is kept as text.
crime_dtypes = {
    'Offense Count': int,
    'Beat': str,
    'Block Range': str,
    'City': str,
    'ZIP Code': str,
    'Street No': str,
    'aggregate_cleaned_class': str,
}
all_df = pd.read_csv(crime_file, dtype=crime_dtypes)

In [5]:
# Normalise the zero-padded class code '09' to '9' so both spellings fall into one category.
padded_code = all_df['aggregate_cleaned_class'].eq('09')
if padded_code.any():
    all_df.loc[padded_code, 'aggregate_cleaned_class'] = '9'
    # Persist the correction to the source file only when something actually changed:
    # rewriting the whole CSV on every run is slow and needlessly puts the input data at risk.
    all_df.to_csv(crime_file, index=False)
# Display the distinct class codes to confirm that '09' is gone.
all_df['aggregate_cleaned_class'].unique()

array(['23', '220', '120', '240', '13', '11', '9', '90Z', '290', '26',
       '520', '35', '90J', '250', '720', '90F', '90D', '90A', '36', '90C',
       '280', '90E', '40', '210', '64', '270', '90H', '90G', '200', '370',
       '39', '100', '90I', '510', '90B'], dtype=object)

In [6]:
# Preview the first five records, transposed so each column is shown as a row.
all_df.head(5).T

Unnamed: 0,0,1,2,3,4
Occurrence Date,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01
Offense Count,1,1,1,1,1
Beat,3B10,5F20,1A10,7C10,18F20
Premise,20R,20D,05O,20R,18A
Block Range,4900-4999,8700-8799,400-499,1900-1999,3300-3399
Street Name,POINCIANA,HAMMERLY,MAIN,LOCKWOOD,MCCUE
Street Type,DR,-,ST,DR,RD
Suffix,-,-,-,-,-
Incident,,,,,
City,,,,,


In [7]:
# Inspect the dtype of every column after loading.
all_df.dtypes

Occurrence Date             object
Offense Count                int64
Beat                        object
Premise                     object
Block Range                 object
Street Name                 object
Street Type                 object
Suffix                      object
Incident                   float64
City                        object
ZIP Code                    object
Street No                   object
MapLongitude               float64
MapLatitude                float64
cleaned_occurence_hour       int64
week                        object
month                        int64
year                         int64
mon_year                    object
season                      object
is_holiday                   int64
is_weekend                   int64
cleaned_description         object
cleaned_class               object
aggregate_cleaned_class     object
dtype: object

In [8]:
# Show the records whose 'Offense Count' is zero.
zero_count_mask = all_df['Offense Count'].eq(0)
all_df.loc[zero_count_mask]

Unnamed: 0,Occurrence Date,Offense Count,Beat,Premise,Block Range,Street Name,Street Type,Suffix,Incident,City,...,week,month,year,mon_year,season,is_holiday,is_weekend,cleaned_description,cleaned_class,aggregate_cleaned_class
1082632,2018-08-05,0,10H20,"Residence, Home (Includes Apartment)",4100-4199,RUSK,ST,,,,...,Sunday,8,2018,Aug-2018,Summer,0,1,motor vehicle theft,240,240
1084765,2018-08-08,0,12D10,"Parking Lot, Garage",1200-1299,REDFORD,ST,,,,...,Wednesday,8,2018,Aug-2018,Summer,0,0,motor vehicle theft,240,240
1121089,2018-09-30,0,6B40,"Residence, Home (Includes Apartment)",900-999,HELMS,RD,,,,...,Sunday,9,2018,Sep-2018,Autumn,0,1,motor vehicle theft,240,240
1162858,2018-12-02,0,6B20,"Other, Unknown",3700-3799,PINEMONT,DR,,,,...,Sunday,12,2018,Dec-2018,Winter,0,1,forcible fondling,11D,11
1187774,2019-01-11,0,20G60,"Parking Lot, Garage",11999,KATY,FWY,,42367019.0,HOUSTON,...,Friday,1,2019,Jan-2019,Winter,0,0,motor vehicle theft,240,240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2283699,2023-01-22,0,8C10,"Residence, Home (Includes Apartment)",,TIDWELL,RD,,10797123.0,HOUSTON,...,Sunday,1,2023,Jan-2023,Winter,0,1,motor vehicle theft,240,240
2286950,2023-03-02,0,8C50,"Highway, Road, Street, Alley",,TIDWELL,RD,,32925723.0,HOUSTON,...,Thursday,3,2023,Mar-2023,Spring,0,0,motor vehicle theft,240,240
2287710,2023-02-19,0,8C60,"Highway, Road, Street, Alley",,BRETSHIRE,DR,,24986523.0,HOUSTON,...,Sunday,2,2023,Feb-2023,Winter,0,1,motor vehicle theft,240,240
2292204,2023-07-27,0,9C40,"Residence, Home (Includes Apartment)",,FLEMING,DR,,106438323.0,HOUSTON,...,Thursday,7,2023,Jul-2023,Summer,0,0,aggravated assault,13A,13


# Calculate daily crime counts

In [9]:
# Total offenses per day: sum 'Offense Count' over all records that share an occurrence date.
records_by_date = all_df.groupby(['Occurrence Date'], as_index=False)
daily_crime_count = records_by_date['Offense Count'].sum()
daily_crime_count

Unnamed: 0,Occurrence Date,Offense Count
0,2010-01-01,416
1,2010-01-02,344
2,2010-01-03,339
3,2010-01-04,384
4,2010-01-05,337
...,...,...
4955,2023-07-27,683
4956,2023-07-28,690
4957,2023-07-29,693
4958,2023-07-30,661


In [10]:
# Keep only the date and calendar columns, the offense count and the aggregated class code.
selected_columns = [
    'Occurrence Date',
    'week',
    'month',
    'year',
    'mon_year',
    'season',
    'is_holiday',
    'is_weekend',
    'Offense Count',
    'aggregate_cleaned_class',
]
df = all_df[selected_columns]

In [11]:
# Confirm the dtypes of the selected columns.
df.dtypes

Occurrence Date            object
week                       object
month                       int64
year                        int64
mon_year                   object
season                     object
is_holiday                  int64
is_weekend                  int64
Offense Count               int64
aggregate_cleaned_class    object
dtype: object

# Calculate daily crime counts per offense category

In [12]:
# Long table to wide table: one row per occurrence date, one column per aggregated class,
# each cell holding the summed offense count (0 where a class has no record on that date).
table = (
    df.pivot_table(
        values='Offense Count',
        index='Occurrence Date',
        columns='aggregate_cleaned_class',
        aggfunc="sum",
        fill_value=0,
    )
    # Scalar keys give a single-level column index, so only the leftover axis name
    # has to be cleared before the date is turned back into a regular column.
    .rename_axis(columns=None)
    .reset_index()
)

In [13]:
# Merge the per-class counts with the daily total and with the calendar features of the same day.
# Both merges are declared one-to-one so pandas raises a MergeError instead of silently
# duplicating a date, which would repeat rows in the daily table.
table = table.merge(daily_crime_count, on='Occurrence Date', validate='one_to_one')

# The calendar features are expected to be identical for all records of a day, so dropping
# duplicates should leave exactly one row per date; validate enforces that expectation.
day_features = df.drop(columns=['aggregate_cleaned_class', 'Offense Count']).drop_duplicates()
table = table.merge(day_features, how='left', on='Occurrence Date', validate='one_to_one')

In [14]:
# Lookup table pairing each aggregated class code with its offense-category name.
# The code column is read as text, the same way the crime file is loaded, so the keys keep
# their exact spelling and match the pivoted column names regardless of type inference.
label_names = pd.read_csv(
    '/content/drive/MyDrive/Omdena/Houston Local Chapter/Weather&Crime/Data/Cleaned data/Label_meaning_crime_code_offense categories.csv',
    dtype={'aggregate_cleaned_class': str},
)

In [15]:
# Rename the class-code columns to their offense-category names.
label_names_dict = {
    code: category
    for code, category in zip(label_names['aggregate_cleaned_class'], label_names['offense categories'])
}
table = table.rename(columns=label_names_dict)

In [16]:
# Display the daily table after the class-code columns have been renamed.
table

Unnamed: 0,Occurrence Date,Kidnapping/Abduction,"Sex Offenses, Forcible",Robbery,Assault Offenses,Arson,Extortion/Blackmail,Burglary/Breaking & Entering,Larceny/Theft Offenses,Motor Vehicle Theft,...,Trespass of Real Property,All Other Offenses,Offense Count,week,month,year,mon_year,season,is_holiday,is_weekend
0,2010-01-01,0,13,31,58,0,0,77,215,21,...,0,0,416,Friday,1,2010,Jan-2010,Winter,1,0
1,2010-01-02,0,2,32,15,0,0,67,200,28,...,0,0,344,Saturday,1,2010,Jan-2010,Winter,0,1
2,2010-01-03,0,0,38,35,0,0,52,179,35,...,0,0,339,Sunday,1,2010,Jan-2010,Winter,0,1
3,2010-01-04,0,1,24,25,0,0,94,211,29,...,0,0,384,Monday,1,2010,Jan-2010,Winter,0,0
4,2010-01-05,0,3,19,22,0,0,88,183,20,...,0,0,337,Tuesday,1,2010,Jan-2010,Winter,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4955,2023-07-27,0,1,28,142,0,1,27,215,56,...,26,42,683,Thursday,7,2023,Jul-2023,Summer,0,0
4956,2023-07-28,3,2,24,158,0,0,43,205,69,...,29,41,690,Friday,7,2023,Jul-2023,Summer,0,0
4957,2023-07-29,2,6,15,204,1,0,33,177,57,...,18,38,693,Saturday,7,2023,Jul-2023,Summer,0,1
4958,2023-07-30,0,9,16,200,1,0,28,139,50,...,18,44,661,Sunday,7,2023,Jul-2023,Summer,0,1


# Create data for model development

In [17]:
# Read the weather data, drop the 'name' column and expose 'datetime' under the name 'date'.
weather_df = (
    pd.read_csv('/content/drive/MyDrive/Omdena/Houston Local Chapter/Weather&Crime/Data/Cleaned data/cleaned_weather.csv')
    .drop(columns=['name'])
    .rename(columns={'datetime': 'date'})
)
# Parse the sunrise/sunset strings into full timestamps.
for time_col in ('sunrise', 'sunset'):
    weather_df[time_col] = pd.to_datetime(weather_df[time_col])

In [18]:
# Preview the first five weather records.
weather_df.head(5)

Unnamed: 0,date,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,...,solarradiation,solarenergy,uvindex,sunrise,sunset,moonphase,conditions,description,icon,stations
0,2010-01-01,53.9,41.3,46.9,53.9,33.0,42.3,34.0,63.0,0.0,...,174.8,15.0,7.0,2010-01-01 07:16:59,2010-01-01 17:33:24,0.53,Partially cloudy,Clearing in the afternoon.,partly-cloudy-day,"KHOU,72059400188,KIAH,KMCJ,72244012918,7224301..."
1,2010-02-01,53.7,40.1,46.3,53.7,34.9,43.1,30.9,56.9,0.0,...,156.6,13.4,6.0,2010-01-02 07:17:13,2010-01-02 17:34:07,0.57,Clear,Clear conditions throughout the day.,clear-day,"KHOU,72059400188,KIAH,KMCJ,72244012918,7224301..."
2,2010-03-01,47.4,41.1,44.8,46.5,33.8,40.8,35.6,70.2,0.0,...,80.2,6.7,3.0,2010-01-03 07:17:25,2010-01-03 17:34:50,0.6,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"KHOU,72059400188,KIAH,KMCJ,72244012918,7224301..."
3,2010-04-01,46.4,36.2,41.8,42.1,28.7,35.2,29.4,62.0,0.0,...,167.4,14.5,7.0,2010-01-04 07:17:35,2010-01-04 17:35:35,0.64,Partially cloudy,Clearing in the afternoon.,partly-cloudy-day,"KHOU,72059400188,KIAH,KMCJ,72244012918,7224301..."
4,2010-05-01,48.2,31.7,38.7,47.5,24.0,34.3,23.6,56.1,0.0,...,174.1,15.0,7.0,2010-01-05 07:17:44,2010-01-05 17:36:20,0.67,Clear,Clear conditions throughout the day.,clear-day,"KHOU,72059400188,KIAH,KMCJ,72244012918,7224301..."


**Some `date` values are in the format YYYY-DD-MM. Convert all incorrect dates to the format YYYY-MM-DD.**

In [19]:
# Rebuild the 'date' column from the calendar day of 'sunrise'. As a sanity check, first
# report how many rows have sunrise and sunset falling on different days.
sunrise_day = weather_df['sunrise'].dt.date
sunset_day = weather_df['sunset'].dt.date
print('inconsistent date records number between columns of sunrise and sunset: ', (sunrise_day != sunset_day).sum())
weather_df['date'] = pd.to_datetime(sunrise_day)

inconsistent date records number between columns of sunrise and sunset:  0


In [20]:
# Reduce sunrise/sunset from full timestamps to the time of day only.
for time_col in ('sunrise', 'sunset'):
    weather_df[time_col] = weather_df[time_col].dt.time
weather_df.head()

Unnamed: 0,date,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,...,solarradiation,solarenergy,uvindex,sunrise,sunset,moonphase,conditions,description,icon,stations
0,2010-01-01,53.9,41.3,46.9,53.9,33.0,42.3,34.0,63.0,0.0,...,174.8,15.0,7.0,07:16:59,17:33:24,0.53,Partially cloudy,Clearing in the afternoon.,partly-cloudy-day,"KHOU,72059400188,KIAH,KMCJ,72244012918,7224301..."
1,2010-01-02,53.7,40.1,46.3,53.7,34.9,43.1,30.9,56.9,0.0,...,156.6,13.4,6.0,07:17:13,17:34:07,0.57,Clear,Clear conditions throughout the day.,clear-day,"KHOU,72059400188,KIAH,KMCJ,72244012918,7224301..."
2,2010-01-03,47.4,41.1,44.8,46.5,33.8,40.8,35.6,70.2,0.0,...,80.2,6.7,3.0,07:17:25,17:34:50,0.6,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"KHOU,72059400188,KIAH,KMCJ,72244012918,7224301..."
3,2010-01-04,46.4,36.2,41.8,42.1,28.7,35.2,29.4,62.0,0.0,...,167.4,14.5,7.0,07:17:35,17:35:35,0.64,Partially cloudy,Clearing in the afternoon.,partly-cloudy-day,"KHOU,72059400188,KIAH,KMCJ,72244012918,7224301..."
4,2010-01-05,48.2,31.7,38.7,47.5,24.0,34.3,23.6,56.1,0.0,...,174.1,15.0,7.0,07:17:44,17:36:20,0.67,Clear,Clear conditions throughout the day.,clear-day,"KHOU,72059400188,KIAH,KMCJ,72244012918,7224301..."


In [21]:
# merge crime and weather data
# Align the key: same column name and datetime dtype on both sides.
table = table.rename(columns={'Occurrence Date': 'date'})
table['date'] = pd.to_datetime(table['date'])
# The left join keeps every crime day even when no weather record exists for it.
# validate='one_to_one' makes pandas raise a MergeError if a date occurs more than once on
# either side, instead of silently repeating days in the time series.
table = table.merge(weather_df, how='left', on='date', validate='one_to_one')

In [22]:
# Check the last rows of the merged table, including the weather columns.
table.tail()

Unnamed: 0,date,Kidnapping/Abduction,"Sex Offenses, Forcible",Robbery,Assault Offenses,Arson,Extortion/Blackmail,Burglary/Breaking & Entering,Larceny/Theft Offenses,Motor Vehicle Theft,...,solarradiation,solarenergy,uvindex,sunrise,sunset,moonphase,conditions,description,icon,stations
4955,2023-07-27,0,1,28,142,0,1,27,215,56,...,127.1,10.7,8.0,06:37:57,20:17:50,0.33,Clear,Clear conditions throughout the day.,clear-day,"KHOU,72059400188,KIAH,KMCJ,72244012918,7224301..."
4956,2023-07-28,3,2,24,158,0,0,43,205,69,...,168.0,14.5,9.0,06:38:33,20:17:12,0.36,"Rain, Partially cloudy",Partly cloudy throughout the day with late aft...,rain,"KHOU,72059400188,KIAH,KMCJ,72244012918,7224301..."
4957,2023-07-29,2,6,15,204,1,0,33,177,57,...,199.7,17.1,10.0,06:39:08,20:16:33,0.4,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"KHOU,72059400188,KIAH,KMCJ,72244012918,7224301..."
4958,2023-07-30,0,9,16,200,1,0,28,139,50,...,221.6,19.1,9.0,06:39:43,20:15:53,0.43,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"KHOU,72059400188,KIAH,KMCJ,72244012918,7224301..."
4959,2023-07-31,0,4,21,173,0,0,34,141,53,...,197.5,17.0,9.0,06:40:19,20:15:11,0.46,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"KHOU,72059400188,KIAH,KMCJ,72244012918,7224301..."


In [23]:
# List every column of the merged table.
table.columns

Index(['date', 'Kidnapping/Abduction', 'Sex Offenses, Forcible', 'Robbery',
       'Assault Offenses', 'Arson', 'Extortion/Blackmail',
       'Burglary/Breaking & Entering', 'Larceny/Theft Offenses',
       'Motor Vehicle Theft', 'Counterfeiting/Forgery', 'Fraud Offenses',
       'Embezzlement', 'Stolen Property Offenses',
       'Destruction/Damage/Vandalism of Property', 'Drug/Narcotic Offenses',
       'Sex Offenses, Nonforcible', 'Pornography/Obscene Material',
       'Gambling Offenses', 'Prostitution Offenses', 'Bribery',
       'Weapon Law Violations', 'Human Trafficking or Kidnapping/Abduction',
       'Animal Cruelty', 'Homicide Offenses', 'Bad Checks',
       'Curfew/Loitering/Vagrancy Violations', 'Disorderly Conduct',
       'Driving Under the Influence', 'Drunkenness',
       'Family Offenses, Nonviolent', 'Liquor Law Violations', 'Peeping Tom',
       'Runaway', 'Trespass of Real Property', 'All Other Offenses',
       'Offense Count', 'week', 'month', 'year', 'mon_year',

The following columns could be used as a **Label**:
       'Kidnapping/Abduction', 'Sex Offenses, Forcible', 'Robbery',
       'Assault Offenses', 'Arson', 'Extortion/Blackmail',
       'Burglary/Breaking & Entering', 'Larceny/Theft Offenses',
       'Motor Vehicle Theft', 'Counterfeiting/Forgery', 'Fraud Offenses',
       'Embezzlement', 'Stolen Property Offenses',
       'Destruction/Damage/Vandalism of Property', 'Drug/Narcotic Offenses',
       'Sex Offenses, Nonforcible', 'Pornography/Obscene Material',
       'Gambling Offenses', 'Prostitution Offenses', 'Bribery',
       'Weapon Law Violations', 'Human Trafficking or Kidnapping/Abduction',
       'Animal Cruelty', 'Homicide Offenses', 'Bad Checks',
       'Curfew/Loitering/Vagrancy Violations', 'Disorderly Conduct',
       'Driving Under the Influence', 'Drunkenness',
       'Family Offenses, Nonviolent', 'Liquor Law Violations', 'Peeping Tom',
       'Runaway', 'Trespass of Real Property', 'All Other Offenses',
       'Offense Count'
* **'Offense Count'** is the total number of offenses per day

* The other offense-category columns hold the number of offenses of that specific category per day

In [24]:
# Write the merged daily crime + weather table to a CSV file, without the row index.
output_file = "/content/drive/MyDrive/Omdena/Houston Local Chapter/Weather&Crime/Data/Cleaned data/daily crime numbers and weather data for time series analysis.csv"
table.to_csv(output_file, index=False)