In [1]:
import folium
import numpy as np
import pandas as pd
from folium.features import DivIcon
from folium.plugins import HeatMapWithTime, MarkerCluster
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

# County Data

The data in this section are aggregate records by county. Due to privacy concerns, the LA Times does not publish death counts by city, so deaths are only available at the county level. The table is re-downloaded from the LA Times repository on every kernel run, so it is always exactly as current as the LA Times records.

In [2]:
# LA Times county-level totals, downloaded from GitHub each time this cell runs.
COUNTY_TOTALS_URL = 'https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-county-totals.csv'
county = pd.read_csv(COUNTY_TOTALS_URL)

# Preview the first rows.
county.head()

Unnamed: 0,date,county,fips,confirmed_cases,deaths,new_confirmed_cases,new_deaths
0,2020-01-26,Alameda,1,0,0,,
1,2020-01-26,Calaveras,9,0,0,,
2,2020-01-26,Contra Costa,13,0,0,,
3,2020-01-26,Humboldt,23,0,0,,
4,2020-01-26,Los Angeles,37,1,0,,


In [3]:
# Parse the date strings so the most recent record per county can be found.
county['date'] = pd.to_datetime(county['date'])

# idxmax() returns index *labels*, so the rows must be selected with .loc.
# (Positional .iloc only gives the same rows while the frame keeps its
# default 0..n-1 index.)
latest_row_labels = county.groupby('county')['date'].idxmax()
grouped_by_county = county.loc[latest_row_labels].copy()

In [4]:
# Colour-code counties by confirmed case count:
# red above the threshold, teal otherwise.
CASE_THRESHOLD = 300
grouped_by_county['color'] = np.where(
    grouped_by_county['confirmed_cases'] > CASE_THRESHOLD, 'red', 'teal'
)

In [5]:
# Sanity check of the per-county snapshot: column dtypes first, then the frame itself.
print(grouped_by_county.dtypes)
display(grouped_by_county)

date                   datetime64[ns]
county                         object
fips                            int64
confirmed_cases                 int64
deaths                          int64
new_confirmed_cases           float64
new_deaths                    float64
color                          object
dtype: object


Unnamed: 0,date,county,fips,confirmed_cases,deaths,new_confirmed_cases,new_deaths,color
3616,2020-05-13,Alameda,1,2178,76,45.0,2.0,red
3617,2020-05-13,Alpine,3,1,0,0.0,0.0,teal
3618,2020-05-13,Amador,5,9,0,0.0,0.0,teal
3619,2020-05-13,Butte,7,20,0,0.0,0.0,teal
3620,2020-05-13,Calaveras,9,13,0,0.0,0.0,teal
3621,2020-05-13,Colusa,11,3,0,0.0,0.0,teal
3622,2020-05-13,Contra Costa,13,1080,33,14.0,1.0,red
3623,2020-05-13,Del Norte,15,4,0,1.0,0.0,teal
3624,2020-05-13,El Dorado,17,60,0,0.0,0.0,teal
3625,2020-05-13,Fresno,19,1014,13,30.0,3.0,red


In [6]:
# Look up a latitude/longitude for every county through Nominatim (OpenStreetMap).
geolocater = Nominatim(user_agent='jonathancheung12@yahoo.com')

# The public Nominatim server asks for at most one request per second;
# RateLimiter spaces the calls out and retries transient failures.
geocode = RateLimiter(geolocater.geocode, min_delay_seconds=1)

latitudes = []
longitudes = []
unresolved = []
for county_name in grouped_by_county['county']:
    location = geocode(county_name + ', California')
    if location is None:
        # No match (or the lookup failed after retries): keep the row but
        # mark the coordinates as missing instead of raising AttributeError.
        latitudes.append(np.nan)
        longitudes.append(np.nan)
        unresolved.append(county_name)
    else:
        latitudes.append(location.latitude)
        longitudes.append(location.longitude)

if unresolved:
    print(f'Could not geocode: {unresolved}')

grouped_by_county['latitude'] = np.array(latitudes)
grouped_by_county['longitude'] = np.array(longitudes)

In [7]:
def base_map(default_location = [36.7783, -119.4179], default_zoom = 6):
    """Return a fresh folium map with a scale bar.

    Parameters
    ----------
    default_location : list
        [latitude, longitude] the map is centred on.
    default_zoom : int
        Initial zoom level, forwarded to folium as ``zoom_start``.

    Returns
    -------
    folium.Map
        A new map object; nothing is added to it here.
    """
    map_options = {
        'location': default_location,
        'control_scale': True,
        'zoom_start': default_zoom,
    }
    return folium.Map(**map_options)

In [8]:
# California's coordinates: 36.7783 lat, -119.4179 long (the base_map default).
county_map = base_map()

clusters = MarkerCluster().add_to(county_map)

# Rows without coordinates cannot be drawn, so leave them out.
mappable_counties = grouped_by_county.dropna(subset=['latitude', 'longitude'])

# The loop variable is `county_name`, not `county`: reusing `county` would
# overwrite the county DataFrame and break any re-run of the earlier cells.
for county_name, color, lat, lon, death, cases in zip(mappable_counties['county'],
                                                      mappable_counties['color'],
                                                      mappable_counties['latitude'],
                                                      mappable_counties['longitude'],
                                                      mappable_counties['deaths'],
                                                      mappable_counties['confirmed_cases']):
    # One circle per county; clicking it shows the latest death and case totals.
    folium.CircleMarker(
        [lat, lon],
        radius = 10,
        fill = True,
        color = color,
        popup = f'{county_name} Deaths:{death} Cases:{cases}'
    ).add_to(clusters)

county_map

# City Data

In [9]:
# LA Times place-level (city / neighbourhood) totals, downloaded from GitHub on each run.
PLACE_TOTALS_URL = 'https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-totals.csv'
city = pd.read_csv(PLACE_TOTALS_URL)
city.head()

Unnamed: 0,date,county,fips,place,confirmed_cases,note,x,y
0,2020-03-16,Los Angeles,37,Alhambra,2,,-118.135504,34.083961
1,2020-03-16,Los Angeles,37,Arcadia,1,,-118.037297,34.134186
2,2020-03-16,Los Angeles,37,Beverly Hills,1,,-118.402109,34.078543
3,2020-03-16,Los Angeles,37,Boyle Heights,5,,-118.20533,34.03815
4,2020-03-16,Los Angeles,37,Carson,1,,-118.255878,33.837391


In [10]:
# Quick diagnostics: column dtypes, missing values per column, and overall shape,
# each separated by a blank line.
print(city.dtypes, city.isna().sum(), city.shape, sep='\n\n')

date                object
county              object
fips                 int64
place               object
confirmed_cases      int64
note                object
x                  float64
y                  float64
dtype: object

date                   0
county                 0
fips                   0
place                  0
confirmed_cases        0
note               28274
x                    676
y                    676
dtype: int64

(30606, 8)


In [11]:
# Places that have no coordinates attached. Many are catch-all buckets such as
# 'Unknown', 'Other' or 'Unincorporated'. Given the volume of remaining data they
# are assumed to be gaps from the surveying process and are dropped as MAR
# (missing at random).
missing_coords = city['x'].isna()
city.loc[missing_coords, 'place'].unique()

array(['Santa Clarita and Stevenson Ranch',
       'Smaller Los Angeles neighborhoods', 'Pico', 'Unknown', 'Other',
       'Other/Unknown', 'Outside of cities', 'Undetermined',
       'Other/Unspecified', 'County', 'Unincorporated', 'Known Homeless',
       'No Address (May Be Homeless)', 'Other/unknown', 'unincorporated',
       'Outside of Cities', 'Other/unknowns', 'Greater Porterville',
       'Greater Tulare', 'Greater Visalia', 'Mid', 'North',
       'Remainder of County', 'Covina (Charter Oak) 13 ( 98.9 )',
       'unincorporated / districts', 'District 1', 'District 2',
       'District 3', 'District 4', 'District 5', '94002: Belmont',
       '94005: Brisbane', '94010: Burlingame/Hillsborough',
       '94014: Daly City/Colma', '94015: Daly City',
       '94019: Half Moon Bay', '94020: La Honda', '94021: Loma Mar',
       '94025: Menlo Park', '94027: Menlo Park/Atherton',
       '94028: Menlo Park/Portola Valley', '94030: Millbrae',
       '94037: Montara', '94038: Moss Beach', 

In [12]:
# Keep only rows that have BOTH coordinates, and drop the free-text note column.
# errors='ignore' keeps this cell safe to re-run after `note` has already been
# removed (a second run would otherwise raise KeyError).
has_coordinates = city['x'].notna() & city['y'].notna()
city = city.loc[has_coordinates].drop(columns = 'note', errors = 'ignore').copy()

# Convert date column to datetime object
city['date'] = pd.to_datetime(city['date'])

In [13]:
# Build one heat-map frame per date, in chronological order.
# Each frame is a list of [y (latitude), x (longitude), summed confirmed_cases].
# A single groupby over 'date' replaces re-filtering the whole frame once per date.
city_list = [
    day_rows.groupby(['y', 'x'])['confirmed_cases'].sum().reset_index().values.tolist()
    for _, day_rows in city.groupby('date', sort = True)
]

In [28]:
city_map = base_map()

# Label every animation frame with its calendar date so the time slider shows
# dates instead of a bare counter. The labels are only passed when they line up
# one-to-one with the frames in city_list; otherwise folium's default is used.
frame_dates = sorted(pd.to_datetime(city['date']).dt.strftime('%Y-%m-%d').unique())
frame_labels = frame_dates if len(frame_dates) == len(city_list) else None

# NOTE(review): folium's documentation describes point weights as lying in
# (0, 1]; city_list carries raw case counts - confirm this renders as intended.
HeatMapWithTime(
    data = city_list,
    index = frame_labels,
    radius = 15,
    gradient = {0.1: 'blue', 0.2: 'lime', 0.6: 'orange', 1: 'red'},
    use_local_extrema = True,
    min_opacity = 0.2,
    max_opacity = 1,
    position = 'topright',
    auto_play = True
).add_to(city_map)


city_map

In [26]:
# Export the animated heat map as a standalone HTML page.
CITY_MAP_HTML = 'city_map.html'
city_map.save(CITY_MAP_HTML)

# Twitter Data

In [16]:
# Tweets with region tags and sentiment labels.
# NOTE(review): this path points at one user's desktop; it has to be adjusted
# before the notebook can run on another machine.
TWEETS_CSV = '~/OneDrive/Desktop/model_loc_sentiment.csv'
df = pd.read_csv(TWEETS_CSV)
df.head()

Unnamed: 0,tweet_id,text,region,tweettokens,processedtweet,sentiment,Predicted_x,Predicted_y
0,1246588222903214080,Shaukat Khanum Memorial Cancer Hospital offers...,LA,"['shaukat', 'khanum', 'memorial', 'cancer', 'h...",shaukat khanum memorial cancer hospital offer ...,0,0.0,
1,1246587645779574784,Church congregants insisting on attending serv...,LA,"['church', 'congregant', 'insist', 'attend', '...",church congregant insist attend service wake c...,0,,0.0
2,1246585369899892738,Rendering of a new 44-unit affordable housing ...,LA,"['render', 'new', 'unit', 'affordable', 'housi...",render new unit affordable housing project hav...,-1,-1.0,
3,1246584511732695040,"Double date, covid-style. pic.twitter.com/LWh2...",LA,"['double', 'date', 'covidstyle']",double date covidstyle,0,,0.0
4,1246584386583068672,Si tuvieran que sacrificar un pueblo para acab...,LA,"['si', 'tuvieran', 'que', 'sacrificar', 'un', ...",si tuvieran que sacrificar un pueblo para acab...,0,,0.0


In [17]:
# Number of tweets per region, largest first.
region_counts = df['region'].value_counts()
region_counts

LA             46666
SF             41709
SD             28348
SAC            18513
BAKERSFIELD     3861
REDDING          912
CHICO            579
Name: region, dtype: int64

In [18]:
# Tally tweets per (region, sentiment) pair.
# Naming the Series 'Count' directly works on every pandas version: since
# pandas 2.0 value_counts() names its result 'count', so renaming a column
# called 'sentiment' would silently do nothing and later ['Count'] lookups fail.
tweets_df = (
    df.groupby('region')['sentiment']
      .value_counts()
      .rename('Count')
      .reset_index()
)

tweets_df

Unnamed: 0,region,sentiment,Count
0,BAKERSFIELD,0,1529
1,BAKERSFIELD,-1,1408
2,BAKERSFIELD,1,924
3,CHICO,0,228
4,CHICO,-1,214
5,CHICO,1,137
6,LA,0,17182
7,LA,-1,15115
8,LA,1,14369
9,REDDING,-1,375


In [19]:
# (latitude, longitude) used to place each tweet region's label on the map.
# NOTE(review): the BAKERSFIELD, REDDING and CHICO points look closer to
# county centres than to the cities themselves - confirm this is intended.
coordinates = dict(
    LA=(34.053691, -118.242767),
    SF=(37.779026, -122.419906),
    SD=(32.717421, -117.162771),
    SAC=(38.581572, -121.4944),
    BAKERSFIELD=(35.31457, -118.753822),
    REDDING=(40.796512, -121.997919),
    CHICO=(39.651927, -121.585844),
)

In [20]:
# Distinct regions present in the sentiment tally, in order of first appearance.
pd.unique(tweets_df['region'])

array(['BAKERSFIELD', 'CHICO', 'LA', 'REDDING', 'SAC', 'SD', 'SF'],
      dtype=object)

In [21]:
# Percentage of each region's tweets that were labelled positive (sentiment == 1).
# Both operands are indexed by region, so the division aligns on the region name
# rather than on row position; a region with no positive tweets yields 0 instead
# of shifting every later region's value.
region_totals = tweets_df.groupby('region')['Count'].sum()
positive_counts = (
    tweets_df.loc[tweets_df['sentiment'] == 1]
             .set_index('region')['Count']
             .reindex(region_totals.index, fill_value = 0)
)
positive_pct = (
    (positive_counts / region_totals * 100)
    .round(2)
    .to_frame('Count')
    .rename_axis(None)  # keep the index unnamed, as before
)

# Attach the coordinates at which each region's label will be drawn.
positive_pct['latitude'] = positive_pct.index.map(lambda region: coordinates[region][0])
positive_pct['longitude'] = positive_pct.index.map(lambda region: coordinates[region][1])

In [22]:
# Positive-tweet percentage per region ('Count'), with the latitude/longitude used for its map label.
positive_pct

Unnamed: 0,Count,latitude,longitude
BAKERSFIELD,23.93,35.31457,-118.753822
CHICO,23.66,39.651927,-121.585844
LA,30.79,34.053691,-118.242767
REDDING,25.0,40.796512,-121.997919
SAC,30.05,38.581572,-121.4944
SD,26.46,32.717421,-117.162771
SF,30.58,37.779026,-122.419906


In [23]:
# Overlay each region's positive-tweet percentage on the county map as a bold text label.
label_rows = positive_pct[['latitude', 'longitude', 'Count']].itertuples(index = False, name = None)
for lat, lon, pct in label_rows:
    text_icon = DivIcon(
        icon_size = (1, 1),
        icon_anchor = (25, 10),
        html = f'<div style="font-size: 15pt; color : black; font-weight: bold">{pct}%</div>'
    )
    folium.Marker([lat, lon], icon = text_icon).add_to(county_map)

county_map

In [24]:
# Export the county map (with the sentiment labels added above) as a standalone HTML page.
COUNTY_MAP_HTML = 'county_map.html'
county_map.save(COUNTY_MAP_HTML)