In [1]:
#!pip install plotly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import plotly

import plotly.offline as py
import plotly.graph_objs as go
import plotly.express as px
from IPython.display import Image

In [80]:
import os
if not os.path.exists('images'):
    os.mkdir('images')
    
import plotly.io as pio

### Data Cleaning###

In [2]:
#shorten column names
def cleanData(infile_name):
    df = pd.read_csv(infile_name)
    
    #keeps only the columns we need
    keep_cols = df[['url','published', 'sentiment', 'content','extra_author_attributes.world_data.longitude',
                'extra_author_attributes.world_data.latitude','extra_author_attributes.short_name','extra_author_attributes.world_data.region',
                'extra_author_attributes.world_data.city', 'reach']]
    new_df = keep_cols.copy()
    
    #shortens column names
    new_df.rename(columns = {'extra_author_attributes.world_data.longitude': 'long'},inplace = True) 
    new_df.rename(columns = {'extra_author_attributes.world_data.latitude': 'lat'},inplace = True) 
    new_df.rename(columns = {'extra_author_attributes.short_name': 'name'},inplace = True) 
    new_df.rename(columns = {'extra_author_attributes.world_data.region': 'state'},inplace = True) 
    new_df.rename(columns = {'extra_author_attributes.world_data.city': 'city'},inplace = True) 

    #breaks published date into date, time, am/pm columns instead of all in one
    new_df[['date','time','ampm']] = new_df.published.str.split(expand=True) 
    

    return new_df

In [3]:
breyers = cleanData('breyers.csv')
bnj = cleanData('benandjerrys.csv')
tal = cleanData('talenti.csv')
hd = cleanData('haagendazs.csv')

In [19]:
breyers['brand']='breyers'
bnj['brand']='bnj'
tal['brand']='tal'
hd['brand']='hd'

In [None]:
#merge 4 brands together
all4=pd.concat([breyers,bnj,tal,hd], axis=0)
all4.tail()

In [None]:
#check missing values

all4.isnull().sum()
len(all4.state)

In [41]:
#check all unique values for the four states
all4.state.unique()

array(['Texas', 'Washington, D.C.', 'West Virginia', 'Rhode Island',
       'Illinois', 'Ohio', 'Arizona', 'Florida', 'California',
       'Pennsylvania', 'Virginia', 'Georgia (U.S. state)', 'Utah',
       'Colorado', 'Massachusetts', 'Hawaii', 'Michigan',
       'North Carolina', 'New York', 'Indiana', 'Missouri',
       'South Carolina', 'Nevada', 'Tennessee', 'New Jersey', 'Maryland',
       'Connecticut', 'Oklahoma', 'Kentucky', 'Delaware', 'New Mexico',
       'Idaho', 'Kansas', 'Iowa', 'Oregon', 'Washington (state)', 'Maine',
       'Alabama', 'Wisconsin', 'Louisiana', 'Minnesota', 'Mississippi',
       'New Hampshire', 'Vermont', 'Alaska', 'Arkansas', 'Nebraska',
       'North Dakota', 'South Dakota', 'Montana', 'Wyoming'], dtype=object)

convert state to its two letter abbreviation

In [22]:
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Washington, D.C.': 'DC',
    'Florida': 'FL',
    'Georgia (U.S. state)': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington (state)': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
}

In [37]:
all4['State_abb']= None
for i in range(len(all4.state)):
    a=str(all4['state'].iloc[i])
    all4['State_abb'].iloc[i]= us_state_abbrev[a]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [125]:
#convert column date from string type to datetime
def str2time(data):
    for i in range(len(data["date"])):
        data["date"].iloc[i]=datetime.strptime(data["date"].iloc[i], '%m/%d/%y')

        
str2time(all4)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [126]:
all4.head()

Unnamed: 0,url,published,sentiment,content,long,lat,name,state,city,reach,date,time,ampm,brand,State_abb
0,http://twitter.com/genericancitizn/status/1289...,07/31/20 2:01:38 PM,-5,@formerlydoc Not some of that fake ass ice cre...,-97.739868,30.264587,genericancitizn,Texas,"Austin, Texas",433,2020-07-31 00:00:00,2:01:38,PM,breyers,TX
1,http://twitter.com/mason_missy/status/12565793...,05/02/20 6:40:38 AM,5,Happy birthday to my dear friend Sarah Bryers!...,-77.008667,38.899841,mason_missy,"Washington, D.C.","Washington, D.C.",522,2020-05-02 00:00:00,6:40:38,AM,breyers,DC
2,http://twitter.com/Corrilo_/status/12259531532...,02/07/20 5:23:03 PM,5,"@Breyers hey, I love your vanilla, but I wante...",-77.008667,38.899841,Corrilo_,"Washington, D.C.","Washington, D.C.",17,2020-02-07 00:00:00,5:23:03,PM,breyers,DC
3,http://twitter.com/FutrueMrsDavis/status/12468...,04/05/20 12:14:14 PM,5,📷 squish-this: luidilovins: i-am-the-narwhal: ...,-81.710815,38.350525,FutrueMrsDavis,West Virginia,"South Charleston, West Virginia",3,2020-04-05 00:00:00,12:14:14,PM,breyers,WV
4,http://twitter.com/connor_dushane/status/12652...,05/26/20 6:49:08 AM,0,@pancakelover_69 CEO of @Breyers https://t.co/...,-77.008667,38.899841,connor_dushane,"Washington, D.C.","Washington, D.C.",134,2020-05-26 00:00:00,6:49:08,AM,breyers,DC


In [129]:
all4.isnull().sum()

all4pre=all4[all4.date<=].groupby(['State_abb']).mean()
all4['date'].max()

datetime.datetime(2020, 11, 9, 0, 0)

### Geospatial analysis###
1. Distribution of the number of tweets by state, four brands together
2. Pre/post covid time comparison: Average sentiment score by state for all of the four brands
3. Average sentiment score by state for each of the four brands

In [183]:

#Distribution of the number of tweets 

map_data = [ go.Scattergeo(
        lon = list(all4['long']),
        lat = list(all4['lat']),
        mode='markers',
        hovertext=list(all4['state']),
        marker=dict(
            color = "rgb(138, 219, 203)",
            size=8,
            opacity=0.8,
            line = dict(
                color = "rgb(28, 71, 62)",
                width = 0.2   
            )
        )    
    )]

map_layout = go.Layout(
        title = 'Distribution of the number of tweets for all four brands',
        geo = dict(
            scope = 'usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = "rgb(250, 250, 250)",
            subunitcolor = "rgb(217, 217, 217)",
            countrycolor = "rgb(217, 217, 217)",
            countrywidth = 0.5,
            subunitwidth = 0.5            
        )
    )

map_object = dict(data=map_data, layout=map_layout)
pio.show(map_object)

### Pre/post covid time comparison ###
I chose the date when the first case occurred in the United States as the dividing point and split the dataset into pre-covid part and post-covid part. 

In [143]:
a='2020-01-21' #the time when the first case occurred in the United States
b='2020-03-17' #the time when all of the 50 states had confirmed cases

first=datetime.strptime(a,'%Y-%m-%d')
first

allcases=datetime.strptime(b,'%Y-%m-%d')

(all4['date']>allcases).sum()

22742

In [176]:
all4_pre=all4[all4['date']<=first].groupby(['State_abb']).mean()
all4_pre['State_abb']=all4_pre.index

In [178]:
all4_post=all4[all4['date']>first].groupby(['State_abb']).mean()
all4_post['State_abb']=all4_post.index

In [185]:
#pre-covid
all4_pre_data = dict (
    type = 'choropleth',
    locations =all4_pre['State_abb'],
    locationmode='USA-states',
    colorscale = 'blues',
    z=all4_pre['sentiment'])


lyt = dict(geo=dict(scope='usa'),
           title = 'Sentiment Scores of Four Brands Before Covid')
all4_map = go.Figure(data=[all4_pre_data], layout = lyt)
pio.show(all4_map)

In [180]:
#post-covid
all4_post_data = dict (
    type = 'choropleth',
    locations =all4_post['State_abb'],
    locationmode='USA-states',
    colorscale = 'blues',
    z=all4_post['sentiment'])


lyt = dict(geo=dict(scope='usa'),
           title = 'Sentiment Scores of Four Brands During Covid Time')
all4_map = go.Figure(data=[all4_post_data], layout = lyt)
pio.show(all4_map)

### print some tweets during covid time ###

In [192]:
pd.set_option('max_colwidth',500)
print(all4[all4['date']>first][['content','sentiment']].iloc[[20,6004,4000]])

                                                                                                                                                                                                                                                             content  \
33                                                                                                                                                                             Dear @Breyers, I am in quarantine. Are we serious right now?? https://t.co/g3Xlj6IocD   
5437                                                                                                                                               Ben & Jerry's postpones 'Free Cone Day' over coronavirus concerns https://t.co/c8KpyOmcsY https://t.co/QeuYVvhhjF   
3256  "Why don't these unruly red state peasants just open their ten-thousand dollar freezer and pull out another pint of Ben & Jerry's as they ignore small businesses struggling to survive, the way people I 

### Create choropleth for four brands sparately ###

In [196]:
# calculate mean sentiment score for each state
hd_agg=all4[all4.brand=="hd"].groupby(['State_abb']).mean()
hd_agg['State_abb']=hd_agg.index

bnj_agg=all4[all4.brand=="bnj"].groupby(['State_abb']).mean()
bnj_agg['State_abb']=bnj_agg.index

tal_agg=all4[all4.brand=="tal"].groupby(['State_abb']).mean()
tal_agg['State_abb']=tal_agg.index

breyers_agg=all4[all4.brand=="breyers"].groupby(['State_abb']).mean()
breyers_agg['State_abb']=breyers_agg.index


all4_agg=all4.groupby(['State_abb']).mean()
all4_agg['State_abb']=all4_agg.index

In [42]:
#all4[all4.brand=="hd"].head()

Unnamed: 0,url,published,sentiment,content,long,lat,name,state,city,reach,date,time,ampm,brand,State_abb
0,http://twitter.com/bradenelee/status/125348681...,04/23/20 5:52:00 PM,0,almonds and Häagen-Dazs,-77.008667,38.899841,bradenelee,"Washington, D.C.","Washington, D.C.",370,04/23/20,5:52:00,PM,hd,DC
1,http://twitter.com/KennedyRalphie/status/12442...,03/29/20 9:15:38 AM,0,That Haagen-dazs with the crumbs of digestive ...,-77.008667,38.899841,KennedyRalphie,"Washington, D.C.","Washington, D.C.",1139,03/29/20,9:15:38,AM,hd,DC
2,http://twitter.com/Sanders509/status/125449702...,04/26/20 12:46:14 PM,0,@Steph_Jerryy Too sweet for me. Chocolate cake...,-121.470337,38.575745,Sanders509,California,"Sacramento, California",218,04/26/20,12:46:14,PM,hd,CA
3,http://twitter.com/SashaEats/status/1279033240...,07/03/20 5:44:22 AM,0,@tangoandrakija @HaagenDazs_US @DivaFoodies @C...,-84.402466,33.829651,SashaEats,Georgia (U.S. state),Atlanta,14373,07/03/20,5:44:22,AM,hd,GA
4,http://twitter.com/SparklingBlue/status/130140...,09/02/20 11:13:55 PM,5,And it’s freaking Haagen Dazs too. This is not...,-77.008667,38.899841,SparklingBlue,"Washington, D.C.","Washington, D.C.",239,09/02/20,11:13:55,PM,hd,DC


In [197]:
all4_data = dict (
    type = 'choropleth',
    locations =all4_agg['State_abb'],
    locationmode='USA-states',
    colorscale = 'blues',
    z=all4_agg['sentiment'])


lyt = dict(geo=dict(scope='usa'),
           title = 'Sentiment Scores of Four Brands',)
all4_map = go.Figure(data=[all4_data], layout = lyt)
pio.show(all4_map)

In [198]:
hd_data = dict (
    type = 'choropleth',
    locations =hd_agg['State_abb'],
    locationmode='USA-states',
    colorscale = 'blues',
    z=hd_agg['sentiment'])


lyt = dict(geo=dict(scope='usa'),
           title = 'Sentiment Scores of Haagen-Dazs',)
hd_map = go.Figure(data=[hd_data], layout = lyt)
pio.show(hd_map)
#py.offline.plot(hd_map)

In [199]:
bnj_data = dict (
    type = 'choropleth',
    locations =bnj_agg['State_abb'],
    locationmode='USA-states',
    colorscale = 'blues',
    z=bnj_agg['sentiment'])


lyt = dict(geo=dict(scope='usa'),
           title = 'Sentiment Scores of Ben & Jerry',)
bnj_map = go.Figure(data=[bnj_data], layout = lyt)
pio.show(bnj_map)

In [200]:
tal_data = dict (
    type = 'choropleth',
    locations =tal_agg['State_abb'],
    locationmode='USA-states',
    colorscale = 'blues',
    z=tal_agg['sentiment'])


lyt = dict(geo=dict(scope='usa'),
           title = 'Sentiment scores of Talenti',)
tal_map = go.Figure(data=[tal_data], layout = lyt)
pio.show(tal_map)

In [201]:
breyers_data = dict (
    type = 'choropleth',
    locations =breyers_agg['State_abb'],
    locationmode='USA-states',
    colorscale = 'blues',
    z=breyers_agg['sentiment'])


lyt = dict(geo=dict(scope='usa'),
           title = 'Sentiment scores of Breyers',)
breyers_map = go.Figure(data=[breyers_data], layout = lyt)
pio.show(breyers_map)

In [110]:
'''
#Density plot
fig = px.density_mapbox(all4, lat='lat', lon='long', z='sentiment', radius=5,
                        opacity=0.9,
                        center=dict(lat=0, lon=180), zoom=2,
                        mapbox_style="carto-positron",
                       color_continuous_scale='blues')
fig.show()
'''

'\nfig = px.density_mapbox(all4, lat=\'lat\', lon=\'long\', z=\'sentiment\', radius=5,\n                        opacity=0.9,\n                        center=dict(lat=0, lon=180), zoom=2,\n                        mapbox_style="carto-positron",\n                       color_continuous_scale=\'blues\')\nfig.show()\n'