In [1]:
import pandas as pd
import requests
import datetime
import plotly.graph_objects as go
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer
import warnings
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import re
import time
import json
import numpy as np
from bs4 import BeautifulSoup
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from bs4 import BeautifulSoup


warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
!pip install dash

In [None]:
def get_state_codes():

  url = 'https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States'
  response = requests.get(url)
  html_content = response.text

  soup = BeautifulSoup(html_content, 'html.parser')
  table = soup.find('table', {'class': 'wikitable'})
  data = []
  for row in table.find_all('tr')[1:]:  # Skipping the header row
      header_cell = row.find('th')
      first_data_cell = row.find('td')

      if header_cell and first_data_cell:
          state_link = header_cell.find('a')
          if state_link:
              state_name = state_link.text.strip()
              state_name = state_name.replace(' ', '')
          abbreviation = first_data_cell.text.strip()
          data.append([state_name, abbreviation])

  state_codes = pd.DataFrame(data, columns=['State', 'Abbreviation'])

  return state_codes

In [None]:
state_codes = get_state_codes()

In [None]:
def get_fips_code():

  url = "https://www.census.gov/library/reference/code-lists/ansi/ansi-codes-for-states.html"
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'html.parser')
  rows = soup.find_all('table')[0].find_all('tr')[1:]  # Assuming the first table is the one we need

  state_fips_codes = {}

  for row in rows:
      cols = row.find_all('td')
      state_name = cols[0].text.strip()
      state_name = state_name.replace(' ', '')
      fips_code = cols[1].text.strip()
      state_fips_codes[state_name] = fips_code

  state_fips_codes = pd.DataFrame(list(state_fips_codes.items()), columns=['State', 'FIPS Code'])

  return state_fips_codes

In [None]:
state_fips_codes = get_fips_code()

In [None]:
data = state_codes.merge(state_fips_codes[['State', 'FIPS Code']], on='State', how='left')

In [None]:
manual_mappings = {'Arizona': 'arizona', 'Alaska': 'alaska', 'Florida': 'florida',
                   'Illinois': 'illinois', 'Kansas': 'kansas', 'Maryland': 'maryland',
                   'Massachusetts': 'massachusetts', 'Minnesota': 'minnesota',
                   'Mississippi': 'mississippi', 'Missouri': 'missouri',
                   'Oklahoma': 'oklohoma', 'Oregon': 'oregon',
                   'Vermont': 'vermont', 'Wisconsin': 'wisconsin', 'Wyoming': 'wyoming'}

data['State'] = data['State'].apply(lambda x: manual_mappings[x] if x in manual_mappings else x)

In [None]:
'''
  fips_code : FIPS State Code
  get : Table name identified from (https://api.census.gov/data/2016/acs/acs1/variables.json)
  key : Fetch data for 'key' demography
'''

def fetch_data(fips_code, get, key=None):
  base_url = "https://api.census.gov/data/2019/acs/acs5"
  params = {
      'get': get,
      'for': f'state:{fips_code}'
  }
  response = requests.get(base_url, params=params)
  time.sleep(1)

  if response.status_code == 200:
      data = response.json()
      if(key == 'income'):
        return int(data[1][0])
      elif(key == 'young_population'):
        return sum(map(int, data[1][1:-1]))
      else:
        return sum(map(int, data[1][:-1]))
  else:
      return None

In [None]:
def plot(state_codes, data, title, colorbar_title):
  fig = go.Figure(data=go.Choropleth(
      locations=state_codes,
      z=data.astype(float),
      locationmode='USA-states',
      colorscale='Greens',
      colorbar_title=colorbar_title,
  ))

  fig.update_layout(
      title_text=title,
      geo_scope='usa',
  )

  fig.show()

In [None]:
data['young_population'] = data['FIPS Code'].apply(lambda x: fetch_data(x, 'B01001_006E,B01001_007E,B01001_008E,B01001_009E,B01001_010E,B01001_011E,B01001_030E,B01001_031E,B01001_032E,B01001_033E,B01001_034E,B01001_035E', key='young_population'))
data['income'] = data['FIPS Code'].apply(lambda x: fetch_data(x, 'B19013_001E', key='income'))
data['education'] = data['FIPS Code'].apply(lambda x: fetch_data(x, 'B15003_022E,B15003_023E,B15003_024E,B15003_025E'))
data['adult_population'] = data['FIPS Code'].apply(lambda x: fetch_data(x, 'B01001_011E,B01001_012E,B01001_013E,B01001_014E,B01001_015E,B01001_016E,B01001_017E,B01001_018E,B01001_019E'))
data['male_population'] = data['FIPS Code'].apply(lambda x: fetch_data(x, 'B01001_002E'))
data['female_population'] = data['FIPS Code'].apply(lambda x: fetch_data(x, 'B01001_026E'))

In [None]:
data

Unnamed: 0,State,Abbreviation,FIPS Code,young_population,income,education,adult_population,male_population,female_population
0,Alabama,AL,1,885031,50536,845772,1218746,2359355,2516895
1,alaska,AK,2,148171,77640,142019,207790,384915,352153
2,arizona,AZ,4,1320500,58945,1394526,1760691,3504509,3545790
3,Arkansas,AR,5,543996,47597,463236,744414,1471760,1527610
4,California,CA,6,7615269,75235,8980726,10527413,19526298,19757199
5,Colorado,CO,8,1066669,72331,1565134,1550686,2823201,2787148
6,Connecticut,CT,9,637965,78444,975465,923931,1744245,1830829
7,Delaware,DE,10,167750,68287,214138,239635,462890,494358
8,florida,FL,12,3504562,55660,4471701,5288301,10220813,10680823
9,Georgia,GA,13,1967121,58700,2157616,2658195,5062096,5341751


In [None]:
data.to_csv('demography_data.csv', index=False)
demography = pd.read_csv('demography.csv')

In [None]:
states = ['arizona', 'alaska', 'Alabama', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'florida', 'Georgia', 'Hawaii', 'Idaho', 'illinois', 'Indiana', 'Iowa', 'kansas', 'Kentucky', 'Louisiana', 'Maine', 'maryland', 'massachusetts', 'Michigan', 'minnesota', 'mississippi', 'missouri', 'Montana', 'Nebraska', 'Nevada', 'newhampshire', 'newjersey', 'NewMexico', 'newyork', 'NorthCarolina', 'northdakota', 'Ohio', 'oklahoma', 'oregon', 'Pennsylvania', 'RhodeIsland', 'southcarolina', 'SouthDakota', 'Tennessee', 'Texas', 'Utah', 'vermont', 'Virginia', 'Washington', 'WestVirginia', 'wisconsin', 'wyoming']
topics = ['Trump economy', 'Trump COVID', 'Trump BLM', 'Trump Healthcare', 'Trump immigrant', 'lgbtq', 'Trump abortion']
model_name = 'cardiffnlp/twitter-roberta-base-sentiment'
sentiment_pipeline = pipeline('sentiment-analysis',  model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

In [None]:
def clean_comment(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove user mentions
    text = re.sub(r'@\w+', '', text)

    # Remove hashtags (you can decide if you want to keep the text without the '#' symbol)
    text = re.sub(r'#', '', text)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove special characters
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d', '', text)

    return text


In [None]:
def getPullPushData(query, after, before, sub):
    url = 'https://api.pullpush.io/reddit/search/comment/?q='+str(query)+'&size=1000&after='+str(after)+'&before='+str(before)+'&subreddit='+str(sub)
    r = requests.get(url)
    data = json.loads(r.text)
    time.sleep(1)
    return data['data']

In [None]:
def collect_and_analyze_data(state, topic):
    initial = len(comments)
    ## before and after are selected according to Trump's Tenure
    after = "1478592000"  # Nov 08, 2016
    before = "1604390400"  # Nov 03, 2020
    data = getPullPushData(topic, after, before, state)

    while len(data) > 0:
        for submission in data:
            try:
                comment = submission['body']
                comment = clean_comment(comment)

                if 'score' in submission:
                  ups = abs(submission['score'])
                else:
                  ups = 1

                tokens = tokenizer.encode(comment, add_special_tokens=True, truncation=True, max_length=1024)
                if len(tokens) > 512:
                    try:
                        summary_result = summarizer(comment, max_length=200, min_length=30, do_sample=False)
                        if summary_result and 'summary_text' in summary_result[0]:
                            summary = summary_result[0]['summary_text']
                        else:
                            print("No summary was returned for a comment, using original comment as summary.")
                            summary = comment
                    except Exception as e:
                        print(f"Error in summarization: {e}")
                        summary = comment
                else:
                    summary = comment

                sentiment_result = sentiment_pipeline(summary)[0]
                if(sentiment_result['label'] == 'LABEL_0'):
                    sentiment_score = -sentiment_result['score']
                else:
                    sentiment_score = sentiment_result['score']

                comments.append({
                    'state': state,
                    'topic': topic,
                    'comment': comment,
                    'sentiment_score': sentiment_score,
                    'score': ups
                })
            except Exception as e:
                print(f"An error occurred during processing: {e}")

        if data:
            before = data[-1]['created_utc']
            data = getPullPushData(topic, after, before, state)

    final = len(comments)
    print(f"Total comments for {state}: ", final - initial)

    return comments

In [None]:
def weighted_avg_and_count(group):
    if group['score'].sum() == 0:  # Avoid division by zero
        weighted_avg = group['sentiment_score'].mean()
    else:
        weighted_avg = np.average(group['sentiment_score'], weights=group['score'])
    count = group['comment'].count()
    return pd.Series([weighted_avg, count], index=['weighted_sentiment_score', 'comment_count'])


In [None]:
# Iterate over states and topics
for topic in topics:
  comments = []
  for state in tqdm(states):
      comments = collect_and_analyze_data(state, topic)
  comments_df = pd.DataFrame(comments)
  state_sentiments = comments_df.groupby('state').apply(weighted_avg_and_count).reset_index()
  topic = topic.replace(' ', '_')
  state_sentiments.to_csv(f'{topic}.csv', index=False)

In [None]:
abortion = pd.read_csv('state_sentiments_abortion.csv')
blm = pd.read_csv('state_sentiments_blm.csv')
covid = pd.read_csv('state_sentiments_covid.csv')
economy = pd.read_csv('state_sentiments_economy.csv')
healthcare = pd.read_csv('state_sentiments_healthcare.csv')
immigrant = pd.read_csv('state_sentiments_immigrant.csv')
lgbtq = pd.read_csv('state_sentiments_lgbtq.csv')

In [None]:
abortion.rename(columns={'weighted_sentiment_score': 'abortion_sentiment'}, inplace=True)
blm.rename(columns={'weighted_sentiment_score': 'blm_sentiment'}, inplace=True)
covid.rename(columns={'weighted_sentiment_score': 'covid_sentiment'}, inplace=True)
economy.rename(columns={'weighted_sentiment_score': 'economy_sentiment'}, inplace=True)
healthcare.rename(columns={'weighted_sentiment_score': 'healthcare_sentiment'}, inplace=True)
immigrant.rename(columns={'weighted_sentiment_score': 'immigrant_sentiment'}, inplace=True)
lgbtq.rename(columns={'weighted_sentiment_score': 'lgbtq_sentiment'}, inplace=True)

In [None]:
# Start by merging the first two DataFrames
combined_df = pd.merge(abortion[['state', 'abortion_sentiment']],
                       blm[['state', 'blm_sentiment']],
                       on='state', how='outer')

combined_df = pd.merge(combined_df, covid[['state', 'covid_sentiment']], on='state', how='outer')
combined_df = pd.merge(combined_df, economy[['state', 'economy_sentiment']], on='state', how='outer')
combined_df = pd.merge(combined_df, healthcare[['state', 'healthcare_sentiment']], on='state', how='outer')
combined_df = pd.merge(combined_df, immigrant[['state', 'immigrant_sentiment']], on='state', how='outer')
combined_df = pd.merge(combined_df, lgbtq[['state', 'lgbtq_sentiment']], on='state', how='outer')

In [None]:
combined_df.to_csv('sentiment_scores.csv')

In [None]:
sentiment_scores = pd.read_csv('sentiment_scores.csv', index_col=0)

In [27]:
demography_data = pd.read_csv('demography_data.csv', index_col=0)

In [28]:
demography_data


Unnamed: 0_level_0,Abbreviation,FIPS Code,young_population,income,education,adult_population,male_population,female_population
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alabama,AL,1,885031,50536,845772,1218746,2359355,2516895
alaska,AK,2,148171,77640,142019,207790,384915,352153
arizona,AZ,4,1320500,58945,1394526,1760691,3504509,3545790
Arkansas,AR,5,543996,47597,463236,744414,1471760,1527610
California,CA,6,7615269,75235,8980726,10527413,19526298,19757199
Colorado,CO,8,1066669,72331,1565134,1550686,2823201,2787148
Connecticut,CT,9,637965,78444,975465,923931,1744245,1830829
Delaware,DE,10,167750,68287,214138,239635,462890,494358
florida,FL,12,3504562,55660,4471701,5288301,10220813,10680823
Georgia,GA,13,1967121,58700,2157616,2658195,5062096,5341751


In [None]:
state_to_abbreviation = demography_data.set_index('State')['Abbreviation'].to_dict()
state_to_abbreviation = {key.lower(): value for key, value in state_to_abbreviation.items()}
sentiment_scores['state'] = sentiment_scores['state'].apply(lambda x: x.lower())
sentiment_scores['Abbreviation'] = sentiment_scores['state'].map(state_to_abbreviation)
sentiment_scores.to_csv("sentiment_scores.csv")
sentiment_scores = pd.read_csv("sentiment_scores.csv", index_col=0)

In [None]:
sentiment_scores

Unnamed: 0_level_0,abortion_sentiment,blm_sentiment,covid_sentiment,economy_sentiment,healthcare_sentiment,immigrant_sentiment,lgbtq_sentiment,Abbreviation
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alabama,-0.089307,-0.408708,-0.51699,-0.397384,-0.490296,-0.61993,-0.193995,AL
arkansas,-0.78552,-0.732035,-0.59488,-0.654507,-0.600109,-0.342295,-0.158848,AR
california,-0.30262,-0.503194,-0.001588,-0.086244,-0.374565,-0.487916,0.022265,CA
colorado,-0.276622,-0.799052,-0.66609,-0.615214,-0.608577,-0.568149,0.113118,CO
connecticut,-0.54844,-0.692,-0.64345,-0.265361,-0.225542,-0.645028,-0.388699,CT
delaware,-0.821125,0.630381,-0.66597,-0.350349,-0.578338,-0.735475,0.142113,DE
georgia,-0.680708,-0.748684,-0.512972,-0.463411,-0.634951,-0.475869,-0.234335,GA
hawaii,-0.7146,-0.614325,-0.638937,-0.389114,-0.737807,-0.494126,0.072662,HI
idaho,-0.32401,-0.298227,-0.607888,-0.578603,-0.601268,-0.395996,-0.566649,ID
indiana,-0.582315,-0.374484,-0.532429,-0.216331,-0.429129,-0.490451,-0.385232,IN


In [26]:
"""

  Access our app on thish site : https://data-and-web-technologies-for-data.onrender.com/

"""

demography_data = pd.read_csv('demography_data.csv', index_col=0)
sentiment_scores = pd.read_csv('sentiment_scores.csv', index_col=0)

demography_data = demography_data.sort_values(by='Abbreviation').reset_index(drop=True)
sentiment_scores = sentiment_scores.sort_values(by='Abbreviation').reset_index(drop=True)

app = dash.Dash(__name__)

app.layout = html.Div(style={'backgroundColor': '#f5f5f5'},  # Common background color
                      children=[
    html.H1('State wise correlation between various national topics and their respective sentiment scores', style={'textAlign': 'center', 'color': '#333'}),
    html.Div([
        html.Div([
            dcc.Dropdown(
                id='demographic-dropdown',
                options=[{'label': col, 'value': col} for col in demography_data.columns if col not in ['State', 'Abbreviation', 'FIPS Code']],
                value='income'
            ),
        ], style={'width': '45%', 'display': 'inline-block', 'padding': '10px'}),

        html.Div([
            dcc.Dropdown(
                id='topic-dropdown',
                options=[{'label': col, 'value': col} for col in sentiment_scores.columns if col not in ['state', 'Abbreviation']],
                value='abortion_sentiment'
            ),
        ], style={'width': '45%', 'float': 'right', 'display': 'inline-block', 'padding': '10px'}),
    ]),
    html.Div([
        dcc.Graph(id='demographic-map', style={'height': '50vh'}),
    ], style={'width': '45%', 'display': 'inline-block', 'padding': '10px'}),

    html.Div([
        dcc.Graph(id='sentiment-map', style={'height': '50vh'}),
    ], style={'width': '50%', 'float': 'right', 'display': 'inline-block', 'padding': '10px'}),

    html.Div([
        dcc.Graph(id='scatter-plot', style={'height': '60vh'}),
    ], style={'width': '90%', 'display': 'block', 'marginLeft': 'auto', 'marginRight': 'auto', 'padding': '10px 20px'}),

])

@app.callback(
    Output('demographic-map', 'figure'),
    Input('demographic-dropdown', 'value')
)
def update_demographic_map(selected_demographic):
    fig = px.choropleth(
        demography_data,
        locations='Abbreviation',
        locationmode="USA-states",
        color=selected_demographic,
        scope="usa",
        color_continuous_scale='Blues'
    )
    return fig

@app.callback(
    Output('sentiment-map', 'figure'),
    Input('topic-dropdown', 'value')
)
def update_sentiment_map(selected_topic):
    fig = px.choropleth(
        sentiment_scores,
        locations='Abbreviation',
        locationmode="USA-states",
        color=selected_topic,
        scope="usa",
        color_continuous_scale='RdBu',
        range_color=[-1, 1]
    )
    return fig

@app.callback(
    Output('scatter-plot', 'figure'),
    [Input('demographic-dropdown', 'value'),
     Input('topic-dropdown', 'value')]
)
def update_scatter_plot(selected_demographic, selected_topic):
    X = np.array(demography_data[selected_demographic].values).reshape(-1, 1)
    y = np.array(sentiment_scores[selected_topic].values)

    reg = LinearRegression().fit(X, y)

    line_X = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
    line_y = reg.predict(line_X)

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=X.ravel(), y=y, mode='markers', name='Data'))
    fig.add_trace(go.Scatter(x=line_X.ravel(), y=line_y, mode='lines', name='Regression Line'))

    return fig

server = app.server

if __name__ == '__main__':
    app.run_server(debug=True)


<IPython.core.display.Javascript object>

In [None]:
# Plotting
fig = go.Figure(data=go.Choropleth(
    locations=sentiment_scores['Abbreviation'],
    z=sentiment_scores['lgbtq_sentiment'].astype(float),
    locationmode='USA-states',
    colorscale='RdBu',
    colorbar_title="Sentiment Score",
    zmin=-1,
    zmax=1,
))

fig.update_layout(
    title_text='State-wise Sentiment on Various Topics',
    geo_scope='usa',
)

fig.show()