In [2]:
import matplotlib.pyplot as plt
import plotly
from pathlib import Path
import sys
import os
import folium
from folium.plugins import HeatMap
from geopy.geocoders import Nominatim
# Put the parent of the current working directory on sys.path so the
# `database` import below can be resolved (presumably the package lives one
# level above the notebook folder -- confirm against the repo layout).
# The membership check keeps re-runs of this cell from appending the same
# entry again.
_project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if _project_root not in sys.path:
    sys.path.append(_project_root)

from database import database_manager as dbm

# Nominatim geocoder client. NOTE(review): not used by any cell visible in this
# part of the notebook.
geolocator = Nominatim(user_agent="geoapiExercises")

# Queries the database and returns every row of the requested table.
def get_sentiment_data(table):
    """Fetch all rows of ``table`` through ``dbm.query_db``.

    Despite its name the function is a generic table loader: it runs
    ``SELECT *`` against whichever table name it is given.

    Table names noted by the original author:
    ``raw_twitter_data``, ``processed_twitter_data``, ``sentiment_scores``.
    This notebook also loads ``tfidf_results_1`` and ``sentiment_scores_df``.

    Args:
        table: Name of the table to read. It is sent as a double-quoted SQL
            identifier, so it is matched exactly as written.

    Returns:
        Whatever ``dbm.query_db`` returns for the query (the notebook uses
        the result as a pandas DataFrame).

    Raises:
        ValueError: If ``table`` is not a non-empty string.
    """
    if not isinstance(table, str) or not table.strip():
        raise ValueError(f"table must be a non-empty string, got {table!r}")

    print(f"Retrieving {table} table...")

    # An identifier cannot be passed as a bound parameter, so it has to be
    # embedded in the statement. Doubling any embedded double quote keeps the
    # name a single quoted identifier instead of letting it terminate the
    # quoting early.
    quoted_table = '"' + table.replace('"', '""') + '"'
    query = f"SELECT * FROM {quoted_table}"
    return dbm.query_db(query)

In [3]:
# Load the tf-idf terms from the "tfidf_results_1" table.
# get_sentiment_data is a generic "SELECT * FROM <table>" loader despite its name.
tf_idf_df = get_sentiment_data("tfidf_results_1")

Retrieving tfidf_results_1 table...


In [4]:
# Preview the first rows (columns shown: tweet_id, term, tfidf_score, rank).
tf_idf_df.head()


Unnamed: 0,tweet_id,term,tfidf_score,rank
0,1,prime,0.500229,1
1,1,cost,0.478771,2
2,1,australia,0.430155,3
3,1,minister,0.420804,4
4,1,covid19,0.258287,5


In [18]:
# Keep only the terms whose tf-idf score is strictly above 0.1.
tf_idf_df = tf_idf_df.loc[tf_idf_df["tfidf_score"].gt(0.1)]

In [19]:
# Keep only the highest-scoring tf-idf term for each tweet (a tweet id can have
# multiple terms; only the top one is kept).
# The explicit sort removes the dependence on the row order the table was
# loaded in, and drop_duplicates keeps one complete row per tweet, whereas
# GroupBy.first() picks the first non-null value of each column independently.
# Rows without a tweet_id are dropped, as groupby() did by default.
tf_idf_df = (
    tf_idf_df
    .dropna(subset=["tweet_id"])
    .sort_values(["tweet_id", "tfidf_score"], ascending=[True, False])
    .drop_duplicates(subset="tweet_id", keep="first")
    .reset_index(drop=True)
)


In [20]:
# Sanity check: count rows whose tweet_id repeats an earlier row's tweet_id.
duplicate_mask = tf_idf_df.duplicated(subset=["tweet_id"])
print(f"Number of duplicate tweet ids: {duplicate_mask.sum()}")

Number of duplicate tweet ids: 0


In [30]:
# Load the sentiment scores from the "sentiment_scores_df" table.
df_sentiment = get_sentiment_data("sentiment_scores_df")

Retrieving sentiment_scores_df table...


In [None]:
# Preview the sentiment table as loaded, before any tf-idf columns are merged in.
df_sentiment.head()

In [None]:
# Attach each tweet's tf-idf row to its sentiment row
# (sentiment `id` matches tf-idf `tweet_id`; inner join).
# tf-idf columns already present on df_sentiment from a previous merge are
# dropped first, so re-running this cell does not produce *_x / *_y duplicate
# columns. Assumes the sentiment table has no columns of its own with those
# names -- TODO confirm.
_stale_cols = df_sentiment.columns.intersection(tf_idf_df.columns).difference(["id"])
df_sentiment = df_sentiment.drop(columns=_stale_cols).merge(
    tf_idf_df, left_on="id", right_on="tweet_id"
)

In [21]:
# Terms that should not be reported as a tweet's top term.
exclude_terms = ["covidvaccine", "covid19", "covid", "vaccine"]

# Keep every row whose 'term' is not exactly one of the excluded terms.
tf_idf_df = tf_idf_df.loc[lambda frame: ~frame["term"].isin(exclude_terms)]


In [22]:
# Take the 10 rows with the highest tf-idf score.
# NOTE(review): many rows share the maximum score (see the printed result), so
# which of the tied rows land in the top 10 depends on the sort's tie order.
top_terms = tf_idf_df.sort_values("tfidf_score", ascending=False).iloc[:10]

In [23]:
# Show the selected top-term rows as plain text.
print(top_terms)

       tweet_id       term  tfidf_score  rank
2620       2699     office          1.0     1
12812     13211    welcome          1.0     1
94526     97357      great          1.0     1
60965     62799      false          1.0     1
12843     13246      daily          1.0     1
83097     85548      check          1.0     1
49282     50801    awesome          1.0     1
60916     62747  hopefully          1.0     1
10608     10946     school          1.0     1
25924     26742       fast          1.0     1


In [24]:
# Filter out terms that contain "covidvaccine" or "covid19".
# The pattern is a regex alternation so both substrings are matched (the
# previous pattern only covered "covidvaccine"); na=False treats a missing term
# as "no match" so the boolean mask can be negated safely.
top_terms = top_terms[~top_terms["term"].str.contains("covidvaccine|covid19", na=False)]

In [11]:
# Print the selection again after the substring filter.
# NOTE(review): the stored output of this cell has an older execution count
# (In [11]) and shows a single 'trump' row, so it appears to predate the
# current pipeline -- re-run to refresh it.
print(top_terms)

        tweet_id   term  tfidf_score  rank
199966     47623  trump          1.0     1


In [None]:
# Only include the top tf-idf word per tweet.
# (No code in this cell: an earlier cell already reduces tf_idf_df to one term per tweet_id.)


In [27]:
# Attach the selected top terms to their sentiment rows
# (sentiment `id` matches top-terms `tweet_id`); the inner join keeps only the
# tweets that appear in top_terms.
# df_sentiment may already carry tf-idf columns from a previous merge (an
# earlier cell merges it with tf_idf_df, and this cell reassigns its own input),
# which would otherwise yield tweet_id_x / term_x / ... duplicates. Those
# columns are dropped first so the result is the same on every run.
_stale_cols = df_sentiment.columns.intersection(top_terms.columns).difference(["id"])
df_sentiment = df_sentiment.drop(columns=_stale_cols).merge(
    top_terms, left_on="id", right_on="tweet_id"
)

In [28]:
# Preview the merged frame: the sentiment columns followed by tweet_id, term,
# tfidf_score and rank from the top-terms table.
df_sentiment.head()

Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,...,is_retweet,compound,neg,neu,pos,tweet,tweet_id,term,tfidf_score,rank
0,2699,🐾 Catt Stone,"Californian, USA",Part-timer ▪ H2O lover ▪ Tech Editor ▪ Smile n...,2016-05-21,543,500,11598,False,2020-12-31,...,False,0.5983,0.136,0.509,0.356,I got my CovidVaccine! smiling face with smil...,2699,office,1.0,1
1,10946,Julia No,Ontario,"Love to laugh, my family, #books, #music, #out...",2021-01-23,30,227,200,False,2021-01-25,...,False,0.4404,0.0,0.508,0.492,That's good news CovidVaccine,10946,school,1.0,1
2,13211,National Law Review,"Chicago, IL & Denver, CO",Each month over 2 million legal & business pro...,2009-07-10,34557,3606,4274,True,2021-01-20,...,False,0.0,0.0,1.0,0.0,Government Lightens Enforcement of HIPAA Rules...,13211,welcome,1.0,1
3,13246,Page Lie,"San Francisco, CA","She/Her, Registered Nurse, Mom & Wife #Dissent...",2011-10-19,4540,4997,72186,False,2021-01-20,...,False,-0.5255,0.326,0.674,0.0,Hey UCSF this is terrible! COVIDVaccine NurseT...,13246,daily,1.0,1
4,26742,John Kanelis,"Princeton, Texas","Husband, father, grandfather, veteran, blogger...",2011-06-15,886,1187,870,False,2021-02-20,...,False,0.6369,0.0,0.625,0.375,I love being a statistic via jkanelis VA pand...,26742,fast,1.0,1
