## Background

The outbreak of the pandemic has truly changed many people's lives. With the loss of almost 7 million lives worldwide, many people have exhibited behaviors associated with the psychological shadow it left behind. Thus, the demand for psychological services is likely to increase quickly as people seek help in easing their emotions.

Our project could be used by psychologists, both therapists and researchers, when:

- Studying psychological reactions to the pandemic: Researchers can analyze tweets to understand people's emotional reactions to the pandemic, such as fear, anxiety, stress, and sadness. They can also examine how people cope with the pandemic and how they adapt to new circumstances. By studying the language people use in their tweets, researchers can gain insight into their psychological state and experiences.

- Understanding social dynamics during the pandemic: Researchers can use tweets to examine how people interact with each other during the pandemic. For example, they can analyze how people express social support or criticize others for not following public health guidelines. They can also investigate how social norms change during the pandemic and how they influence people's behavior.

- Identifying risk factors for mental health problems: Researchers can analyze tweets to identify risk factors for mental health problems during the pandemic. For example, they can look for patterns of language use that suggest loneliness, social isolation, or depression. They can also examine how people talk about their coping strategies and identify which strategies are associated with better mental health outcomes.



## Demo

In [12]:
import ipywidgets as widgets

import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display, clear_output
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import datetime

import pandas as pd
import ipywidgets as widgets
from IPython.display import display

from IPython.display import display
from ipywidgets import (interact, Dropdown, SelectMultiple,
                        IntSlider, ToggleButtons, DatePicker)

In [15]:
# Load the tweet dataset, then parse its 'date' column into pandas datetimes.
csv_path = '../data/covid2023.csv'
df = pd.read_csv(csv_path)
df['date'] = pd.to_datetime(df['date'])

In [16]:
# Synthetic demo frame: one row per calendar day of 2022, with a running
# integer counter as the value.
date_array = pd.date_range(start='2022-01-01', end='2022-12-31', freq='D')
value_array = range(date_array.size)

df1 = pd.DataFrame({'date': date_array, 'value': value_array})


# Define start and end date pickers; both start out on 1 January 2023.
default_day = datetime.datetime(2023, 1, 1)

start_date_picker = widgets.DatePicker(description='Start Date', value=default_day, disabled=False)
end_date_picker = widgets.DatePicker(description='End Date', value=default_day, disabled=False)

def filter_dataframe(start_date, end_date):
    """Display the rows of ``df1`` whose 'date' lies in [start_date, end_date].

    Args:
        start_date: First date to keep (inclusive); anything accepted by
            ``pd.to_datetime``, or None when nothing is selected.
        end_date: Last date to keep (inclusive); same accepted forms.

    Returns:
        None. The filtered frame is shown with ``display``; a message is
        printed instead when the inputs are missing or cannot be parsed.
    """
    if start_date is None or end_date is None:
        print("Please select start and end dates.")
        return
    try:
        start_date = pd.to_datetime(start_date)
        end_date = pd.to_datetime(end_date)
    except ValueError:
        print("Please enter dates in yyyy-mm-dd format.")
        return
    # A DataFrame column is a Series, never a DatetimeIndex, so an isinstance
    # check against DatetimeIndex always fails; inspect the dtype instead.
    if not pd.api.types.is_datetime64_any_dtype(df1['date']):
        print("The 'date' column is not a datetime object.")
        return
    in_range = (df1['date'] >= start_date) & (df1['date'] <= end_date)
    display(df1[in_range])

    
# Hook the two date pickers up to filter_dataframe through ipywidgets' interact.
# The trailing semicolon keeps the notebook from echoing the return value.
interact(
    filter_dataframe,
    start_date=start_date_picker,
    end_date=end_date_picker,
);



interactive(children=(DatePicker(value=datetime.datetime(2023, 1, 1, 0, 0), description='Start Date', step=1),…

In [17]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
nltk.download('stopwords')
nltk.download('vader_lexicon')

ModuleNotFoundError: No module named 'seaborn'

In [19]:
!pip install seaborn

Collecting seaborn
  Using cached seaborn-0.12.2-py3-none-any.whl (293 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.12.2


In [None]:
# Show the 'date' column of the loaded tweets as a quick sanity check.
df['date']

In [None]:
### basic cleaning of the data

# One pipeline, applied in order:
#   1. keep a single row per distinct tweet text
#   2. discard rows whose user_followers is 0
#   3. discard rows containing any NA value
#   4. renumber the index from 0
df = (
    df.drop_duplicates(subset='text')
      .loc[lambda frame: frame['user_followers'] != 0]
      .dropna()
      .reset_index(drop=True)
)

# preprocess the text column and do text cleaning (most common methods are covered)

# Ordered (pattern, replacement) pairs. The specific forms ("can't", "won't")
# must run before the generic "n't" rule, and "n't" before the bare "'t" rule.
_CONTRACTION_RULES = (
    (r"can\'t", "can not"),
    (r"won\'t", "will not"),
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
)

# Filled on first use so the stop-word list is not rebuilt for every tweet.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, building it only once."""
    if "english" not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORD_CACHE["english"]


def preprocess_text(text):
    """Normalise one tweet for word-level analysis.

    Steps, in order: lowercase; drop URLs; drop ``:shortcode:`` emoticons;
    expand contractions; drop digits; drop punctuation; drop non-ASCII
    characters (emojis included); tokenise; drop English stop words; join
    the remaining tokens with single spaces.

    The emoticon and contraction steps run *before* punctuation is
    stripped: both patterns key on ':' or "'" and can never match once
    those characters are gone.

    Args:
        text: The raw tweet text (a ``str``).

    Returns:
        The cleaned text as a single space-separated string.
    """
    # convert to lowercase
    text = text.lower()

    # remove URLs
    text = re.sub(r"http\S+", "", text)

    # remove emoticon shortcodes such as ":smile:" while the colons still exist
    text = re.sub(r':\w+:', '', text)

    # treat the typographic apostrophe like the ASCII one so the contraction
    # rules below also apply to it
    text = text.replace("\u2019", "'")

    # expand contractions while the apostrophes still exist; the expanded
    # words are afterwards subject to stop-word filtering like any other token
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    # remove numbers
    text = re.sub(r"\d", "", text)

    # remove punctuation
    text = re.sub(r"[^\w\s]", "", text)

    # remove emojis and any other non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # remove stopwords
    # NOTE(review): word_tokenize presumably needs NLTK's 'punkt' resource, which
    # this notebook does not download - confirm it is installed.
    stop_words = _english_stop_words()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]

    # joining with a single space also removes any extra whitespace
    return " ".join(tokens)

# Replace every tweet's text with its cleaned form.
df["text"] = df["text"].map(preprocess_text)

def remove_emojis(column):
    """Return *column* with every run of non-ASCII characters deleted.

    Despite the parameter name, this receives one string value at a time
    (it is applied element-wise to DataFrame columns).
    """
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', column)

# Strip non-ASCII characters (emojis included) from the free-text user fields.
for user_field in ("user_name", "user_location", "user_description"):
    df[user_field] = df[user_field].apply(remove_emojis)


# check the distribution of tweet length
# (character count of the cleaned text)
df["tweet_length"] = df["text"].apply(len)


In [None]:
## implement date selection
import pandas as pd


# Drop any timezone so the column compares cleanly with the naive picker values.
df['date'] = pd.to_datetime(df['date']).dt.tz_localize(None)

if start_date_picker.value is not None and end_date_picker.value is not None:
    start_date = pd.to_datetime(start_date_picker.value)
    end_date = pd.to_datetime(end_date_picker.value)

    # Filter on the datetime 'date' column: 'data_date' is only created in a
    # later cell, and it holds plain date objects rather than timestamps.
    # The end day is inclusive, so keep everything before the following midnight.
    day_after_end = end_date.normalize() + pd.Timedelta(days=1)
    filtered_df = df[(df['date'] >= start_date) & (df['date'] < day_after_end)].copy()
else:
    # No complete range selected: keep every row and record that no bounds apply,
    # so the print below and the following cells still have the names defined.
    start_date = end_date = None
    filtered_df = df.copy()

print(start_date, end_date)
filtered_df

In [None]:
# Show the start of the selected date range.
start_date

In [None]:
# Show the end of the selected date range.
end_date

In [None]:
# (rows, columns) of the date-filtered frame.
filtered_df.shape

In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Tally every whitespace-separated token across the selected tweets and keep
# the 20 most frequent ones.
text = " ".join(filtered_df['text'])
words_counter = Counter(text.split())
most_common_words = words_counter.most_common(20)

# Split the (word, count) pairs into parallel lists for plotting.
words = [pair[0] for pair in most_common_words]
counts = [pair[1] for pair in most_common_words]

# Bar chart with vertical tick labels so the words stay readable.
plt.bar(words, counts)
plt.title('Bar Plot of Most Common Words in Tweets within Time Selection')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=90)
plt.show()

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re

# Clean the user_location column.
# Take an explicit copy first so the assignment below writes to this frame
# itself rather than to a slice of another frame.
filtered_df = filtered_df[filtered_df['user_location'].notna()].copy()
# regex=True is required: recent pandas versions treat the pattern as a
# literal string by default, which would make this replacement a no-op.
filtered_df['user_location'] = filtered_df['user_location'].str.replace(
    r'[^\w\s]', '', regex=True
)
text = " ".join(filtered_df['user_location'].values)

if text.strip():
    # Create the word cloud
    wordcloud = WordCloud(width=800, height=800,
                          background_color='white',
                          stopwords=None,
                          min_font_size=10).generate(text)

    # Plot the word cloud
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)

    plt.show()
else:
    # WordCloud cannot be generated from text that contains no words.
    print("No user locations available for the selected period.")

The bar plot shows psychologists the most frequently used words in tweets within the selected time period, and the word cloud shows the most common user locations within that same period.

## Data Pick Up

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon resource through NLTK, then create the
# analyzer instance that the sentiment cell below calls polarity_scores on.
nltk.download('vader_lexicon')
vader_analyzer = SentimentIntensityAnalyzer()

In [None]:
# Data pick from selected date
import datetime

# Calendar day (a datetime.date) of every tweet, used for whole-day matching.
df.loc[:, "data_date"] = \
    pd.to_datetime(df["date"]).dt.date

# The default is a datetime.date, the same type as the 'data_date' values,
# so the initial selection can match rows without the user touching the picker.
date_picker = widgets.DatePicker(description='Select Date', value=datetime.date(2023, 1, 1))


@interact(date=date_picker)
def order_by_date(date):
    """Show the first tweets posted on the selected calendar day.

    Args:
        date: The picked day. A ``datetime.datetime`` is reduced to its
            calendar date first, because a datetime never compares equal
            to the plain dates stored in 'data_date'.

    Returns:
        The first rows (``head()``) of the matching tweets, or None after
        printing a hint when no tweet matches.
    """
    if isinstance(date, datetime.datetime):
        date = date.date()
    pick_data = df.loc[df["data_date"] == date]
    if pick_data.empty:
        print("Choose different day")
        return None
    print(f"{len(pick_data)=}\n")
    return pick_data.head()

## Sentiment score

In [None]:
# The default is a datetime.date, the same type as the 'data_date' values.
date_picker = widgets.DatePicker(description='Select Date', value=datetime.date(2023, 1, 1))


@interact(input_date=date_picker)
def sentimental(input_date):
    """Plot a histogram of VADER compound scores for one day's tweets.

    Relies on the 'data_date' column of ``df`` (added in the "Data Pick Up"
    cell) and on the module-level ``vader_analyzer``.

    Args:
        input_date: The picked day. A ``datetime.datetime`` is reduced to
            its calendar date first; None simply matches no rows.

    Returns:
        The return value of ``plt.hist`` for the 'compound' scores, or None
        after printing a hint when no tweet matches.
    """
    # datetime.datetime is a subclass of datetime.date, so test for the more
    # specific type and normalise it; a plain date is used unchanged.
    if isinstance(input_date, datetime.datetime):
        input_date = input_date.date()
    filtered_df = df[df['data_date'] == input_date]

    print("Data count = ", len(filtered_df))
    if filtered_df.empty:
        print("Choose different day")
        return None

    sentences23 = filtered_df['text']
    # One score dict per tweet, turned into a frame in a single step instead
    # of concatenating one-row frames inside a loop.
    result = [vader_analyzer.polarity_scores(s) for s in sentences23]
    df23 = pd.DataFrame(result)
    df23.index = sentences23

    return plt.hist(df23["compound"], bins=30)
    



## hashtag 

In [None]:
#hashtag_list = df["hashtags"].tolist()
#hashtag_list_0 = SelectMultiple(options=hashtag_list,discription="hashtag")
#hashtag_list_0 
