In [74]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [75]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np
import re
from datetime import datetime
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

In [76]:
!pip install mlxtend



In [77]:
import os
os.chdir("/content/drive/MyDrive/CMPT733_Final_Project")


In [78]:
files = [file for file in os.listdir("tweet_sentiment_added_noise_removed") if "tag_#" in file]
files

['tag_#coronavirus.csv',
 'tag_#coronaupdate.csv',
 'tag_#selfisolating.csv',
 'tag_#quarantine.csv',
 'tag_#wearamask.csv',
 'tag_#stayhomestaysafe.csv',
 'tag_#pneumonia.csv',
 'tag_#herdimmunity.csv',
 'tag_#wfh.csv',
 'tag_#masks4all.csv',
 'tag_#covid19.csv',
 'tag_#faceshield.csv',
 'tag_#sarscov2.csv',
 'tag_#frontlineheroes.csv',
 'tag_#washyourhands.csv',
 'tag_#covid-19.csv',
 'tag_#coronavaccine.csv',
 'tag_#flattenthecurve.csv',
 'tag_#covidvaccine.csv',
 'tag_#workfromhome.csv',
 'tag_#socialbubble.csv',
 'tag_#ppe.csv',
 'tag_#socialdistancing.csv',
 'tag_#lockdown.csv',
 'tag_#pandemic.csv']

In [79]:
def extract_tags(Text):
  tags = re.findall(r"#(\w+)", Text)
  return tags

def find_related_tags(target_tag, target_tag_tweets):

  hashtags = ['coronavirus',
              'coronaupdate',
              'selfisolating',
              'quarantine',
              'wearamask',
              'stayhomestaysafe',
              'pneumonia',
              'herdimmunity',
              'wfh',
              'masks4all',
              'covid19',
              'faceshield',
              'sarscov2',
              'frontlineheroes',
              'washyourhands',
              'covid-19',
              'coronavaccine',
              'flattenthecurve',
              'covidvaccine',
              'workfromhome',
              'socialbubble',
              'ppe',
              'socialdistancing',
              'lockdown',
              'pandemic',
              target_tag]

  df = pd.read_csv('./covid_tweet/' + target_tag_tweets, lineterminator='\n')

  df['Tags'] = df['Text'].apply(extract_tags)
  tags_list = df['Tags'].to_list()

  tags_filter_list = []
  for sublist in tags_list:
    sublist = [i for i in sublist if i in hashtags]
    if sublist:
      tags_filter_list.append(sublist)
    
  encoder = TransactionEncoder()

  tags_array_encoded = encoder.fit(tags_filter_list).transform(tags_filter_list)

  tags_df_encoded = pd.DataFrame(tags_array_encoded, columns=encoder.columns_)

  tags_cluster = apriori(tags_df_encoded, min_support=0.001, use_colnames=True)

  # remove tags contains only 1 item
  tags_cluster['length'] = tags_cluster['itemsets'].apply(lambda x: len(x))
  tags_cluster_filtered = tags_cluster[(tags_cluster['length'] >= 2)].sort_values(by=['support'], ascending=False)
  tag_list = [list(i) for i in tags_cluster_filtered.itemsets]
  tag_list

  # find related tags
  related_tags = []
  for tags in tag_list:
    if target_tag in tags:
      related_tags.extend(tags)
  related_tags = list(set(related_tags))
  related_tags.remove(target_tag)
  related_tags = [ '#'+tag for tag in related_tags]

  return related_tags


In [133]:

# get all values
def get_average_sentiment_score(data_folder):
  files = [file for file in os.listdir(data_folder) if "tag_#" in file]

  colnames = ['date']
  daily_mean_df = pd.DataFrame(columns=['date'])
  weekly_mean_df = pd.DataFrame(columns=['date'])
  monthly_mean_df = pd.DataFrame(columns=['date'])

  b = 0
  for file in files: 
      df = pd.read_csv('./tweet_sentiment_added_noise_removed/' + file, lineterminator='\n')
      tag = re.sub('.csv', '', re.sub('tag_', '', file))
      colnames.append(tag)

      df['compound'] = np.where(df['compound'] >= 0.05, 1, df['compound'])
      df['compound'] = np.where(df['compound'] <= -0.05, -1, df['compound']) 
      df['compound'] = np.where(df['compound'].between(-0.05, 0.05), 0, df['compound'])
      
      # mean_df = df.groupby("date")['compound'].mean()
      mean_df = df.groupby('date')['compound'].agg(lambda sentiment: sentiment.value_counts().index[0])
      mean_df.index = pd.to_datetime(mean_df.index)

      # mean_df = mean_df.fillna(0)

      daily_mean_df = daily_mean_df.merge(mean_df, how='outer', on='date')
      # weekly_mean_df = weekly_mean_df.merge(mean_df.resample("W").agg(lambda sentiment: sentiment.value_counts(dropna = False).index[0]), how='outer', on='date')
      # monthly_mean_df = monthly_mean_df.merge(mean_df.resample("M").agg(lambda sentiment: sentiment.value_counts().index[0]), how='outer', on='date')

  daily_mean_df.columns = colnames
  # weekly_mean_df.columns = colnames
  # monthly_mean_df.columns = colnames
  
  return daily_mean_df

In [138]:
# predict target tag value
def predict_target_tag(target_tag, target_tag_tweets, assumed_daily_increase, data_folder):
  related_tags = find_related_tags(target_tag, target_tag_tweets)
  daily_mean_df = get_average_sentiment_score(data_folder)
  
  column_selected = related_tags + ['date']
  predicted_df = daily_mean_df[column_selected]
  predicted_df[target_tag] = daily_mean_df[related_tags].mean(axis=1)
  tested_predicted_df = predicted_df[predicted_df[target_tag] == 0]

  confirmed_global = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
  us_covid19 = confirmed_global.loc[confirmed_global["Country/Region"]=="US"]
  us_covid19 = us_covid19[us_covid19.columns[4:]].melt(var_name="date", value_name="count")

  us_covid19["date"] = us_covid19["date"].apply(lambda row: datetime.strptime(row, '%m/%d/%y'))
  us_covid19["daily_increase"] = us_covid19.set_index('date').diff().reset_index()["count"]
  us_covid19

  us_covid19_daily = us_covid19[(us_covid19["date"] > '2020-4-12') & (us_covid19["date"] < '2021-2-12')].reset_index()
  # us_covid19_daily = us_covid19_daily.set_index("date")
  # us_covid19_weekly = us_covid19_daily.resample("W").sum().reset_index()
  # us_covid19_weekly = us_covid19_weekly.rename(columns = {'daily_increase': 'weekly_increase'})

  # us_covid19_monthly = us_covid19_daily.resample("M").sum().reset_index()

  idx = us_covid19_daily['daily_increase'].sub(assumed_daily_increase).abs().idxmin()
  related_row = us_covid19_daily.loc[[idx]]

  related_row["date"] = pd.to_datetime(related_row["date"], errors = "coerce").dt.date
  related_date = str(related_row.iloc[0]['date'])

  predicted_df['date'] = pd.to_datetime(predicted_df['date'], format='%Y-%m-%d')
  print(tested_predicted_df)

  predict_row = predicted_df.loc[(predicted_df['date'] == related_date)]

  return predict_row.iloc[0][target_tag]
  
  # df = predicted_df.melt(id_vars="date", var_name="hashtag", value_name="compound_score")
  # fig = px.line(df, x = "date", y = "compound_score", color = "hashtag", height=800, title='Weekly Mean Compound Sentiment Score')
  # fig.show()
a = predict_target_tag('homeschool', 'tag_#homeschool.csv', 30000, 'tweet_sentiment_added_noise_removed')

     #coronavirus  #lockdown  #covid19  #pandemic       date  homeschool
150          -1.0        1.0      -1.0        1.0 2020-09-09         0.0
175          -1.0        1.0      -1.0        1.0 2020-10-04         0.0
176          -1.0        1.0      -1.0        1.0 2020-10-05         0.0
195          -1.0        1.0      -1.0        1.0 2020-10-24         0.0




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [131]:

print('The predicted sentiment type of the tag #homeschool when assumed daily increase is 50000 is ' + str(a))

The predicted sentiment type of the tag #homeschool when assumed daily increase is 50000 is 1.0
