In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install mlxtend

In [3]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

**Testing Notes:** Please change the inputs of the version 1 model function in the following cell.

We have runned this file on Google Colab, so the folder path format is associated with Google Drive. In order to test this file, please change the path variables accordingly in the following cell. For ease of testing, we have the path variable 'folder_contains_hashtagstweets_with_sentiment_compoundvalue' as one of the input for the model function, but the actual inputs are the three we mentioned in the report.

Please run every cell to the end to see the output.

In [4]:
import os

# inputs for model function
target_hashtag = 'homeschool'
target_hashtag_tweets = 'tag_#homeschool.csv'
assumed_daily_increase = 20000

# the path variables for required folders

# the directory that contains all the sub folders or files
main_folder = '/content/drive/MyDrive/CMPT733_Final_Project'

# the folder that contains the tables of tweets compound values for each hashtag in the hashtags list after sentiment analysis 
folder_contains_hashtagstweets_with_sentiment_compoundvalue = 'tweet_sentiment_added_noise_removed'

# the folder that contains the file of collected tweets of the target hashtag
folder_contains_target_hashtag_tweets = 'covid_tweet'
os.chdir(main_folder)


In [5]:
def extract_tags(Text):
  tags = re.findall(r"#(\w+)", Text)
  return tags

def find_related_tags(target_tag, target_tag_tweets):

  hashtags = ['coronavirus',
              'coronaupdate',
              'selfisolating',
              'quarantine',
              'wearamask',
              'stayhomestaysafe',
              'pneumonia',
              'herdimmunity',
              'wfh',
              'masks4all',
              'covid19',
              'faceshield',
              'sarscov2',
              'frontlineheroes',
              'washyourhands',
              'covid-19',
              'coronavaccine',
              'flattenthecurve',
              'covidvaccine',
              'workfromhome',
              'socialbubble',
              'ppe',
              'socialdistancing',
              'lockdown',
              'pandemic',
              target_tag]

  df = pd.read_csv('./' + folder_contains_target_hashtag_tweets + '/' + target_tag_tweets, lineterminator='\n')

  df['Tags'] = df['Text'].apply(extract_tags)
  tags_list = df['Tags'].to_list()

  tags_filter_list = []
  for sublist in tags_list:
    sublist = [i for i in sublist if i in hashtags]
    if sublist:
      tags_filter_list.append(sublist)
    
  encoder = TransactionEncoder()

  tags_array_encoded = encoder.fit(tags_filter_list).transform(tags_filter_list)

  tags_df_encoded = pd.DataFrame(tags_array_encoded, columns=encoder.columns_)

  tags_cluster = apriori(tags_df_encoded, min_support=0.001, use_colnames=True)

  # remove tags contains only 1 item
  tags_cluster['length'] = tags_cluster['itemsets'].apply(lambda x: len(x))
  tags_cluster_filtered = tags_cluster[(tags_cluster['length'] >= 2)].sort_values(by=['support'], ascending=False)
  tag_list = [list(i) for i in tags_cluster_filtered.itemsets]
  tag_list

  # find related tags
  related_tags = []
  for tags in tag_list:
    if target_tag in tags:
      related_tags.extend(tags)
  related_tags = list(set(related_tags))
  related_tags.remove(target_tag)
  related_tags = [ '#'+tag for tag in related_tags]

  return related_tags


In [7]:
# get all values
def get_average_sentiment_score(data_folder):
  files = [file for file in os.listdir(data_folder) if "tag_#" in file]

  colnames = ['date']
  daily_mean_df = pd.DataFrame(columns=['date'])


  for file in files: 
      df = pd.read_csv('./' + folder_contains_hashtagstweets_with_sentiment_compoundvalue + '/' + file, lineterminator='\n')
      tag = re.sub('.csv', '', re.sub('tag_', '', file))
      colnames.append(tag)
      
      mean_df = df.groupby("date")['compound'].mean()
      mean_df.index = pd.to_datetime(mean_df.index)

      daily_mean_df = daily_mean_df.merge(mean_df, how='outer', on='date')


  daily_mean_df.columns = colnames

  
  return daily_mean_df

In [8]:
# predict target tag value
def predict_target_tag(target_tag, target_tag_tweets, assumed_daily_increase, data_folder):
  related_tags = find_related_tags(target_tag, target_tag_tweets)
  daily_mean_df = get_average_sentiment_score(data_folder)

  column_selected = related_tags + ['date']
  predicted_df = daily_mean_df[column_selected]
  predicted_df[target_tag] = daily_mean_df[related_tags].mean(axis=1)

  predicted_df[target_tag] = np.where(predicted_df[target_tag] >= 0.20, 1, predicted_df[target_tag])
  predicted_df[target_tag] = np.where(predicted_df[target_tag] <= 0.15, -1, predicted_df[target_tag])
  predicted_df[target_tag] = np.where(predicted_df[target_tag].between(0.15, 0.20), 0, predicted_df[target_tag])

  confirmed_global = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
  us_covid19 = confirmed_global.loc[confirmed_global["Country/Region"]=="US"]
  us_covid19 = us_covid19[us_covid19.columns[4:]].melt(var_name="date", value_name="count")

  us_covid19["date"] = us_covid19["date"].apply(lambda row: datetime.strptime(row, '%m/%d/%y'))
  us_covid19["daily_increase"] = us_covid19.set_index('date').diff().reset_index()["count"]

  us_covid19_daily = us_covid19[(us_covid19["date"] > '2020-4-12') & (us_covid19["date"] < '2021-2-12')].reset_index()

  idx = us_covid19_daily['daily_increase'].sub(assumed_daily_increase).abs().idxmin()
  related_row = us_covid19_daily.loc[[idx]]

  related_row["date"] = pd.to_datetime(related_row["date"], errors = "coerce").dt.date
  related_date = str(related_row.iloc[0]['date'])

  predicted_df['date'] = pd.to_datetime(predicted_df['date'], format='%Y-%m-%d')

  predict_row = predicted_df.loc[(predicted_df['date'] == related_date)]

  return predict_row.iloc[0][target_tag]


In [9]:
target_tag_sentiment_result = predict_target_tag(target_hashtag, target_hashtag_tweets, assumed_daily_increase, folder_contains_hashtagstweets_with_sentiment_compoundvalue)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,co

In [10]:
if target_tag_sentiment_result == 1:
  target_tag_sentiment_text = 'postive'
elif target_tag_sentiment_result == 0:
  target_tag_sentiment_text = 'neutral'
else:
  target_tag_sentiment_text = 'negative'

print('The predicted sentiment type of the tag #homeschool when assumed daily increase is 20000 using version 1 model is ' + str(target_tag_sentiment_text))

The predicted sentiment type of the tag #homeschool when assumed daily increase is 20000 using version 1 model is negative
