In [None]:
import os
import glob
from pathlib import Path
import json
import ast
import pandas as pd
import numpy as np
import re
import random
import warnings
warnings.filterwarnings("ignore")

## NLP
import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.corpus.reader import CorpusReader
from nltk.internals import deprecated
from nltk.probability import FreqDist
from nltk.util import binary_search_file as _binary_search_file
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
lemmatizer = WordNetLemmatizer()
w_tokenizer = WhitespaceTokenizer()

import gensim
import gensim.corpora as corpora

import itertools
from collections import Counter

from nostril import nonsense
from polyglot.detect import Detector
from polyglot.detect.base import logger as polyglot_logger
polyglot_logger.setLevel("ERROR")
from textblob import TextBlob


# Cleanup functions

In [None]:
# Every character of string.punctuation except "#" and "@".
punctuations = [mark for mark in string.punctuation if mark not in ("#", "@")]

# English stop-word list from the NLTK corpus, as a set.
stopWords = set(stopwords.words('english'))

# Punctuation marks first, then stop words, with repeated entries dropped.
# dict.fromkeys keeps the first occurrence of each key in insertion order.
punctuations_stopwords = list(dict.fromkeys([*punctuations, *stopWords]))

# Built once when the cell runs rather than on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)


def deEmojify(text):
    """Strip emoji code points from ``text``.

    Only the four Unicode ranges listed in ``_EMOJI_PATTERN`` are removed;
    every other character is left untouched.

    Args:
        text: The value to clean. Non-string values (for example ``None`` or
            the float NaN pandas uses for missing cells) are returned
            unchanged, mirroring how ``get_clean_text`` treats non-strings,
            instead of raising ``TypeError``.

    Returns:
        The string with all matched emoji removed, or the original object
        when it is not a string.
    """
    if not isinstance(text, str):
        return text
    return _EMOJI_PATTERN.sub("", text)

def get_clean_text(x):
    """Reduce a raw tweet to lower-case ASCII letters and spaces.

    Steps, in order: lower-case, drop e-mail addresses, drop @handles, drop
    URLs, drop the stand-alone tokens "rt" and "unk", turn newlines/tabs into
    spaces, then delete every character that is not an ASCII letter or a
    space. Runs of spaces are NOT collapsed here.

    Args:
        x: The text to clean. Anything that is not a string is returned
            unchanged.

    Returns:
        The cleaned string, or ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x

    x = x.lower()
    # E-mails must go before handles: the handle pattern would otherwise eat
    # the "@domain" half and leave the local part behind as a stray word.
    x = re.sub(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', '', x)
    # Remove user handles.
    x = re.sub(r'@[^\s]+', '', x)
    # Remove URLs.
    x = re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', x)
    # Remove the retweet marker and unknown-token placeholder only when they
    # stand alone; a bare substring match would mangle words such as "party"
    # or "drunk".
    x = re.sub(r'\brt\b', '', x)
    x = re.sub(r'\bunk\b', '', x)
    # Newlines and tabs become spaces so the letters-only filter below does
    # not glue the neighbouring words together.
    x = re.sub(r'[^\S ]+', ' ', x)
    # Keep letters and spaces only. This also removes every period, so no
    # separate ellipsis handling is needed afterwards.
    x = re.sub(r'[^A-Z a-z]+', '', x)
    return x

def remove_nonenglish(text):
    """Blank out text whose detected language is not English.

    Args:
        text: The text handed to polyglot's ``Detector``.

    Returns:
        ``text`` unchanged when the detected language code is ``"en"`` or
        when detection fails; otherwise the empty string.
    """
    try:
        detected_code = Detector(text).language.code
    except Exception:
        # Errors are likely due to short length, so keep the text anyway.
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop a long-running apply.
        return text
    return text if detected_code == "en" else ""

def lemmatize_text(text):
    """Split ``text`` on whitespace and lemmatize each lower-cased token.

    Returns:
        A list with one lemma per whitespace-delimited token, in order.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token.lower()))
    return lemmas

def remove_punctuation(text):
    """Drop every item of ``text`` that appears in ``punctuations_stopwords``.

    Despite the name, this filters stop words as well as punctuation marks,
    because the lookup list holds both.

    Returns:
        A new list with the surviving items in their original order.
    """
    kept = []
    for token in text:
        if token in punctuations_stopwords:
            continue
        kept.append(token)
    return kept

def remove_spaces(text):
    """Discard single-space items and strip whitespace from the rest.

    Only items exactly equal to ``" "`` are dropped; an item made of several
    spaces is kept and comes out as an empty string.

    Returns:
        A new list of stripped items.
    """
    return [item.strip() for item in text if item != " "]

def remove_nonsense(text):
    """Filter out tokens that nostril's ``nonsense`` classifies as gibberish.

    Args:
        text: A sequence of tokens, or ``None``.

    Returns:
        A list of the tokens for which ``nonsense`` returned a falsy value,
        plus any token the classifier could not evaluate. ``None`` and empty
        input both give an empty list.
    """
    if text is None or len(text) == 0:
        return []

    kept = []
    for token in text:
        try:
            is_gibberish = nonsense(token)
        except Exception:
            # The classifier raised for this token: keep it rather than lose
            # data. ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            kept.append(token)
            continue
        if not is_gibberish:
            kept.append(token)
    return kept

def pipeline(text):
    """Run the full cleaning chain and return token lists.

    ``text`` must expose ``.apply`` (a pandas Series in this notebook's
    commented-out usage). Each stage is applied element-wise to the result of
    the previous one; from ``lemmatize_text`` onward every element is a list
    of tokens rather than a string.

    Returns:
        The result of the final ``remove_nonsense`` stage.
    """
    stages = (
        deEmojify,
        get_clean_text,
        remove_nonenglish,
        lemmatize_text,      # elements become lists from here on
        remove_punctuation,
        remove_spaces,
        remove_nonsense,
    )
    current = text
    for stage in stages:
        current = current.apply(stage)
    return current

def pipeline_bert(text):
    """Clean text and wrap it in ``[CLS]`` / ``[SEP]`` markers.

    ``text`` must expose ``.apply``. Each element is de-emojified, passed
    through ``get_clean_text``, has whitespace runs collapsed to one space,
    has every period replaced by ``"[CLS] [SEP]"``, and is finally wrapped as
    ``"[CLS] " + ... + " [SEP]"``.

    NOTE(review): ``get_clean_text`` deletes every character that is not a
    letter or a space, so the period substitution below looks like it can
    never match — confirm whether sentence boundaries were meant to survive.
    """
    def _mark_up(cleaned):
        single_spaced = re.sub(r"\s+", " ", cleaned)
        with_breaks = re.sub(r"\.", "[CLS] [SEP]", single_spaced)
        return "[CLS] " + with_breaks + " [SEP]"

    prepared = text.apply(deEmojify).apply(get_clean_text)
    return prepared.apply(_mark_up)

def pipeline_LIWC(text):
    """Clean text for LIWC: de-emojify, clean, collapse whitespace.

    ``text`` must expose ``.apply``. Unlike ``pipeline``, elements stay
    strings; no tokenising, lemmatising or stop-word removal is done.

    Returns:
        The element-wise cleaned text with whitespace runs reduced to a
        single space.
    """
    prepared = text.apply(deEmojify).apply(get_clean_text)
    return prepared.apply(lambda cleaned: re.sub(r"\s+", " ", cleaned))


# Word dictionary functions

In [None]:
def get_synonyms(word_list):
    """Expand ``word_list`` with head words of its a/s/r WordNet synsets.

    For each input word, the first dot-separated component of every synset
    name whose POS tag is "a", "s" or "r" is added if not already present.
    The lemmatized form of every collected word is then added as well.

    Returns:
        A list without duplicates, in order of first appearance.
    """
    wanted_pos = ("a", "s", "r")
    collected = []
    for seed in word_list:
        collected.append(seed)
        for synset in wordnet.synsets(seed):
            if synset.pos() not in wanted_pos:
                continue
            head = synset.name().split(".")[0]
            if head not in collected:
                collected.append(head)

    # Index-based walk: lemmas appended during the pass are visited too.
    position = 0
    while position < len(collected):
        base_form = lemmatizer.lemmatize(collected[position])
        if base_form not in collected:
            collected.append(base_form)
        position += 1

    # The seeds were appended unconditionally, so drop repeats here.
    return list(dict.fromkeys(collected))

def get_adjectives(word_list):
    """Expand ``word_list`` with WordNet "also see", "similar to" and
    "attribute" neighbours whose POS tag is "a", "s" or "r".

    For every synset of every input word, the three relations are queried in
    that order and the head word (first dot-separated component of the synset
    name) of each qualifying neighbour is collected. The lemmatized form of
    every collected word is then added as well.

    The attribute branch previously computed the head word but never stored
    it, so attribute neighbours were silently dropped; they are now collected
    like the other two relations.

    Returns:
        A list without duplicates, in order of first appearance.
    """
    wanted_pos = ("a", "s", "r")
    collected = []
    for seed in word_list:
        collected.append(seed)
        for synset in wordnet.synsets(seed):
            relations = (
                synset.also_sees(),
                synset.similar_tos(),
                synset.attributes(),
            )
            for neighbours in relations:
                for neighbour in neighbours:
                    if neighbour.pos() in wanted_pos:
                        collected.append(neighbour.name().split(".")[0])

    # Iterating the list while appending is deliberate: newly added lemmas
    # are themselves lemmatized before the loop ends.
    for word in collected:
        lemma = lemmatizer.lemmatize(word)
        if lemma not in collected:
            collected.append(lemma)

    # Drop repeats while keeping first-appearance order.
    return list(dict.fromkeys(collected))
    

def get_hypernyms(word_list):
    """Expand ``word_list`` with hypernym head words tagged "a", "s" or "r".

    For every synset of every input word, each hypernym synset whose POS tag
    passes the filter contributes the first dot-separated component of its
    name. The lemmatized form of every collected word is then added as well.

    NOTE(review): as far as I know WordNet only links noun and verb synsets
    by hypernymy, so this adjective/adverb filter may reject every candidate
    — confirm the intended part-of-speech filter.

    Returns:
        A list without duplicates, in order of first appearance.
    """
    wanted_pos = ("a", "s", "r")
    collected = []
    for seed in word_list:
        collected.append(seed)
        for synset in wordnet.synsets(seed):
            collected.extend(
                parent.name().split(".")[0]
                for parent in synset.hypernyms()
                if parent.pos() in wanted_pos
            )

    # Index-based walk: lemmas appended during the pass are visited too.
    position = 0
    while position < len(collected):
        base_form = lemmatizer.lemmatize(collected[position])
        if base_form not in collected:
            collected.append(base_form)
        position += 1

    return list(dict.fromkeys(collected))

def get_hyponyms(word_list):
    """Expand ``word_list`` with hyponym head words tagged "a", "s" or "r".

    For every synset of every input word, each hyponym synset whose POS tag
    passes the filter contributes the first dot-separated component of its
    name. The lemmatized form of every collected word is then added as well.

    NOTE(review): as far as I know WordNet only links noun and verb synsets
    by hyponymy, so this adjective/adverb filter may reject every candidate
    — confirm the intended part-of-speech filter.

    Returns:
        A list without duplicates, in order of first appearance.
    """
    wanted_pos = ("a", "s", "r")
    collected = []
    for seed in word_list:
        collected.append(seed)
        for synset in wordnet.synsets(seed):
            collected.extend(
                child.name().split(".")[0]
                for child in synset.hyponyms()
                if child.pos() in wanted_pos
            )

    # Index-based walk: lemmas appended during the pass are visited too.
    position = 0
    while position < len(collected):
        base_form = lemmatizer.lemmatize(collected[position])
        if base_form not in collected:
            collected.append(base_form)
        position += 1

    return list(dict.fromkeys(collected))

def get_holonyms(word_list):
    """Expand ``word_list`` with holonym head words tagged "a", "s" or "r".

    For each input word the synsets are scanned three times — member, then
    substance, then part holonyms — and every holonym synset whose POS tag
    passes the filter contributes the first dot-separated component of its
    name. The lemmatized form of every collected word is then added as well.

    NOTE(review): as far as I know WordNet holonym relations exist only
    between noun synsets, so this adjective/adverb filter may reject every
    candidate — confirm the intended part-of-speech filter.

    Returns:
        A list without duplicates, in order of first appearance.
    """
    wanted_pos = ("a", "s", "r")
    relation_names = ("member_holonyms", "substance_holonyms", "part_holonyms")
    collected = []
    for seed in word_list:
        collected.append(seed)
        # One full pass over the synsets per relation keeps the output order:
        # all member holonyms, then all substance, then all part holonyms.
        for relation_name in relation_names:
            for synset in wordnet.synsets(seed):
                for whole in getattr(synset, relation_name)():
                    if whole.pos() in wanted_pos:
                        collected.append(whole.name().split(".")[0])

    # Index-based walk: lemmas appended during the pass are visited too.
    position = 0
    while position < len(collected):
        base_form = lemmatizer.lemmatize(collected[position])
        if base_form not in collected:
            collected.append(base_form)
        position += 1

    return list(dict.fromkeys(collected))

def get_meronyms(word_list):
    """Expand ``word_list`` with meronym head words tagged "a", "s" or "r".

    For each input word the synsets are scanned three times — member, then
    substance, then part meronyms — and every meronym synset whose POS tag
    passes the filter contributes the first dot-separated component of its
    name. The lemmatized form of every collected word is then added as well.

    NOTE(review): as far as I know WordNet meronym relations exist only
    between noun synsets, so this adjective/adverb filter may reject every
    candidate — confirm the intended part-of-speech filter.

    Returns:
        A list without duplicates, in order of first appearance.
    """
    wanted_pos = ("a", "s", "r")
    relation_names = ("member_meronyms", "substance_meronyms", "part_meronyms")
    collected = []
    for seed in word_list:
        collected.append(seed)
        # One full pass over the synsets per relation keeps the output order:
        # all member meronyms, then all substance, then all part meronyms.
        for relation_name in relation_names:
            for synset in wordnet.synsets(seed):
                for part in getattr(synset, relation_name)():
                    if part.pos() in wanted_pos:
                        collected.append(part.name().split(".")[0])

    # Index-based walk: lemmas appended during the pass are visited too.
    position = 0
    while position < len(collected):
        base_form = lemmatizer.lemmatize(collected[position])
        if base_form not in collected:
            collected.append(base_form)
        position += 1

    return list(dict.fromkeys(collected))
    
def get_derivatives(word_list):
    """Expand ``word_list`` with lemma names that carry a syntactic marker.

    For every a/s/r synset of every input word, the synset name is split on
    "." and its first two components (head word and POS tag) are passed to
    ``wordnet.lemmas``. The name of each returned lemma whose
    ``syntactic_marker()`` is truthy is collected. The lemmatized form of
    every collected word is then added as well.

    Returns:
        A list without duplicates, in order of first appearance.
    """
    wanted_pos = ("a", "s", "r")
    collected = []
    for seed in word_list:
        collected.append(seed)
        for synset in wordnet.synsets(seed):
            if synset.pos() not in wanted_pos:
                continue
            name_parts = synset.name().split(".")
            for candidate in wordnet.lemmas(name_parts[0], name_parts[1]):
                if candidate.syntactic_marker():
                    collected.append(candidate.name())

    # Index-based walk: lemmas appended during the pass are visited too.
    position = 0
    while position < len(collected):
        base_form = lemmatizer.lemmatize(collected[position])
        if base_form not in collected:
            collected.append(base_form)
        position += 1

    return list(dict.fromkeys(collected))


# Counting functions

In [None]:
def has_asian(text_list):
    """Count tokens that belong to the expanded "Asian" keyword list.

    The keyword list is ``get_synonyms`` applied to a fixed seed list. That
    expansion does not depend on the input, so it is computed on the first
    call and stored on the function object; previously it was recomputed for
    every call, i.e. once per row under ``Series.apply``.

    Args:
        text_list: An iterable of tokens, or a string holding the Python
            literal of one (parsed with ``ast.literal_eval``).

    Returns:
        The number of tokens found in the keyword list; repeated tokens are
        counted each time they occur.
    """
    asian_words_longer = getattr(has_asian, "_keywords", None)
    if asian_words_longer is None:
        asian_words = ["asian", "asians", "chinese", "china", "wuhan"]
        asian_words_longer = get_synonyms(asian_words)
        # Re-run this cell after changing get_synonyms to refresh the cache.
        has_asian._keywords = asian_words_longer

    if isinstance(text_list, str):
        text_list = ast.literal_eval(text_list)
    return sum(1 for word in text_list if word in asian_words_longer)

def has_chinese(text_list):
    """Count tokens equal to "chinese", "china" or "wuhan".

    ``text_list`` may be an iterable of tokens or a string holding the Python
    literal of one, which is parsed with ``ast.literal_eval`` first. Repeated
    tokens are counted each time they occur.
    """
    chinese_words = ("chinese", "china", "wuhan")
    tokens = ast.literal_eval(text_list) if type(text_list) is str else text_list
    return sum(1 for token in tokens if token in chinese_words)

def has_covid(text_list):
    """Count tokens that name COVID-19 or the virus causing it.

    Args:
        text_list: An iterable of tokens, or a string holding the Python
            literal of one (parsed with ``ast.literal_eval``).

    Returns:
        The number of tokens found in the keyword list; repeated tokens are
        counted each time they occur.
    """
    covid_words = [
        "covid",
        "covid-19",
        "coronavirus",
        "sars-cov-2",
        # get_clean_text deletes hyphens and digits, so text that went
        # through pipeline() contains "sarscov", never "sars-cov-2".
        # ("covid-19" already collapses to "covid".)
        "sarscov",
    ]
    if isinstance(text_list, str):
        text_list = ast.literal_eval(text_list)
    return sum(1 for word in text_list if word in covid_words)


# Wrangling

## Full df

In [None]:
# result = glob.glob('raw/Full/*.csv')

In [None]:
# for r in result:
#     try:
#         df = pd.read_csv(r, lineterminator='\n')
#         df['date'] = r[19:29]
#         df['time'] = r[30:38]
#         df.to_csv(r, index=False)
#     except:
#         os.remove(r)
#         print(f"File {r} was deleted because it was empty.")

# for r in result:
#     df = pd.read_csv(r, lineterminator='\n')
#     df['lemma'] = pipeline(df.text)
#     # Remove empty lines
#     df['lemma_length'] = df.lemma.apply(len)
#     df = df[df['lemma_length'] > 0]
#     count_row = df.shape[0]
#     if count_row > 0:
#             df.to_csv(r, index=False)

# for r in result:
#     df = pd.read_csv(r, lineterminator='\n')
#     deduped_df = df[-df[['author_id', 'id']].duplicated()]
#     deduped_df.to_csv(r, index=False)


In [None]:
# result_dict = {}

# for r in result:
#     if r[19:29] in result_dict.keys():
#         result_dict[r[19:29]].append(r)
#     else:
#         result_dict[r[19:29]] = [r]

# for key in result_dict.keys():
#     output_path = f"raw/Full/response_{key}.csv"
#     df1 = pd.read_csv(result_dict[key][0], lineterminator='\n')
#     df = df1
#     if len(result_dict[key]) > 1:
#         df2 = pd.read_csv(result_dict[key][1], lineterminator='\n')
#         df = pd.concat([df1, df2])
#     if len(result_dict[key]) > 2:
#         df3 = pd.read_csv(result_dict[key][2], lineterminator='\n')
#         df = pd.concat([df1, df2, df3])
#     df.to_csv(output_path, index=False)



### Data Quality Checks

In [None]:
# unique_users = {}
# result = glob.glob('raw/Full/*.csv')
# n_users = 0
# n_samedayrepeats = 0
# maximum = 0
# minimum = 1000000000000000000000000000000000000
# dates = []

# for r in result:
#     if r[18:28] not in unique_users.keys():
#         df = pd.read_csv(r, lineterminator='\n')
#         users = df.author_id.unique().tolist()
#         if len(df.author_id.tolist()) - len(df.author_id.unique().tolist()) != 0:
#             dates.append(r[18:28])
#             if df.groupby(["author_id", "id"]).size().max() > maximum:
#                 maximum = max(df.groupby(["author_id", "id"]).size())
#             elif df.groupby(["author_id", "id"]).size().min() < minimum:
#                 minimum = min(df.groupby(["author_id", "id"]).size())
            
#             n_samedayrepeats += len(df.author_id.tolist()) - len(df.author_id.unique().tolist())
            
#         n_users += len(users)
#         unique_users[r[19:29]] = users


In [None]:
# dates.sort()
# dates

## Create columns for BERT and LIWC

In [None]:
# result = glob.glob('raw/Full/*.csv')

# for r in result:
#     outfile = f"{r[:4]}BERT{r[8:]}"
#     df = pd.read_csv(r, lineterminator='\n')
#     df['bert_lemma'] = pipeline_bert(df.text)
#     df['LIWC_lemma'] = pipeline_LIWC(df.text)
#     df.to_csv(outfile, index=False)

## Create sub dfs

In [None]:
# result = glob.glob('raw/BERT/*.csv')

In [None]:
# for r in result:
#     outfile = f"{r[:17]}Asian/{r[17:]}"
#     if not os.path.exists(outfile):
#         df = pd.read_csv(r, lineterminator='\n')
#         df['asian'] = df["lemma"].apply(has_asian)
#         df_asian = df[df['asian'] > 0]
#         count_row = df_asian.shape[0]
#         if count_row > 0:
#             df_asian.to_csv(outfile, index=False)
#         else:
#             print(f"File {outfile} was not saved because it was empty.")
#     else:
#         pass

# for r in result:
#     outfile = f"{r[:17]}Chinese/{r[17:]}"
#     if not os.path.exists(outfile):
#         df = pd.read_csv(r, lineterminator='\n')
#         df['chinese'] = df["lemma"].apply(has_chinese)
#         df_chinese = df[df['chinese'] > 0]
#         count_row = df_chinese.shape[0]
#         if count_row > 0:
#             df_chinese.to_csv(outfile, index=False)
#         else:
#             print(f"File {outfile} was not saved because it was empty.")
#     else:
#         pass

# for r in result:
#     outfile = f"{r[:17]}COVID/{r[17:]}"
#     if not os.path.exists(outfile):
#         df = pd.read_csv(r, lineterminator='\n')
#         df['covid'] = df["lemma"].apply(has_covid)
#         df_covid = df[df['covid'] > 0]
#         count_row = df_covid.shape[0]
#         if count_row > 0:
#             df_covid.to_csv(outfile, index=False)
#         else:
#             print(f"File {outfile} was not saved because it was empty.")
#     else:
#         pass

## Dedupe sub dfs

In [None]:
# result_covid = glob.glob('raw/COVID/*.csv')
# result_asian = glob.glob('raw/Asian/*.csv')

# for r in result_covid:
#     df = pd.read_csv(r, lineterminator='\n')
#     deduped_df = df[-df[['author_id', 'id']].duplicated()]
#     deduped_df.to_csv(r, index=False)

# for r in result_asian:
#     df = pd.read_csv(r, lineterminator='\n')
#     deduped_df = df[-df[['author_id', 'id']].duplicated()]
#     deduped_df.to_csv(r, index=False)


# Calculate number of tweets of each type per day

In [None]:
# result = glob.glob('raw/Full/*.csv')
# result_covid = glob.glob('raw/COVID/*.csv')
# result_asian = glob.glob('raw/Asian/*.csv')

In [None]:
# for r in result:
#     r_new = f"{r[:8]}/bert{r[8:]}"
#     df = pd.read_csv(r, lineterminator='\n')
#     df['bert_lemma'] = pipeline_bert(df.text)
#     df.to_csv(r_new, index=False)

In [None]:
# result_dict = {}

# for r in result:
#     key = r[18:-4]
#     df = pd.read_csv(r, lineterminator='\n')
#     r, c = df.shape
#     result_dict[key] = r

In [None]:
# df = pd.DataFrame.from_dict(result_dict, orient='index')
# df = df.reset_index()
# df.columns = ["day", "Number of tweets"]
# df.head()

# df.to_csv('TweetsPerDay.csv', index=False)

In [None]:
# result_covid_dict = {}

# for r in result_covid:
#     key = r[19:29]
#     df = pd.read_csv(r, lineterminator='\n')
#     r, c = df.shape
#     result_covid_dict[key] = r

In [None]:
# df = pd.DataFrame.from_dict(result_covid_dict, orient='index')
# df = df.reset_index()
# df.columns = ["day", "Number of tweets"]
# df.head()

# df.to_csv('COVIDTweetsPerDay.csv', index=False)

In [None]:
# result_asian_dict = {}

# for r in result_asian:
#     key = r[19:29]
#     df = pd.read_csv(r, lineterminator='\n')
#     r, c = df.shape
#     result_asian_dict[key] = r

In [None]:
# df = pd.DataFrame.from_dict(result_asian_dict, orient='index')
# df = df.reset_index()
# df.columns = ["day", "Number of tweets"]
# df.head()

# df.to_csv('AsianTweetsPerDay.csv', index=False)