In [87]:
import pandas as pd

import spacy
from scispacy.abbreviation import AbbreviationDetector

from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

import html
import emoji
from googletrans import Translator

from pathlib import Path
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from database import database_manager as dbm

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/raimuu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


**Extract**

In [88]:
# Source archive, given relative to the notebook's working directory.
file_path = "../../data/raw/covid_twitter_labelled.zip"
# Read the zipped CSV directly into a DataFrame.
df = pd.read_csv(file_path, compression='zip')
# Write the first rows of Processed_Text to a file named "testing" in the
# current working directory.
# NOTE(review): this looks like a leftover debugging aid -- confirm it is still needed.
new_df = df["Processed_Text"].head()
new_df.to_csv("testing")
# Show the raw frame as loaded.
display(df)

Unnamed: 0.1,Unnamed: 0,index,text,date,truncated,Processed_Text,sentiment,new_sentiment
0,0,0,RT @danpfeiffer: Isn't wild that the President...,Thu Apr 02 00:27:49 +0000 2020,FALSE,danpfeiff isnt wild presid unit state own thou...,3,3
1,1,2,RT @Complex: 90-year-old woman dies from Coron...,Thu Apr 02 00:27:51 +0000 2020,FALSE,complex 90yearold woman die coronavirus ask do...,3,3
2,2,4,RT @jennycohn1: We need their help not so much...,Thu Apr 02 00:27:52 +0000 2020,FALSE,jennycohn1 need help much discuss happen 2016 ...,3,3
3,3,6,RT @AmbJehangir: Deadly but still funny (?)\nE...,Thu Apr 02 00:27:58 +0000 2020,FALSE,ambjehangir dead still funni even dead contagi...,1,1
4,4,8,RT @SethAbramson: BREAKING NEWS: Daily Coronav...,Thu Apr 02 00:28:24 +0000 2020,FALSE,sethabramson break news daili coronavirus deat...,3,3
...,...,...,...,...,...,...,...,...
119967,41954,89181,RT @kairyssdal: This is...\n3. An economic cri...,Fri Mar 06 00:04:20 +0000 2020,FALSE,kairyssd 3 econom crisi,2,1
119968,41955,89182,"After watching Billions, i basically learned t...",Fri Mar 06 00:04:20 +0000 2020,FALSE,watch billion basic learn make stock market mo...,3,3
119969,41956,89185,All Plymouth schools to be closed for cleaning...,Fri Mar 06 00:04:21 +0000 2020,FALSE,plymouth school close clean friday student tes...,3,3
119970,41957,89191,RT @PaulDoroshenko: Can you sue the president ...,Fri Mar 06 00:04:22 +0000 2020,FALSE,pauldoroshenko sue presid bad medic advic,0,0


**Transform**

In [89]:
# Keep only the date, the pre-processed text and the sentiment score.
# The raw `text` column is dropped and `Processed_Text` takes over its name.
# NOTE: this cell rebinds `df`, so it is meant to run once per fresh load of
# the raw data (a second run would no longer find the dropped columns).
columns_to_drop = ["text", "Unnamed: 0", "index", "truncated", "new_sentiment"]
columns_to_rename = {"Processed_Text": "text", "sentiment": "numeric_labelled_sentiment"}
df = (
    df
    .drop(columns=columns_to_drop)
    .reset_index(drop=True)
    .rename(columns=columns_to_rename)
)
display(df)


Unnamed: 0,date,text,numeric_labelled_sentiment
0,Thu Apr 02 00:27:49 +0000 2020,danpfeiff isnt wild presid unit state own thou...,3
1,Thu Apr 02 00:27:51 +0000 2020,complex 90yearold woman die coronavirus ask do...,3
2,Thu Apr 02 00:27:52 +0000 2020,jennycohn1 need help much discuss happen 2016 ...,3
3,Thu Apr 02 00:27:58 +0000 2020,ambjehangir dead still funni even dead contagi...,1
4,Thu Apr 02 00:28:24 +0000 2020,sethabramson break news daili coronavirus deat...,3
...,...,...,...
119967,Fri Mar 06 00:04:20 +0000 2020,kairyssd 3 econom crisi,2
119968,Fri Mar 06 00:04:20 +0000 2020,watch billion basic learn make stock market mo...,3
119969,Fri Mar 06 00:04:21 +0000 2020,plymouth school close clean friday student tes...,3
119970,Fri Mar 06 00:04:22 +0000 2020,pauldoroshenko sue presid bad medic advic,0


In [90]:
def unescape_html(text):
    """Return ``text`` with HTML character references decoded.

    Thin wrapper around ``html.unescape``; for example ``&amp;`` becomes ``&``.
    """
    decoded = html.unescape(text)
    return decoded

# Start from a copy that has no row with a missing value in any column.
df_clean = df.dropna().copy()

# Clean the text one step at a time; each step feeds the next.
cleaned_text = df_clean['text']
# Strip links.
# NOTE(review): the '.' after 'www' is unescaped, so it matches any character -- confirm this is intended.
cleaned_text = cleaned_text.str.replace(r'http\S+|www.\S+', '', regex=True)
# Turn newlines and tabs into plain spaces.
cleaned_text = cleaned_text.str.replace(r'[\n\t]', ' ', regex=True)
# Decode HTML character references (e.g. &amp; -> &).
cleaned_text = cleaned_text.apply(unescape_html)
# Replace emojis with their symbolic names.
cleaned_text = cleaned_text.apply(emoji.demojize)
# Drop every character that is not a word character, whitespace, or one of the listed punctuation marks.
cleaned_text = cleaned_text.str.replace(r'[^\w\s.,!?;:\-()\'"/&]', '', regex=True)
# Colons and underscores become spaces.
cleaned_text = cleaned_text.str.replace(":", " ")
cleaned_text = cleaned_text.str.replace("_", " ")
df_clean['text'] = cleaned_text

# Remove rows whose cleaned text is a duplicate (first occurrence is kept),
# then renumber the index from 0.
df_clean = df_clean.drop_duplicates(subset='text').reset_index(drop=True)



In [91]:
# convert numeric_labelled_sentiment to labelled_sentiment
def normalize_sent(numeric_sent):
    """Map a numeric sentiment score to a text label.

    The score is converted with ``int()`` first. A value of 2 is
    "neutral", anything above 2 is "positive" and anything below 2 is
    "negative".
    """
    score = int(numeric_sent)
    if score == 2:
        return "neutral"
    return "positive" if score > 2 else "negative"

# Derive the text label from the numeric score, one value at a time.
df_clean['labelled_sentiment'] = df_clean['numeric_labelled_sentiment'].map(normalize_sent)

**Load**

In [None]:
%%script true
# NOTE: the `%%script true` magic on the first line hands this cell's body to
# the shell command `true` instead of the Python kernel, so nothing below is
# executed; remove that line to actually run the load step.
# Write the cleaned frame to the database table named below.
# NOTE(review): replace=True presumably overwrites an existing table -- confirm in database_manager.
newTable = "processed_labelled_twitter_data"
dbm.create_table(table_name=newTable, dataframe=df_clean, replace=True)

# Read the table back to inspect what was stored.
query = f"""
    SELECT * FROM "{newTable}"
"""
# NOTE: this rebinds `df` (previously the transformed input frame) to the query result.
df = dbm.query_db(query)
display(df)

The table 'processed_labelled_twitter_data' already exists. Replacing entries.


Unnamed: 0,id,date,text,numeric_labelled_sentiment,labelled_sentiment
0,0,Thu Apr 02 00:27:49 +0000 2020,danpfeiff isnt wild presid unit state own thou...,3,positive
1,1,Thu Apr 02 00:27:51 +0000 2020,complex 90yearold woman die coronavirus ask do...,3,positive
2,2,Thu Apr 02 00:27:52 +0000 2020,jennycohn1 need help much discuss happen 2016 ...,3,positive
3,3,Thu Apr 02 00:27:58 +0000 2020,ambjehangir dead still funni even dead contagi...,1,negative
4,4,Thu Apr 02 00:28:24 +0000 2020,sethabramson break news daili coronavirus deat...,3,positive
...,...,...,...,...,...
64942,64942,Fri Mar 06 00:04:19 +0000 2020,rtcom moscow declar high alert coronavirus thr...,3,positive
64943,64943,Fri Mar 06 00:04:20 +0000 2020,kairyssd 3 econom crisi,2,neutral
64944,64944,Fri Mar 06 00:04:20 +0000 2020,watch billion basic learn make stock market mo...,3,positive
64945,64945,Fri Mar 06 00:04:21 +0000 2020,plymouth school close clean friday student tes...,3,positive
