In [9]:
# Input locations for this notebook; both are relative to the current working directory.
DATA_PATH = "./sliced_data_900.csv"  # CSV holding the tweets to preprocess
# JSON emoticon dictionary. NOTE(review): not referenced by any cell visible here;
# presumably maps emoticon -> replacement word -- confirm before wiring it in.
EMOTICON_DATA_PATH = "./emoticon_dict.json"


In [10]:
import re
import ast
from typing import Union
import pandas as pd
from bs4 import BeautifulSoup

class TwitterDataPrep:
    """
    Clean and normalise raw tweet text stored in a pandas DataFrame.

    The full pipeline lives in ``preprocess_data``: byte-string literals are
    decoded, text is NFKD-normalised, Twitter noise (mentions, URLs, hashtags,
    retweet markers, ...) is stripped, and emoticons / emojis are replaced by
    words.
    """

    # Noise removed by ``clean_text``; compiled once instead of on every call.
    # Alternatives are tried left to right at each position.
    _NOISE_PATTERN = re.compile(
        r'|'.join((
            r'@\S+',          # @username; \S stops at newlines/tabs too, not only at ' '
            r'https?://\S+',  # http(s) URLs, including query strings, '-' and '_'
            r"'s\b",          # possessive 's (todo: check if it is necessary for the model)
            r'#\w+',          # hashtags
            r'&amp;?',        # HTML-escaped ampersand, with or without the ';'
            r'\bRT\b',        # retweet marker as a whole word only (keeps "SMART", "START")
            r'\bwww\.\S+',    # bare www. links
        ))
    )
    _WHITESPACE_PATTERN = re.compile(r'\s+')
    _SPACES_PATTERN = re.compile(r' +')

    def __init__(self, data_path, emoticons_dict=None):
        """
        :param data_path: path of the tweet data; only stored on the instance.
        :param emoticons_dict: optional mapping ``emoticon -> replacement word``
            used by ``replace_emoticons``. Defaults to an empty mapping, in
            which case emoticon replacement is a no-op.
        """
        self.data_path = data_path
        # Copy so later edits to the caller's mapping do not leak into this instance.
        self.emoticons_dict = dict(emoticons_dict) if emoticons_dict else {}

    def clean_text(self, text: str) -> str:
        """
        Clean the tweets in a basic way.

        Removes mentions, URLs, possessive ``'s``, hashtags, ``&amp;``,
        stand-alone ``RT`` markers and ``www.`` links, then collapses every
        run of whitespace into a single space.
        :param text: raw tweet text
        :return: cleaned text without leading/trailing whitespace
        """
        text = self._NOISE_PATTERN.sub('', text)
        text = self._WHITESPACE_PATTERN.sub(' ', text)
        return text.strip()

    def _parse_bytes(self, field: Union[str, ast.AST]) -> Union[str, ast.AST]:
        """ Convert string represented in Python byte-string literal syntax into a
        decoded character string. Other field types returned unchanged.
        :param field: string or bytestring
        :return: decoded string if ``field`` is a bytes literal, else ``field`` itself
        :raises UnicodeDecodeError: if the bytes literal cannot be decoded
        """
        try:
            parsed = ast.literal_eval(field)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (ordinary tweet text, NaN, ...): keep it as is.
            # Only the errors literal_eval is documented to raise are caught, so
            # KeyboardInterrupt and genuine bugs are no longer silently swallowed.
            return field
        # Literals that are not bytes (e.g. "123") deliberately stay in their string form.
        return parsed.decode() if isinstance(parsed, bytes) else field

    def replace_emoticons(self, text) -> str:
        """
        Replace emoticons in the text with their corresponding word.

        Longer emoticons are replaced first so that a short one (":)") cannot
        break a longer one that contains it (":))"). Runs of spaces are
        collapsed afterwards; the text is returned untouched when no emoticon
        dictionary is configured.
        :param text: tweet text
        :return: text with emoticons replaced by words
        """
        if not self.emoticons_dict:
            return text
        for emoticon in sorted(self.emoticons_dict, key=len, reverse=True):
            if not emoticon:
                # str.replace('', ...) would insert the word between every character.
                continue
            text = text.replace(emoticon, ' ' + self.emoticons_dict[emoticon] + ' ')
        return self._SPACES_PATTERN.sub(' ', text)

    def replace_emojis(self, text: str) -> str:
        """
        Replace emojis in the text with their corresponding word using demoji.

        NOTE(review): ``demoji`` is not imported in the cells visible here; it is
        presumably imported elsewhere in the notebook -- confirm it sits in the
        import cell so a fresh kernel can run this.
        :param text: tweet text
        :return: text with emojis replaced by their descriptions
        """
        found = demoji.findall(text)  # used as a mapping: emoji -> description
        if not found:
            return text
        # Longest first, so a multi-codepoint emoji is replaced before any
        # shorter emoji that is a substring of it.
        for emoji in sorted(found, key=len, reverse=True):
            text = text.replace(emoji, ' ' + found[emoji] + ' ')
        return self._SPACES_PATTERN.sub(' ', text)

    def preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Preprocess the tweets.

        The ``tweet`` column of ``df`` is overwritten in place and the same
        DataFrame object is returned. The column is assigned only once, after
        every step has succeeded, so a failure leaves ``df`` untouched.
        Missing values (NaN) are skipped instead of raising.
        :param df: dataframe with a ``tweet`` column
        :return: the same dataframe with the ``tweet`` column cleaned
        """
        print('Preprocessing data...')
        # open encoded file: decode b'...' literals into text
        tweets = df['tweet'].apply(self._parse_bytes)
        # normalize the text
        tweets = tweets.str.normalize('NFKD')
        # clean the text
        tweets = tweets.map(self.clean_text, na_action='ignore')
        # replace emoticons with custom dictionary
        tweets = tweets.map(self.replace_emoticons, na_action='ignore')
        # replace emojis
        tweets = tweets.map(self.replace_emojis, na_action='ignore')
        df['tweet'] = tweets
        print('Preprocessing done.')
        return df

    

In [17]:
# Build the preprocessing helper; the constructor only stores data_path on the instance.
twitter_data_prep = TwitterDataPrep(data_path=DATA_PATH)
# Bare last expression: shows the object's repr as the cell output.
twitter_data_prep

<__main__.TwitterDataPrep at 0x793ec6d91990>

In [19]:
# Load the raw tweets from the configured DATA_PATH instead of repeating the path literal,
# so the input file is changed in exactly one place.
temp_df = pd.read_csv(DATA_PATH)
# NOTE: preprocess_data overwrites temp_df['tweet'] in place and returns the same frame,
# so temp_df no longer holds the raw text after this line.
preprocessed_data = twitter_data_prep.preprocess_data(temp_df)
preprocessed_data

Preprocessing data...
Preprocessing done.


Unnamed: 0.1,Unnamed: 0,twitter_id,date,tweet,source_file
0,1,1.546540e+18,2022-07-11 17:00:00+00:00,Metaverse VR creates own Ecosystem with P2E Ga...,10Ronaldinho
1,2,1.546514e+18,2022-07-11 15:16:09+00:00,flag: Brazil Ronaldinho hot face | |,10Ronaldinho
2,3,1.545479e+18,2022-07-08 18:44:58+00:00,flag: Brazil flag: Brazil stopwatch 5 seasons...,10Ronaldinho
3,4,1.546629e+18,2022-07-11 22:55:57+00:00,Ele é o cara!,10neto
4,5,1.546603e+18,2022-07-11 21:12:11+00:00,"Ô Daronco, você já foi bem melhor como árb...",10neto
...,...,...,...,...,...
2679,2680,1.533259e+18,2022-06-05 01:25:31+00:00,heart suit folded hands: light skin tone whit...,zairana
2680,2681,1.533138e+18,2022-06-04 17:25:30+00:00,Hice este video resumen de mis últimas 72hs e...,zairana
2681,2682,1.543453e+18,2022-07-03 04:33:48+00:00,face with tears of joy sign of the horns: med...,zaynmalik
2682,2683,1.542884e+18,2022-07-01 14:51:06+00:00,Thank you for all the love and support - Today...,zaynmalik


In [21]:
# Persist the cleaned tweets; index=False keeps the DataFrame index out of the file.
# NOTE(review): the displayed frame already carries 'Unnamed: 0.1' / 'Unnamed: 0' columns,
# which look like indexes saved by earlier exports -- they are written out again here.
preprocessed_data.to_csv('./c_900.csv', index=False)