Raw data access and clean text and emojis

In [3]:
import pandas as pd
import re
import emoji
import nltk
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from collections import Counter
from textblob import TextBlob

In [24]:
# read txt file
def read_txt(file):
    with open(file, 'r') as f:
        data = f.read().splitlines()
    return data

get data from tweetval: https://github.com/cardiffnlp/tweeteval/tree/main/datasets/sentiment

In [25]:
test_files = ['../dataset/tweetval/test_text.txt', '../dataset/tweetval/test_labels.txt']
train_files = ['../dataset/tweetval/train_text.txt', '../dataset/tweetval/train_labels.txt']
val_files = ['../dataset/tweetval/val_text.txt', '../dataset/tweetval/val_labels.txt']

test_text = read_txt(test_files[0])
test_labels = read_txt(test_files[1])
train_text = read_txt(train_files[0])
train_labels = read_txt(train_files[1])
val_text = read_txt(val_files[0])
val_labels = read_txt(val_files[1])


In [26]:
# create dataframe
def create_df(text, labels):
    df = pd.DataFrame({'text': text, 'sentiment': labels})
    return df

# cocatenate train, val and test data
train_df = create_df(train_text, train_labels)
val_df = create_df(val_text, val_labels)
test_df = create_df(test_text, test_labels)

new_df = pd.concat([train_df, val_df, test_df], ignore_index=True)

# save the dataframe as tsv file
new_df.to_csv('../dataset/tweetval/tweetval_data.tsv', sep='\t', index=False)

clean data

In [11]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # remove url
    text = re.sub(r"@\S+", "", text)     # remove @
    text = re.sub(r"#\S+", "", text)     # remove hashtag
    text = re.sub(r"[\n\t]", " ", text)   # remove \n and \t
    text = re.sub(r"\s+", " ", text)     # remove extra whitespace
    text = re.sub(r"RT", "", text)       # remove RT
    text = re.sub(r"pic.\S+", "", text)  # remove pic
    text.strip()  # remove leading and trailing whitespace 
    text = text.lower()  # convert to lowercase
    
    return text

In [19]:
# read the tsv file
merged_raw_data  = pd.read_csv('../dataset/tweetval/tweetval_data.tsv', sep='\t')

In [14]:
# remove the link or url in the text
merged_raw_data["text"] = merged_raw_data["text"].apply(clean_text)

print(merged_raw_data.head())

merged_raw_data.to_csv("../dataset/tweetval/tweetval_data_cleaned.tsv", sep='\t', index=False)

                                                text  sentiment
0  "qt in the original draft of the 7th book, rem...          2
1  "ben smith / smith (concussion) remains out of...          1
2  sorry bout the stream last night i crashed out...          1
3  chase headley's rbi double in the 8th inning o...          1
4   alciato: bee will invest 150 million in janua...          2


In [18]:
# remove emoji
def remove_emoji(text):
    return emoji.replace_emoji(text, replace='')

merged_raw_data["text"] = merged_raw_data["text"].apply(remove_emoji)

# remove emoticons
def remove_emoticons(text):
    emoticon_pattern = re.compile(r'[\d\/\*\:\)\.\?\^\;?\-_\'~!\<\>\=\"#&$%\\\{\}\|\[\]ç\+ω○\@¡éı・…¡\`：）♡ӳ！“”à≧∇≦♂ş≈¬⊄─✔•×ü–₹。ó°ʖ—¶ķñ฿ĺ∑；⏸][\d\/\*\:\)\.\?\^\;?\-_\'~!\<\>\=\"#&$%\\\{\}\|\[\]ç\+ω○\@¡éı・…¡\`：）♡ӳ！“”à≧∇≦♂ş≈¬⊄─✔•×ü–₹。ó°ʖ—¶ķñ฿ĺ∑；⏸]')
    return emoticon_pattern.sub(r'', text)

merged_raw_data["text"] = merged_raw_data["text"].apply(remove_emoticons)
print(merged_raw_data.head())

merged_raw_data.to_csv("../dataset/tweetval/tweetval_data_cleaned_without_emoji_emoticons.tsv", sep='\t', index=False)

                                                text  sentiment
0  "qt in the original draft of the 7th book, rem...          2
1  "ben smith / smith (concussion) remains out of...          1
2  sorry bout the stream last night i crashed out...          1
3  chase headley's rbi double in the 8th inning o...          1
4   alciato: bee will invest 0 million in january...          2


get another data from this dataset: https://github.com/BaleChen/emoji-setiment-analysis/tree/main/Data/emoji2vec_original_files

In [37]:
import pickle
import pandas as pd
from collections import defaultdict

In [38]:
class TweetTrainingExample:
    """Structure holding a Tweet Training example"""

    def __init__(self, id, text, label):
        """Create the training example

        Args:
            id: ID of the example
            text: text of the example
            label: example label
        """
        self.id = id
        self.text = text
        self.label = label

    def __repr__(self):
        return str.format('{}, {}, {}\n', self.id, self.label, self.text)

In [39]:
files = ["examples.p"]
data = defaultdict(list)

for file in files:
    with open(f'../dataset/example/{file}', 'rb') as f:
        examples = pickle.load(f)
        for example in examples.values():
            data['label'].append(example.label)
            data['text'].append(example.text)

In [40]:
df = pd.DataFrame(data)

print(df.head())

In [42]:
df = df[df['text'].notnull()]

df = df.reset_index(drop=True)

print(len(df))

64599


In [43]:
# count the number of negative, neutral and positive tweets
print(df['label'].value_counts())


label
Neutral     29722
Positive    18611
Negative    16266
Name: count, dtype: int64


In [45]:
label_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
df['sentiment'] = df['label'].map(label_mapping)

In [46]:
print(df)

#  count the number of negative, neutral and positive tweets  
print(df['sentiment'].value_counts())

          label                                               text  sentiment
0      Negative              @alexsnowden_ what the hell snowballs          0
1      Positive  RT @5SOS: Our bus is 100% rock n roll http://t...          2
2      Positive  RT @InnocentLaTre: The game has changed 😂💀 htt...          2
3       Neutral  RT @TRILOGILINSKY: YO IS THIS A VINE HES SO FU...          1
4      Negative  I hate wearing clothes like i just wanna walk ...          0
...         ...                                                ...        ...
64594  Positive  Mabaiiiit sobraaa. And then a sweet one! ☺️☺️ ...          2
64595   Neutral  The truth about education spending - The Progr...          1
64596  Positive     @LaciePassmore lol Jeremy is bootylicious dude          2
64597  Negative  @MannanJamil17 so am I hopefully you lot will ...          0
64598  Positive  The whole time we're just laughing talking bou...          2

[64599 rows x 3 columns]
sentiment
1    29722
2    18611
0    1

In [44]:
# read the tsv file
merged_raw_data  = pd.read_csv('../dataset/tweetval/tweetval_data.tsv', sep='\t')

#  count the number of negative, neutral and positive tweets
print(merged_raw_data['sentiment'].value_counts())

sentiment
1    27479
2    21043
0    11377
Name: count, dtype: int64


concat two dataset

In [26]:
# read the tsv file
merged_raw_data  = pd.read_csv('../dataset/tweetval/tweetval_data.tsv', sep='\t')

# concat the two dataframes
new_df = pd.concat([df, merged_raw_data], axis=0)

print(new_df)


          label                                               text  sentiment
0      Negative              @alexsnowden_ what the hell snowballs          0
1      Positive  RT @5SOS: Our bus is 100% rock n roll http://t...          2
2      Positive  RT @InnocentLaTre: The game has changed 😂💀 htt...          2
3       Neutral  RT @TRILOGILINSKY: YO IS THIS A VINE HES SO FU...          1
4      Negative  I hate wearing clothes like i just wanna walk ...          0
...         ...                                                ...        ...
59894       NaN  Sentinel Editorial: FBI’s Comey ‘had no one of...          1
59895       NaN  perfect pussy clips #vanessa hudgens zac efron...          1
59896       NaN  #latestnews 4 #newmexico #politics + #nativeam...          1
59897       NaN  Trying to have a conversation with my dad abou...          0
59898       NaN  @user You are a stand up guy and a Gentleman V...          2

[124498 rows x 3 columns]


In [27]:
label_mapping = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
new_df['label'] = new_df['sentiment'].map(label_mapping)


In [30]:
new_df.to_csv('../dataset/process/tweets.tsv', sep='\t', index=False)

In [31]:
import re
import emoji

def count_emoji(text):
    # Use regex to find all Unicode emoji characters
    emoji_pattern = re.compile(r'([\U0001F600-\U0001F64F]|[\U0001F300-\U0001F5FF]|[\U0001F680-\U0001F6FF]|[\U0001F1E0-\U0001F1FF])')
    emojis = emoji_pattern.findall(text)
    
    # If the list of emojis is not empty, the text contains an emoji
    return bool(emojis)

new_df["has_emoji"] = new_df["text"].apply(count_emoji)
print(new_df.head())

print(new_df["has_emoji"].value_counts())

# print text with emoji
print(new_df[new_df["has_emoji"] == True]["text"].values[:10])

      label                                               text  sentiment  \
0  Negative              @alexsnowden_ what the hell snowballs          0   
1  Positive  RT @5SOS: Our bus is 100% rock n roll http://t...          2   
2  Positive  RT @InnocentLaTre: The game has changed 😂💀 htt...          2   
3   Neutral  RT @TRILOGILINSKY: YO IS THIS A VINE HES SO FU...          1   
4  Negative  I hate wearing clothes like i just wanna walk ...          0   

   has_emoji  
0      False  
1      False  
2       True  
3      False  
4      False  
has_emoji
False    113065
True      11433
Name: count, dtype: int64
['RT @InnocentLaTre: The game has changed 😂💀 http://t.co/4TqPV5eqCY'
 '“@Tvga_: @theninimarley bootayyyyyy !!!”massive booty 😍'
 'Like can I just ignore you 😂😂😂😂'
 '@Bon_Qui_Quii lmaoo akuaaaaa 😭😭 we both are going to mss'
 'RT @wOwBoice: 😍😍😍😍😍"@boice106: #CSPH  Greedy man\nCr.ahebrewprincess\nhttp://t.co/MdeBsAycKF"'
 'Chelsea just fell off her chair 😂😂👏👏'
 '@Lisa_higgins0 th

clean data

In [4]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # remove url
    text = re.sub(r"@\S+", "", text)     # remove @
    text = re.sub(r"#\S+", "", text)     # remove hashtag
    text = re.sub(r"[\n\t]", " ", text)   # remove \n and \t
    text = re.sub(r"\s+", " ", text)     # remove extra whitespace
    text = re.sub(r"RT", "", text)       # remove RT
    text = re.sub(r"pic.\S+", "", text)  # remove pic
    text.strip()  # remove leading and trailing whitespace 
    text = text.lower()  # convert to lowercase
    
    return text

In [None]:
# read the tsv file
merged_raw_data = pd.read_csv('../dataset/process/tweets_convert.tsv', sep='\t')

merged_raw_data ['text'] = merged_raw_data ['text'].astype(str)

# remove the link or url in the text
merged_raw_data["text"] = merged_raw_data["text"].apply(clean_text)

print(merged_raw_data.head())

merged_raw_data.to_csv("../dataset/process/tweets_convert_cleaned.tsv", sep='\t', index=False)

In [None]:
# remove emoji
def remove_emoji(text):
    return emoji.replace_emoji(text, replace='')

merged_raw_data["text"] = merged_raw_data["text"].apply(remove_emoji)

# remove emoticons
def remove_emoticons(text):
    emoticon_pattern = re.compile(r'[\d\/\*\:\)\.\?\^\;?\-_\'~!\<\>\=\"#&$%\\\{\}\|\[\]ç\+ω○\@¡éı・…¡\`：）♡ӳ！“”à≧∇≦♂ş≈¬⊄─✔•×ü–₹。ó°ʖ—¶ķñ฿ĺ∑；⏸][\d\/\*\:\)\.\?\^\;?\-_\'~!\<\>\=\"#&$%\\\{\}\|\[\]ç\+ω○\@¡éı・…¡\`：）♡ӳ！“”à≧∇≦♂ş≈¬⊄─✔•×ü–₹。ó°ʖ—¶ķñ฿ĺ∑；⏸]')
    return emoticon_pattern.sub(r'', text)

merged_raw_data["text"] = merged_raw_data["text"].apply(remove_emoticons)
print(merged_raw_data.head())

merged_raw_data.to_csv("../dataset/process/tweets_convert_without_emoji_emoticons.tsv", sep='\t', index=False)