In [63]:
import bz2
import pandas as pd
import os

In [39]:
def read_tsv_bz2(file_path):
    with bz2.open(file_path, "rt") as file:
        df = pd.read_csv(file, delimiter="\t", header=None)
    return df

## dataset source
Yin, Wenjie; Alkhalifa, Rabab; Zubiaga, Arkaitz (2021). TM-Senti. figshare. Dataset. https://doi.org/10.6084/m9.figshare.16438281.v1

In [41]:
file_paths = [
    # "../dataset/TM-Senti/ar-ids.tsv.bz2",
    # "../dataset/TM-Senti/de-ids.tsv.bz2",
    "../dataset/TM-Senti/en-ids.tsv.bz2",
    # "../dataset/TM-Senti/es-ids.tsv.bz2",
    # "../dataset/TM-Senti/fr-ids.tsv.bz2",
    # "../dataset/TM-Senti/it-ids.tsv.bz2",
    # "../dataset/TM-Senti/zh-ids.tsv.bz2"
]

# read all datasets
dfs = [read_tsv_bz2(file_path) for file_path in file_paths]

# print the first 5 rows of each dataset
for i, df in enumerate(dfs):
    print(f"Data from {file_paths[i]}:")
    # date \t tweetid \t sentiment \t evidence
    df.columns = ["date", "tweetid", "sentiment", "evidence"]
    print(df.head())
    print(len(df))
    print("\n")

# print the df with the date "2020-01-01"
print(df[df["date"] == "2020-01-01"])

Data from ./dataset/TM-Senti/en-ids.tsv.bz2:
         date             tweetid sentiment evidence
0  2013-01-01  286003826803240960       pos        ✌
1  2013-01-01  286003826811625472       neg        😒
2  2013-01-01  286003826815807489       pos        😊
3  2013-01-01  286003830993326080       pos        😘
4  2013-01-01  286003831001710592       pos       :)
117411852


                 date              tweetid sentiment evidence
111100330  2020-01-01  1212267211261267969       pos        😇
111100331  2020-01-01  1212267211282436097       pos        🥳
111100332  2020-01-01  1212267211299151874       pos        🥳
111100333  2020-01-01  1212267232245551104       neg      😭,😭
111100334  2020-01-01  1212267244828418050       neg      😭,😭
...               ...                  ...       ...      ...
111132434  2020-01-01  1212629544617828352       pos        😘
111132435  2020-01-01  1212629557204922368       neg      😭,😭
111132436  2020-01-01  1212629565576601600       pos        😍
11113

In [42]:
# save the 2020-01 data to a csv file
en_2020_tweet_data = df[df["date"].str.contains("2020-01")].copy()

print(en_2020_tweet_data.head())
print(len(en_2020_tweet_data))



                 date              tweetid sentiment evidence
111100330  2020-01-01  1212267211261267969       pos        😇
111100331  2020-01-01  1212267211282436097       pos        🥳
111100332  2020-01-01  1212267211299151874       pos        🥳
111100333  2020-01-01  1212267232245551104       neg      😭,😭
111100334  2020-01-01  1212267244828418050       neg      😭,😭
942466


In [43]:
# pos -> 1, neg -> 0
en_2020_tweet_data["sentiment"] = en_2020_tweet_data["sentiment"].apply(lambda x: 1 if x == "pos" else 0)

# rearrange index
en_2020_tweet_data.reset_index(drop=True, inplace=True)

print(en_2020_tweet_data.head())
# save to tsv
en_2020_tweet_data.to_csv("../dataset/TM-Senti/en-2020-01.tsv", sep="\t", index=False)

         date              tweetid  sentiment evidence
0  2020-01-01  1212267211261267969          1        😇
1  2020-01-01  1212267211282436097          1        🥳
2  2020-01-01  1212267211299151874          1        🥳
3  2020-01-01  1212267232245551104          0      😭,😭
4  2020-01-01  1212267244828418050          0      😭,😭


## Add orign tweet data

Concat parts of TM-Senti data with its origin tweet(2020-01-01~2020-01-05 with 2020-01-03 excluded due to missing soure tweets data)


In [44]:
en_2020_01_TM_Senti = pd.read_csv("../dataset/TM-Senti/en-2020-01.tsv", delimiter="\t")

print(len(en_2020_01_TM_Senti))

942466


In [45]:
# get all the tweet csv file in folder /dataset/tweets_data
tweet_files = os.listdir("../dataset/tweets_data")
tweet_files = [file for file in tweet_files if file.endswith(".csv")]
en_2020_0101_tweet_raw_data = pd.DataFrame()
for file in tweet_files:
    print(file)
    df = pd.read_csv(f"../dataset/tweets_data/{file}")
    en_2020_0101_tweet_raw_data = pd.concat([en_2020_0101_tweet_raw_data, df])


tweets_data_2020_01_01.csv
tweets_data_2020_01_02.csv
tweets_data_2020_01_04.csv
tweets_data_2020_01_05.csv


In [46]:
print(en_2020_0101_tweet_raw_data.head())
print(en_2020_01_TM_Senti.head())

                    id                                               text
0  1212267211265458176                    ガチャ結果によってはサプでタミン取ろうかと思ったけどダマがない
1  1212267211261280256         Kind Lady(interlude) / OKUYATOS/ CSDDRMAX2
2  1212267211290624001                                          ワイ、三国11連敗
3  1212267211261267969  Look y’all a bitch going to sleep. Happy New Y...
4  1212267211298983939  RT @icedflatwhite: ถ้าปีหน้าเผลอไปด่าใครก็นั่น...
         date              tweetid  sentiment evidence
0  2020-01-01  1212267211261267969          1        😇
1  2020-01-01  1212267211282436097          1        🥳
2  2020-01-01  1212267211299151874          1        🥳
3  2020-01-01  1212267232245551104          0      😭,😭
4  2020-01-01  1212267244828418050          0      😭,😭


In [51]:
merged_data = pd.merge(en_2020_01_TM_Senti, en_2020_0101_tweet_raw_data, left_on="tweetid", right_on="id")
merged_data.drop(columns=["id"], inplace=True)

         date              tweetid  sentiment evidence  \
0  2020-01-01  1212267211261267969          1        😇   
1  2020-01-01  1212267211282436097          1        🥳   
2  2020-01-01  1212267211299151874          1        🥳   
3  2020-01-01  1212267232245551104          0      😭,😭   
4  2020-01-01  1212267244828418050          0      😭,😭   

                                                text  
0  Look y’all a bitch going to sleep. Happy New Y...  
1  To our Mountain time Moose Chucklers:\n\n🏔️🥳HA...  
2  Happy New Year, Denver! 🥳🥂 https://t.co/xo9qCT...  
3  @BTS_twt yoongi i love you so so so so so so s...  
4                           What an amazing night! 😭  


In [52]:
merged_data.drop(columns=["date"], inplace=True)

In [54]:
# eliminate null or duplicate values
merged_data.dropna(inplace=True)
merged_data.drop_duplicates(inplace=True)

In [None]:
print(len(merged_data))

merged_data.to_csv("../dataset/process/en-2020-01-merged.tsv", sep="\t", index=False)

# Clean data

In [79]:
import re

In [80]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # remove url
    text = re.sub(r"@\S+", "", text)     # remove @
    text = re.sub(r"#\S+", "", text)     # remove hashtag
    text = re.sub(r"[\n\t]", " ", text)   # remove \n and \t
    text = re.sub(r"\s+", " ", text)     # remove extra whitespace
    text = re.sub(r"RT", "", text)       # remove RT
    text = re.sub(r"pic.\S+", "", text)  # remove pic
    text.strip()  # remove leading and trailing whitespace 
    text = text.lower()  # convert to lowercase
    
    return text

In [81]:
merged_raw_data = pd.read_csv("../dataset/process/en-2020-01-merged.tsv", delimiter="\t")

In [82]:
# remove the link or url in the text
merged_raw_data["text"] = merged_raw_data["text"].apply(clean_text)

print(merged_raw_data.head())

merged_raw_data.to_csv("../dataset/process/en-2020-01-merged-cleaned.tsv", sep="\t", index=False)

               tweetid  sentiment evidence  \
0  1212267211261267969          1        😇   
1  1212267211282436097          1        🥳   
2  1212267211299151874          1        🥳   
3  1212267232245551104          0      😭,😭   
4  1212267244828418050          0      😭,😭   

                                                text  
0  look y’all a bitch going to sleep. happy new y...  
1  to our mountain time moose chucklers: 🏔️🥳happy...  
2                        happy new year, denver! 🥳🥂   
3   yoongi i love you so so so so so so so so so ...  
4                           what an amazing night! 😭  


In [83]:
print(len(merged_raw_data))
print(merged_raw_data["text"].isnull().sum())
print(merged_raw_data["text"].duplicated().sum())
print(merged_raw_data["text"].values[:10])

129670
0
1743
['look y’all a bitch going to sleep. happy new years tho 😇'
 'to our mountain time moose chucklers: 🏔️🥳happy new year! 🥳🏔️ from all of us at ! '
 'happy new year, denver! 🥳🥂 '
 ' yoongi i love you so so so so so so so so so much 🥺😭💗💓💞💝💘💕💖💝💞💓💗💘💕💖 '
 'what an amazing night! 😭'
 ' happy new year!! 🎆🎆🎉🎉 thank you for sharing the beautiful art!🥰🥰'
 ' happy birthday b !! i hope you had a good new year and i hope your birthday goes well! 🥳🥳🎊'
 'happy new year, everyone!!! 🥳 - 2 piggys 🤪 ' 'it’s new years🥳'
 'happy new year 🥳 ']


In [84]:
import emoji

In [85]:
# remove emoji
def remove_emoji(text):
    return emoji.replace_emoji(text, replace='')

merged_raw_data["text"] = merged_raw_data["text"].apply(remove_emoji)

# remove emoticons
def remove_emoticons(text):
    emoticon_pattern = re.compile(r'[\d\/\*\:\)\.\?\^\;?\-_\'~!\<\>\=\"#&$%\\\{\}\|\[\]ç\+ω○\@¡éı・…¡\`：）♡ӳ！“”à≧∇≦♂ş≈¬⊄─✔•×ü–₹。ó°ʖ—¶ķñ฿ĺ∑；⏸][\d\/\*\:\)\.\?\^\;?\-_\'~!\<\>\=\"#&$%\\\{\}\|\[\]ç\+ω○\@¡éı・…¡\`：）♡ӳ！“”à≧∇≦♂ş≈¬⊄─✔•×ü–₹。ó°ʖ—¶ķñ฿ĺ∑；⏸]')
    return emoticon_pattern.sub(r'', text)

merged_raw_data["text"] = merged_raw_data["text"].apply(remove_emoticons)
print(merged_raw_data.head())

merged_raw_data.to_csv("../dataset/process/en-2020-01-merged-cleaned-without-emoji.tsv", sep="\t", index=False)

               tweetid  sentiment evidence  \
0  1212267211261267969          1        😇   
1  1212267211282436097          1        🥳   
2  1212267211299151874          1        🥳   
3  1212267232245551104          0      😭,😭   
4  1212267244828418050          0      😭,😭   

                                                text  
0  look y’all a bitch going to sleep. happy new y...  
1  to our mountain time moose chucklers: happy ne...  
2                          happy new year, denver!    
3   yoongi i love you so so so so so so so so so ...  
4                            what an amazing night!   


In [86]:
data_clean = pd.read_csv("../dataset/process/en-2020-01-merged-cleaned.tsv", delimiter="\t")

print(data_clean[['sentiment', 'evidence', 'text']][0:15].values)

[[1 '😇' 'look y’all a bitch going to sleep. happy new years tho 😇']
 [1 '🥳'
  'to our mountain time moose chucklers: 🏔️🥳happy new year! 🥳🏔️ from all of us at ! ']
 [1 '🥳' 'happy new year, denver! 🥳🥂 ']
 [0 '😭,😭'
  ' yoongi i love you so so so so so so so so so much 🥺😭💗💓💞💝💘💕💖💝💞💓💗💘💕💖 ']
 [0 '😭,😭' 'what an amazing night! 😭']
 [1 '🥰'
  ' happy new year!! 🎆🎆🎉🎉 thank you for sharing the beautiful art!🥰🥰']
 [1 '🥳'
  ' happy birthday b !! i hope you had a good new year and i hope your birthday goes well! 🥳🥳🎊']
 [1 '🥳' 'happy new year, everyone!!! 🥳 - 2 piggys 🤪 ']
 [1 '🥳' 'it’s new years🥳']
 [1 '🥳' 'happy new year 🥳 ']
 [1 '😊,😍'
  '😊 happy new year for real. the time is here. thought i’d be sleeping by now.🎉🎊😍🎈⭐️🕸🐾❄️💫']
 [0 '😭,😭'
  'i’m so done up i keep opening instagram thinking it’s twitter 😭😭']
 [1 '👍' 'we support you 👍🏽']
 [1 '🥳' 'hello 2020. 🥳']
 [1 '😌,😌' 'im going thru it 😌😌']]


In [87]:
# count the number of positive and negative sentiment
print(data_clean["sentiment"].value_counts())

# count the number of positive and negative sentiment
print(data_clean["sentiment"].value_counts(normalize=True))

sentiment
1    66006
0    63664
Name: count, dtype: int64
sentiment
1    0.509031
0    0.490969
Name: proportion, dtype: float64


In [88]:
# count the number of unique words

unique_words = set()
data_clean['text'].astype(str).apply(lambda x: unique_words.update(x.split()))
print(f"Number of unique words: {len(unique_words)}")

# count the number of unique emoji
def extract_emojis(text):
    return [char for char in text if char in emoji.EMOJI_DATA]
unique_emoji = set()
data_clean['text'].astype(str).apply(lambda x: unique_emoji.update(extract_emojis(x)))
print(f"Number of unique emoji: {len(unique_emoji)}")

Number of unique words: 124709
Number of unique emoji: 1015
