In [13]:
import pandas as pd
import re

# Dataset subset identifier. It is interpolated into the input path below and
# into the cleaned-output path written at the end of the notebook.
cur = 'A'
# Load the raw tweets for the selected subset (path is relative to the
# notebook's working directory).
df = pd.read_csv(f'data/COVIDSenti-{cur}.csv')
# Plain-text preview of the first rows.
print(df.head())

                                               tweet label
0  Coronavirus | Human Coronavirus Types | CDC ht...   neu
1  @shehryar_taseer That‚Äôs üíØ true , \nCorona...   neu
2  TLDR: Not SARS, possibly new coronavirus. Diff...   neg
3  Disease outbreak news from the WHO: Middle Eas...   neu
4  China - Media: WSJ says sources tell them myst...   neu


In [14]:
# Class-balance check: number of tweets per sentiment label.
df['label'].value_counts()

neu    22949
neg     5083
pos     1968
Name: label, dtype: int64

In [15]:
# Encode the string sentiment labels as integers: pos -> 1, neu -> 0, neg -> -1.
# This overwrites df['label'], so the original strings are no longer available
# in df after this cell runs. Any label not listed in the mapping is left as-is.
df['label'] = df['label'].replace({'pos': 1, 'neu': 0, 'neg': -1})

In [16]:
# X: tweet text that the following cells clean step by step.
# y: the sentiment labels (already integer-encoded by the previous cell).
X, y = df['tweet'], df['label']

In [17]:
# Lowercase every tweet so later pattern matching and stop-word lookup are
# case-insensitive. Assumes every entry is a str; a missing value would raise
# AttributeError here -- TODO confirm the column has no NaNs.
X = X.apply(lambda x: x.lower())
X.head()

0    coronavirus | human coronavirus types | cdc ht...
1    @shehryar_taseer that‚äôs üíø true , \ncorona...
2    tldr: not sars, possibly new coronavirus. diff...
3    disease outbreak news from the who: middle eas...
4    china - media: wsj says sources tell them myst...
Name: tweet, dtype: object

In [18]:
# Remove urls
def remove_urls(text):
    """Replace every URL in ``text`` with a single space.

    Two URL shapes are recognised (case-insensitively):

    * any run of non-whitespace characters starting with ``http`` (this also
      covers ``https``), even when it is glued to the preceding word;
    * a token starting with ``www.`` at the beginning of a word.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each URL replaced by one space. Whitespace is not
        collapsed here, so the result may contain runs of spaces.
    """
    # `http\S+` already matches `https...`, so no separate alternative is needed.
    # `\bwww\.` requires a word start and a literal dot; an unanchored `www\S+`
    # would also eat the tail of ordinary words such as "awwww".
    url_pattern = r'http\S+|\bwww\.\S+'
    return re.sub(url_pattern, ' ', text, flags=re.IGNORECASE)
# Run URL removal on every tweet; X is rebound to the resulting Series.
X = X.apply(remove_urls)

# Remove mentions
def remove_mentions(text):
    """Return ``text`` with each @-mention replaced by one space.

    A mention is an ``@`` immediately followed by one or more word characters
    (letters, digits, underscore). A lone ``@`` is left untouched. Whitespace
    is not collapsed, so the result may contain consecutive spaces.
    """
    handle_re = re.compile(r'@\w+')
    return handle_re.sub(' ', text)
# Run mention removal on every tweet; X is rebound to the resulting Series.
X = X.apply(remove_mentions)

# Preview the text after URL and mention removal.
X.head()

0        coronavirus | human coronavirus types | cdc  
1      that‚äôs üíø true , \ncorona virus \nswine ...
2    tldr: not sars, possibly new coronavirus. diff...
3    disease outbreak news from the who: middle eas...
4    china - media: wsj says sources tell them myst...
Name: tweet, dtype: object

In [19]:
# Replace newline characters with spaces. regex=True makes Series.replace
# substitute inside each string; without it only cells whose whole value
# equals the pattern would be replaced.
X = X.replace(r'\n', ' ', regex=True)

# Remove special characters
def remove_special_characters(text):
    """Return ``text`` with every special character turned into a space.

    A special character is anything that is not an ASCII letter, an ASCII
    digit, or whitespace. Each such character becomes exactly one space, so
    "that's" becomes "that s" and runs of symbols become runs of spaces.
    """
    # Splitting on each disallowed character and re-joining the pieces with a
    # space is the same as substituting a space for every match.
    kept_pieces = re.split(r'[^a-zA-Z0-9\s]', text)
    return ' '.join(kept_pieces)
# Run special-character removal on every tweet; X is rebound to the result.
X = X.apply(remove_special_characters)

# Remove numbers (every run of digits is replaced by a space)
def remove_numbers(text):
    """Return ``text`` with each run of digits replaced by one space.

    Digits embedded in a word are removed too ("covid19" becomes "covid ").
    Whitespace is not collapsed by this step.
    """
    without_digits, _replaced = re.subn(r'\d+', ' ', text)
    return without_digits
# Run digit removal on every tweet; X is rebound to the resulting Series.
X = X.apply(remove_numbers)

# Remove extra spaces
def remove_extra_spaces(text):
    """Normalise whitespace in ``text``.

    Every run of whitespace (spaces, tabs, newlines, ...) becomes a single
    space, and leading/trailing whitespace is dropped. A string that is empty
    or only whitespace yields the empty string.
    """
    # Splitting on whitespace runs leaves empty pieces only at the two ends;
    # dropping those is what trims the result.
    words = [piece for piece in re.split(r'\s+', text) if piece]
    return ' '.join(words)
# Collapse the runs of spaces left behind by the previous substitutions.
X = X.apply(remove_extra_spaces)

# Preview the text after character-level cleaning.
X.head()

0              coronavirus human coronavirus types cdc
1    that s true corona virus swine flue bird flu i...
2    tldr not sars possibly new coronavirus difficu...
3    disease outbreak news from the who middle east...
4    china media wsj says sources tell them mystery...
Name: tweet, dtype: object

In [20]:
# Spelling correction
# from textblob import TextBlob
# def correct_spelling(text):
#     return str(TextBlob(text).correct())

# X = X.apply(correct_spelling)
# X.head()

In [21]:
# Remove stop words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK resources this cell relies on. 'stopwords' backs the
# stopwords corpus read below; 'punkt_tab' is presumably the tokenizer data
# word_tokenize needs -- confirm against the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Keep the English stop words in a set so the membership test performed for
# every token in remove_stop_words is a hash lookup.
stop_words = set(stopwords.words('english'))
def remove_stop_words(text):
    """Return ``text`` with English stop words removed.

    The text is tokenised with ``word_tokenize``; tokens found in the
    module-level ``stop_words`` set are dropped and the remaining tokens are
    joined with single spaces.

    NOTE(review): the stop list also removes negations -- the recorded output
    shows "tldr not sars" becoming "tldr sars". That can change the apparent
    sentiment of a tweet; confirm this is intended for the downstream model.
    """
    kept_tokens = filter(lambda token: token not in stop_words, word_tokenize(text))
    return ' '.join(kept_tokens)

# Drop stop words from every tweet; X is rebound to the resulting Series.
X = X.apply(remove_stop_words)
# Preview the text after stop-word removal.
X.head()

[nltk_data] Downloading package punkt_tab to /Users/ltree/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ltree/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0              coronavirus human coronavirus types cdc
1    true corona virus swine flue bird flu december...
2    tldr sars possibly new coronavirus difficult c...
3    disease outbreak news middle east respiratory ...
4    china media wsj says sources tell mystery pneu...
Name: tweet, dtype: object

In [22]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet data (and the 'omw-1.4' package) before constructing the
# lemmatizer; presumably both are required by WordNetLemmatizer -- confirm
# against the installed NLTK version.
nltk.download('wordnet')
nltk.download('omw-1.4')

# A single shared lemmatizer instance, reused by lemmatize_text for every tweet.
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    """Return ``text`` with every token replaced by its WordNet lemma.

    The text is tokenised with ``word_tokenize``; each token is passed to the
    module-level ``lemmatizer`` and the lemmas are joined with single spaces.

    NOTE(review): ``lemmatize`` is called without a part-of-speech argument,
    so its default applies to every token. The recorded output shows
    "media" -> "medium" and "says" -> "say"; confirm that default is what the
    downstream model expects.
    """
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

# Lemmatize every tweet; X is rebound to the resulting Series.
X = X.apply(lemmatize_text)
# Preview the fully cleaned text.
X.head()

[nltk_data] Downloading package wordnet to /Users/ltree/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/ltree/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


0               coronavirus human coronavirus type cdc
1    true corona virus swine flue bird flu december...
2    tldr sars possibly new coronavirus difficult c...
3    disease outbreak news middle east respiratory ...
4    china medium wsj say source tell mystery pneum...
Name: tweet, dtype: object

In [23]:
# Pair the cleaned text with its labels. X and y both derive from df, so they
# are aligned on the same index.
new_df = pd.DataFrame({'tweet': X, 'label': y})
# Write next to the input file, using the same subset identifier; index=False
# keeps the row index out of the CSV.
# NOTE(review): a tweet that consisted only of URLs, mentions, symbols, digits
# or stop words is an empty string by now, and an empty CSV field is read back
# by pandas as NaN by default -- confirm whether such rows exist and should be
# filtered before saving.
new_df.to_csv(f'data/clean_COVIDSenti-{cur}.csv', index=False)