In [22]:
# Modified based on Sub-Task-1 (Shi Su and Yuqi Sun)'s code by Jiahe Song
import pandas as pd
import nltk

In [23]:
# Load data
df_orig = pd.read_csv('twitter_validation.csv', names=["Twitter ID","Topic","Sentiment","Text"])

In [24]:
# Control size of data set
df = df_orig.iloc[0:1000]

# Preprocessing (delete username and url)
def preprocess(text):
    temp = []

    for t in text.split(" "): # split a sentence into words by spaces " ".
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        temp.append(t)
    return " ".join(temp)

# Change Irrelevant label into Neutral as mentioned in the data set description.
def adjust_ori_sentiment(sentiment):
    if sentiment == "Irrelevant":
        temp_str = "Neutral"
        return temp_str
    else:
        return sentiment

df['Text'] = df['Text'].apply(preprocess)
df['Sentiment'] = df['Sentiment'].apply(adjust_ori_sentiment)
df

Unnamed: 0,Twitter ID,Topic,Sentiment,Text
0,3364,Facebook,Neutral,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@user Why do I pay for WORD when it functions ...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Neutral,⭐️ Toronto is the arts and culture capital of ...
996,4359,CS-GO,Neutral,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [25]:
df_text = df["Text"]

In [26]:
df['Sentiment'].value_counts()

Neutral     457
Positive    277
Negative    266
Name: Sentiment, dtype: int64

In [27]:
from collections import Counter

words = df['Text'].str.split(" ")
word_counts = Counter(word for sentence in words for word in sentence)
top_words = word_counts.most_common(10)

# Print the top 10 most common words
for word, count in top_words:
    print(f'{word}: {count}')

the: 540
@user: 462
to: 409
and: 371
a: 363
I: 308
of: 282
is: 269
for: 225
in: 199


In [28]:
# Lower case
df_text = df_text.str.lower()
df_text

0      i mentioned on facebook that i was struggling ...
1      bbc news - amazon boss jeff bezos rejects clai...
2      @user why do i pay for word when it functions ...
3      csgo matchmaking is so full of closet hacking,...
4      now the president is slapping americans in the...
                             ...                        
995    ⭐️ toronto is the arts and culture capital of ...
996    this is actually a good move tot bring more vi...
997    today sucked so it’s time to drink wine n play...
998    bought a fraction of microsoft today. small wins.
999    johnson & johnson to stop selling talc baby po...
Name: Text, Length: 1000, dtype: object

In [29]:
## remove retweet
df_text = df_text.str.replace('@[\w]*', '', regex=True)

## remove &amp &quot &lt
df_text = df_text.str.replace('&\S+', '', regex=True)

In [30]:
df_text

0      i mentioned on facebook that i was struggling ...
1      bbc news - amazon boss jeff bezos rejects clai...
2       why do i pay for word when it functions so po...
3      csgo matchmaking is so full of closet hacking,...
4      now the president is slapping americans in the...
                             ...                        
995    ⭐️ toronto is the arts and culture capital of ...
996    this is actually a good move tot bring more vi...
997    today sucked so it’s time to drink wine n play...
998    bought a fraction of microsoft today. small wins.
999    johnson & johnson to stop selling talc baby po...
Name: Text, Length: 1000, dtype: object

In [31]:
### remove punctuations

In [32]:
import string
def remove_punctuation(text):
    # Remove punctuation using the `translate` method of strings
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    return text
df_text = df_text.apply(remove_punctuation)
df_text

0      i mentioned on facebook that i was struggling ...
1      bbc news  amazon boss jeff bezos rejects claim...
2       why do i pay for word when it functions so po...
3      csgo matchmaking is so full of closet hacking ...
4      now the president is slapping americans in the...
                             ...                        
995    ⭐️ toronto is the arts and culture capital of ...
996    this is actually a good move tot bring more vi...
997    today sucked so it’s time to drink wine n play...
998      bought a fraction of microsoft today small wins
999    johnson  johnson to stop selling talc baby pow...
Name: Text, Length: 1000, dtype: object

In [33]:
## remove stop words

In [34]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Song\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [35]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    # Split the text into words
    in_words = text.split()
    # Remove stop words using the `set()` function and list comprehension
    filtered_words = [word for word in in_words if word.lower() not in stop_words]
    # Join the remaining words into a string
    text = ' '.join(filtered_words)
    return text

# Apply the remove_stop_words function to each element in the Series
df_text = df_text.apply(remove_stop_words)
df_text

0      mentioned facebook struggling motivation go ru...
1      bbc news amazon boss jeff bezos rejects claims...
2                 pay word functions poorly chromebook 🙄
3      csgo matchmaking full closet hacking truly awf...
4      president slapping americans face really commi...
                             ...                        
995    ⭐️ toronto arts culture capital canada it’s wo...
996    actually good move tot bring viewers one peopl...
997    today sucked it’s time drink wine n play borde...
998           bought fraction microsoft today small wins
999    johnson johnson stop selling talc baby powder ...
Name: Text, Length: 1000, dtype: object

In [36]:
res_list = df_text.to_list()
df['Tidy_Text'] = res_list
df

Unnamed: 0,Twitter ID,Topic,Sentiment,Text,Tidy_Text
0,3364,Facebook,Neutral,I mentioned on Facebook that I was struggling ...,mentioned facebook struggling motivation go ru...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...,bbc news amazon boss jeff bezos rejects claims...
2,8312,Microsoft,Negative,@user Why do I pay for WORD when it functions ...,pay word functions poorly chromebook 🙄
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,...",csgo matchmaking full closet hacking truly awf...
4,4433,Google,Neutral,Now the President is slapping Americans in the...,president slapping americans face really commi...
...,...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Neutral,⭐️ Toronto is the arts and culture capital of ...,⭐️ toronto arts culture capital canada it’s wo...
996,4359,CS-GO,Neutral,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,actually good move tot bring viewers one peopl...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...,today sucked it’s time drink wine n play borde...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.,bought fraction microsoft today small wins


In [37]:
df.to_csv('cleaned_tweets.csv')