In [1]:
#import necessary libraries
import pandas as pd
import re
import preprocessor as p
import nltk
from nltk.tokenize import TweetTokenizer
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
nltk.download
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
import spacy

# To ignore warnings
import warnings

warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kusalkasilva/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kusalkasilva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kusalkasilva/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
#Read csv file
tweet_df = pd.read_csv("Tweet_Dataset.csv")

In [3]:
#Check stats - no null values
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247007 entries, 0 to 247006
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   author id   247007 non-null  object
 1   created_at  247007 non-null  object
 2   geo         247007 non-null  object
 3   id          247007 non-null  object
 4   language    247007 non-null  object
 5   source      247007 non-null  object
 6   tweet       247007 non-null  object
 7   location    247007 non-null  object
 8   bbox        247007 non-null  object
dtypes: object(9)
memory usage: 17.0+ MB


In [4]:
#replacing hashing
tweet_df["clean_tweet"] = None

for i in range(len(tweet_df)):
    tweet_df['clean_tweet'][i] = tweet_df['tweet'][i].replace("#", "")

In [5]:
#Text-Cleaning (URLs, Mentions,etc, non alphabet characters, but noticed that full stops, question marks weren't removed) 
#- cleaning is done using tweet-preprocessor package

for i in range(len(tweet_df)):
    tweet_df['clean_tweet'][i] = p.clean(tweet_df['tweet'][i])

In [6]:
#output
tweet_df[['tweet','clean_tweet']].sample(2)

Unnamed: 0,tweet,clean_tweet
205824,@EuropaLeague braga the local police are the b...,braga the local police are the biggest bunch o...
241851,@SergeantCustard @AmazonUK THANK YOU. I bet it...,THANK YOU. I bet it's a tough book to read jus...


In [7]:
#Filtering using regex
for i in range (len(tweet_df['clean_tweet'])):
    tweet_df['clean_tweet'][i] = re.sub(r"(@[A-Za-z0-9_]+)|[^\w\s]|#|http\S+", "", tweet_df['clean_tweet'][i])

In [8]:
tweet_df[['tweet','clean_tweet']].sample(2)

Unnamed: 0,tweet,clean_tweet
201279,@bmj_latest @TheBMA @NHSEmployers @AoMRC @theR...,Theres no excuse but how about you examine Cau...
29789,Great watching @SocialistTelly tonight with @0...,Great watching tonight with and Really good a...


In [9]:
#lowercasing
lower_text = tweet_df['clean_tweet'].str.lower()
tweet_df['clean_tweet'] = lower_text

In [10]:
tweet_df[['tweet','clean_tweet']].sample(2)

Unnamed: 0,tweet,clean_tweet
74913,"@rmorganbentley ""HOW TO MURDER YOUR WIFE AND G...",how to murder your wife and get away with
133583,Double homicide https://t.co/tmNkDt2WoC,double homicide


In [11]:
#Tokenization
w_tokenizer =  TweetTokenizer()

# Lemmatization using spaCy
nlp = spacy.load('en_core_web_sm')
tweet_df["lemmatized_tweet"] = lower_text.apply(lambda row: " ".join([w.lemma_ for w in nlp(row)]))
tweet_df['lemmatized_tweet'] = tweet_df['lemmatized_tweet'].apply(w_tokenizer.tokenize)

In [12]:
#Function for stemming
stemmer = SnowballStemmer("english")
w_tokenizer =  TweetTokenizer()
 
def stem_text(text):
    return [(stemmer.stem(w)) for w \
                       in w_tokenizer.tokenize((text))]

In [13]:
#applying function stemming
tweet_df['stemming_tweet'] = lower_text.apply(stem_text)

In [14]:
tweet_df[['tweet', 'clean_tweet', 'lemmatized_tweet', 'stemming_tweet']].sample(2)

Unnamed: 0,tweet,clean_tweet,lemmatized_tweet,stemming_tweet
35671,12/02/21 #LateShift Marked Proactive Patrols o...,0221 marked proactive patrols of between atten...,"[0221, mark, proactive, patrol, of, between, a...","[0221, mark, proactiv, patrol, of, between, at..."
135105,@SonnyBunch Respectfully disagree. We're const...,respectfully disagree were constantly invited ...,"[respectfully, disagree, be, constantly, invit...","[respect, disagre, were, constant, invit, to, ..."


In [15]:
#get stopwords dictionary in english language 
stop_words = set(stopwords.words('english'))

In [16]:
#we exclude not from the stopwords corpus since removing not from the text will change the context of the text
stop_words.remove('not')

In [17]:
#stop words removal
tweet_df['clean_tweet_lem'] = None
tweet_df['clean_tweet_stem'] = None

for i in range(len(tweet_df)):
    tweet_df['lemmatized_tweet'][i] = [word for word in tweet_df['lemmatized_tweet'][i] if not word in stop_words]
    
    tweet_df['stemming_tweet'][i] = [word for word in tweet_df['stemming_tweet'][i] if not word in stop_words]
    
    tweet_df['clean_tweet_lem'][i] = [word for word in tweet_df['lemmatized_tweet'][i] if not word in stop_words]
    tweet_df['clean_tweet_lem'][i] = (" ").join([word for word in tweet_df['lemmatized_tweet'][i] if not word in stop_words])
    
    tweet_df['clean_tweet_stem'][i] = [word for word in tweet_df['stemming_tweet'][i] if not word in stop_words]
    tweet_df['clean_tweet_stem'][i] = (" ").join([word for word in tweet_df['stemming_tweet'][i] if not word in stop_words])

In [18]:
tweet_df[['tweet', 'clean_tweet', 'lemmatized_tweet', 'stemming_tweet', 'clean_tweet_lem', 'clean_tweet_stem']].sample(2)

Unnamed: 0,tweet,clean_tweet,lemmatized_tweet,stemming_tweet,clean_tweet_lem,clean_tweet_stem
140723,The best! \r\nhttps://t.co/oiuL7VWV7E,the best,[good],[best],good,best
22075,"@DJISupport ,just bought new machine,charged ...",just bought new machinecharged batteries to di...,"[buy, new, machinecharge, battery, not, fly, h...","[bought, new, machinecharg, batteri, didnt, fl...",buy new machinecharge battery not fly house ba...,bought new machinecharg batteri didnt fli hous...


In [19]:
tweet_df = tweet_df.drop(columns="clean_tweet")

In [20]:
#dataset columns summary
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247007 entries, 0 to 247006
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   author id         247007 non-null  object
 1   created_at        247007 non-null  object
 2   geo               247007 non-null  object
 3   id                247007 non-null  object
 4   language          247007 non-null  object
 5   source            247007 non-null  object
 6   tweet             247007 non-null  object
 7   location          247007 non-null  object
 8   bbox              247007 non-null  object
 9   lemmatized_tweet  247007 non-null  object
 10  stemming_tweet    247007 non-null  object
 11  clean_tweet_lem   247007 non-null  object
 12  clean_tweet_stem  247007 non-null  object
dtypes: object(13)
memory usage: 24.5+ MB


In [21]:
#Remove duplicated tweets
tweet_df.drop_duplicates(inplace=True, subset="clean_tweet_lem")
tweet_df.reset_index(inplace=True)

In [22]:
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234977 entries, 0 to 234976
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   index             234977 non-null  int64 
 1   author id         234977 non-null  object
 2   created_at        234977 non-null  object
 3   geo               234977 non-null  object
 4   id                234977 non-null  object
 5   language          234977 non-null  object
 6   source            234977 non-null  object
 7   tweet             234977 non-null  object
 8   location          234977 non-null  object
 9   bbox              234977 non-null  object
 10  lemmatized_tweet  234977 non-null  object
 11  stemming_tweet    234977 non-null  object
 12  clean_tweet_lem   234977 non-null  object
 13  clean_tweet_stem  234977 non-null  object
dtypes: int64(1), object(13)
memory usage: 25.1+ MB


In [23]:
# Token Length Filtering
for i in range(len(tweet_df)):
    if len(tweet_df['lemmatized_tweet'][i]) <= 2 :
        tweet_df = tweet_df.drop(i)

In [24]:
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 231460 entries, 0 to 234976
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   index             231460 non-null  int64 
 1   author id         231460 non-null  object
 2   created_at        231460 non-null  object
 3   geo               231460 non-null  object
 4   id                231460 non-null  object
 5   language          231460 non-null  object
 6   source            231460 non-null  object
 7   tweet             231460 non-null  object
 8   location          231460 non-null  object
 9   bbox              231460 non-null  object
 10  lemmatized_tweet  231460 non-null  object
 11  stemming_tweet    231460 non-null  object
 12  clean_tweet_lem   231460 non-null  object
 13  clean_tweet_stem  231460 non-null  object
dtypes: int64(1), object(13)
memory usage: 34.6+ MB


In [25]:
tweet_df[['tweet', 'lemmatized_tweet', 'stemming_tweet', 'clean_tweet_lem', 'clean_tweet_stem']].sample(2)

Unnamed: 0,tweet,lemmatized_tweet,stemming_tweet,clean_tweet_lem,clean_tweet_stem
180238,What is this Bastard us country\nEngland \nI. ...,"[bastard, us, countryengland, I, like, lomborg...","[bastard, us, countryengland, like, lomborghin...",bastard us countryengland I like lomborghiniph...,bastard us countryengland like lomborghiniphot...
29526,Did I imagine that Ghislaine Maxwell has been ...,"[I, imagine, ghislaine, maxwell, convict, anyt...","[imagin, ghislain, maxwel, convict, anyth, yet...",I imagine ghislaine maxwell convict anything y...,imagin ghislain maxwel convict anyth yet didnt...


In [26]:
#save csv
tweet_df.to_csv('HashingPreprocessed_Dataset.csv', index=False)

<h2>References - </h2>

https://towardsdatascience.com/basic-tweet-preprocessing-in-python-efd8360d529e
https://aronakhmad.medium.com/twitter-data-cleaning-using-python-db1ec2f28f08
https://towardsai.net/p/programming/tweet-topic-modeling-part-2-cleaning-and-preprocessing-tweets
https://www.linkedin.com/pulse/extracting-twitter-data-pre-processing-sentiment-using-jayasekara/
https://stackabuse.com/removing-stop-words-from-strings-in-python/