In [1]:
import pandas as pd
import string
import re
import demoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

In [2]:
nltk.download('stopwords')
nltk.download('opinion_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Konrad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\Konrad\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


True

In [3]:
# Choosing a theme for collected data
theme = 'WorldCup'

In [4]:
# Loading collected data
tweets = pd.read_csv(f'./data/{theme}/tweets.csv')
users = pd.read_csv(f'./data/{theme}/users.csv')

In [5]:
# Dropping duplicated data
tweets = tweets.drop_duplicates(subset=['id'])
users = users.drop_duplicates(subset=['id'])

In [6]:
# Getting tweets in english
tweets_en = tweets[tweets['lang'] == 'en']

In [7]:
# Preprocessing function

def preprocess(text):
    # Removing links and mentions
    text_links = re.sub(r'(@|https?)\S+', '', text)

    # Cleaning whitespaces
    text_white = re.sub(r"\s+", " ", text_links, flags=re.UNICODE)

    # Removing numbers
    text_number = re.sub(r"\d+", "", text_white)

    # Lowering text
    text_lower = text_number.lower()

    # Removing punctuation
    text_punctuation = text_lower.translate(str.maketrans('', '', string.punctuation))

    # Filtering all remaining redundant characters
    text_characters = re.sub('[^a-z]', ' ', text_punctuation)

    # Removing emojis
    text_emoji = demoji.replace(text_characters, '')

    return text_emoji

In [8]:
# Lemmatizing function
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()

def lemmatize(text):
    text_words = [lemmatizer.lemmatize(word) for word in tokenizer.tokenize(text)]
    return text_words

In [9]:
# Stopwords removal function
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    text_stopwords = [word for word in text if word not in stop_words]

    return text_stopwords

In [10]:
# Applying preprocessing function
tweets_en['text_clean'] = tweets_en.text.apply(lambda row : preprocess(row))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_en['text_clean'] = tweets_en.text.apply(lambda row : preprocess(row))


In [11]:
# Applying tokenizing function
tweets_en['text_clean_list'] = tweets_en.text_clean.apply(lambda row : lemmatize(row))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_en['text_clean_list'] = tweets_en.text_clean.apply(lambda row : lemmatize(row))


In [12]:
# Applying stopwords removal function
tweets_en['text_clean_stopwords'] = tweets_en.text_clean_list.apply(lambda row : remove_stopwords(row))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_en['text_clean_stopwords'] = tweets_en.text_clean_list.apply(lambda row : remove_stopwords(row))


In [13]:
# Obtaining list of hashtags from a tweet
tweets_en['hashtags_list'] = tweets_en.text.apply(lambda row : [x.lower() for x in re.findall('#(\w+)', row)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_en['hashtags_list'] = tweets_en.text.apply(lambda row : [x.lower() for x in re.findall('#(\w+)', row)])


In [14]:
# Displaying preprocessed data
tweets_en

Unnamed: 0,hashtag,id,author_id,text,like_count,reply_count,retweet_count,quote_count,created_at,lang,mentions,text_clean,text_clean_list,text_clean_stopwords,hashtags_list
2,#Qatar2022,1592601191761838081,1591065849531604994,Massage and sex in Qatar❤️\nMassage body to bo...,0,0,0,0,2022-11-15 19:31:27+00:00,en,,massage and sex in qatar massage body to bod...,"[massage, and, sex, in, qatar, massage, body, ...","[massage, sex, qatar, massage, body, body, qat...","[qatar, qatar2022]"
5,#Qatar2022,1592601156412649477,968203797049769984,How to identify a whitexican in Qatar 2022 #Q...,0,1,0,0,2022-11-15 19:31:19+00:00,en,,how to identify a whitexican in qatar qatar m...,"[how, to, identify, a, whitexican, in, qatar, ...","[identify, whitexican, qatar, qatar, mexico, w...","[qatar2022, mexico, worldcup2022]"
7,#Qatar2022,1592601145545031681,1585336194010955781,Gillingham vs AFC Fylde\n\n⏰ 4:45:00 PM\n\nWho...,0,0,0,0,2022-11-15 19:31:16+00:00,en,,gillingham vs afc fylde pm whos going to wi...,"[gillingham, v, afc, fylde, pm, who, going, to...","[gillingham, v, afc, fylde, pm, going, win, fi...","[fifaworldcup, qatar2022]"
8,#Qatar2022,1592601145503059968,1585336194010955781,Derby vs Torquay\n\n⏰ 4:45:00 PM\n\nWho's goin...,0,0,0,0,2022-11-15 19:31:16+00:00,en,,derby vs torquay pm whos going to win fifaw...,"[derby, v, torquay, pm, who, going, to, win, f...","[derby, v, torquay, pm, going, win, fifaworldc...","[fifaworldcup, qatar2022]"
9,#Qatar2022,1592601145415008256,1585336194010955781,Cambridge United vs Curzon Ashton\n\n⏰ 4:45:00...,0,0,0,0,2022-11-15 19:31:16+00:00,en,,cambridge united vs curzon ashton pm whos g...,"[cambridge, united, v, curzon, ashton, pm, who...","[cambridge, united, v, curzon, ashton, pm, goi...","[fifaworldcup, qatar2022]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461392,#Qatar2022,1605037565999980545,1499701605079396353,The coronation of the true living #GOAT! @WeAr...,0,0,0,0,2022-12-20 03:09:10+00:00,en,"['2849963640', '1058376110']",the coronation of the true living goat dexto...,"[the, coronation, of, the, true, living, goat,...","[coronation, true, living, goat, dextools, en,...","[goat, dextools, worldcup2022, worldcup, qatar..."
461393,#Qatar2022,1605037409074573313,897741929387106305,Peter Drury on Argentina’s winning moment.\n\n...,3,0,0,0,2022-12-20 03:08:33+00:00,en,,peter drury on argentina s winning moment mess...,"[peter, drury, on, argentina, s, winning, mome...","[peter, drury, argentina, winning, moment, mes...","[messi𓃵, argentinavsfrance, qatar2022, fifawor..."
461405,#Qatar2022,1605037015233310720,1363653333479788549,Argentine dairy industry #DitchDairy @dairy_tr...,0,0,0,0,2022-12-20 03:06:59+00:00,en,"['1370074650038910983', '1246189682150367233']",argentine dairy industry ditchdairy worldcupfi...,"[argentine, dairy, industry, ditchdairy, world...","[argentine, dairy, industry, ditchdairy, world...","[ditchdairy, worldcupfinal, fifaworldcup, qata..."
461406,#Qatar2022,1605036970660462602,1522221998151061509,This time @TeamMessi Argentina ⚽❤✌ \n\n 📕 #Dex...,0,0,0,0,2022-12-20 03:06:48+00:00,en,['1058376110'],this time argentina dextools quack wool ...,"[this, time, argentina, dextools, quack, wool,...","[time, argentina, dextools, quack, wool, chz, ...","[dextools, worldcup2022, worldcup, qatar2022, ..."


In [15]:
# Saving processed tweets and users datasets
tweets_en.to_csv(f'./data/{theme}/tweets_clean.csv', index=False)
users.to_csv(f'./data/{theme}/users_clean.csv', index=False)