# Preliminary operations

In [None]:
!pip install tweet-preprocessor
!pip install ekphrasis
!pip install emoji
!pip install langdetect
!pip install contractions

In [None]:
# Scraping
#import snscrape.modules.twitter as sntwitter

# Utility
from google.colab import drive
import pandas as pd
pd.set_option('display.max_rows', 500)
from shutil import copyfile
from collections import Counter
import copy
from langdetect import detect
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
import matplotlib.pyplot as plt

# String operations
import re
import string
from textblob import TextBlob
import preprocessor as p
from emoji import demojize
import contractions

# Nltk
from nltk.tokenize import sent_tokenize, word_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
import nltk
# Download the NLTK data packages by name: 'omw-1.4', 'wordnet', 'punkt' and
# 'stopwords'. Presumably these back the lemmatizer, tokenizer and stop-word
# imports above — TODO confirm which later steps actually need each one.
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Mount Google Drive at /content/gdrive; the tweets CSV is read from it and the
# pre-processed CSV is written back to it.
drive.mount('/content/gdrive')

# Pre-processing

In [None]:
# Load the scraped tweets.
# The path is absolute so that it always points inside the Drive mount
# (/content/gdrive), whatever the current working directory is.
df = pd.read_csv('/content/gdrive/My Drive/Text Mining & Search/tweets.csv')
df.head()

Unnamed: 0,Datetime,Text,Username,Likes,Retweets
0,2022-12-18 23:58:42+00:00,Haters can cry in their rage. 🔥 #Qatar2022 htt...,Majstar7,181,29
1,2022-12-18 23:57:42+00:00,A story I can proudly narrate to my grandkids ...,Abhilashk45,8,0
2,2022-12-18 23:57:42+00:00,WORLD CUP AGENDA: More hypocrisy from those ⁦@...,Iromg,87,16
3,2022-12-18 23:57:33+00:00,Messi’s championship speech #Qatar2022 #Messi𓃵...,__Nerazzurri,8,2
4,2022-12-18 23:57:22+00:00,"Despite conceding 3️⃣ goals apiece, @emimartin...",JioCinema,58,15


## Extracting hashtags

First of all, we extract the hashtags from the text and we put them in a separate column.

In [None]:
# For each tweet, collect the word following every '#' and keep each one only
# once, in order of first appearance (dict.fromkeys preserves insertion order).
hashtag_pattern = re.compile(r"#(\w+)")
df['Hashtags'] = [
    list(dict.fromkeys(hashtag_pattern.findall(tweet)))
    for tweet in df['Text']
]
df.head()

Unnamed: 0,Datetime,Text,Username,Likes,Retweets,Hashtags
0,2022-12-18 23:58:42+00:00,Haters can cry in their rage. 🔥 #Qatar2022 htt...,Majstar7,181,29,[Qatar2022]
1,2022-12-18 23:57:42+00:00,A story I can proudly narrate to my grandkids ...,Abhilashk45,8,0,"[FIFAWorldCup, Qatar2022]"
2,2022-12-18 23:57:42+00:00,WORLD CUP AGENDA: More hypocrisy from those ⁦@...,Iromg,87,16,"[LGBT, QATAR2022]"
3,2022-12-18 23:57:33+00:00,Messi’s championship speech #Qatar2022 #Messi𓃵...,__Nerazzurri,8,2,"[Qatar2022, Messi𓃵]"
4,2022-12-18 23:57:22+00:00,"Despite conceding 3️⃣ goals apiece, @emimartin...",JioCinema,58,15,"[FIFAWorldCup, ARGFRA, Qatar2022, FIFAWConJioC..."


## Extracting mentions

Now, we extract the mentions too.

In [None]:
# For each tweet, collect the word following every '@' and keep each one only
# once, in order of first appearance (dict.fromkeys preserves insertion order).
mention_pattern = re.compile(r"@(\w+)")
df['Mentions'] = [
    list(dict.fromkeys(mention_pattern.findall(tweet)))
    for tweet in df['Text']
]
df.head()

Unnamed: 0,Datetime,Text,Username,Likes,Retweets,Hashtags,Mentions
0,2022-12-18 23:58:42+00:00,Haters can cry in their rage. 🔥 #Qatar2022 htt...,Majstar7,181,29,[Qatar2022],[]
1,2022-12-18 23:57:42+00:00,A story I can proudly narrate to my grandkids ...,Abhilashk45,8,0,"[FIFAWorldCup, Qatar2022]",[]
2,2022-12-18 23:57:42+00:00,WORLD CUP AGENDA: More hypocrisy from those ⁦@...,Iromg,87,16,"[LGBT, QATAR2022]",[BBCSport]
3,2022-12-18 23:57:33+00:00,Messi’s championship speech #Qatar2022 #Messi𓃵...,__Nerazzurri,8,2,"[Qatar2022, Messi𓃵]",[]
4,2022-12-18 23:57:22+00:00,"Despite conceding 3️⃣ goals apiece, @emimartin...",JioCinema,58,15,"[FIFAWorldCup, ARGFRA, Qatar2022, FIFAWConJioC...","[emimartinezz1, Mahindra_Auto]"


These columns will be used in the exploration phase.

## Removing extra-spaces

In [None]:
def remove_extra_spaces(text):
  """Collapse every run of whitespace in `text` into a single space.

  The text is split on whitespace and the pieces are re-joined with one
  space, so leading and trailing whitespace disappears as well.
  """
  words = text.split()
  return ' '.join(words)

df['Text'] = df['Text'].apply(remove_extra_spaces)

# Sanity check: count the whitespace runs that survived the clean-up and
# measure the total number of characters over all tweets.
tweets = df['Text'].tolist()
spaces = sum(len(re.findall(r'\s\s+', tweet)) for tweet in tweets)
newlines = sum(len(re.findall(r'\n\n+', tweet)) for tweet in tweets)
tabs = sum(len(re.findall(r'\t+', tweet)) for tweet in tweets)
length = sum(len(tweet) for tweet in tweets)

print("Number of multiple spaces:", spaces)
print("Number of multiple newlines:", newlines)
print("Number of multiple tabs:", tabs)
print('Overall tweets length:', length)

Number of multiple spaces: 0
Number of multiple newlines: 0
Number of multiple tabs: 0
Overall tweets length: 26112984


## Case folding

In [None]:
# Fold every tweet to lower case and preview the result.
df['Text'] = df['Text'].str.lower()
print(df['Text'].head())

0    haters can cry in their rage. 🔥 #qatar2022 htt...
1    a story i can proudly narrate to my grandkids ...
2    world cup agenda: more hypocrisy from those ⁦@...
3    messi’s championship speech #qatar2022 #messi𓃵...
4    despite conceding 3️⃣ goals apiece, @emimartin...
Name: Text, dtype: object


## Removing repeated characters

In [None]:
def remove_repetition(text):
  """Shorten exaggerated character repetitions in `text`.

  * A letter (A-Z, a-z) occurring three or more times in a row is reduced to
    two occurrences ("goooal" -> "gooal").
  * A punctuation mark from the set .,/#!$%^&*?;:{}=_`~()+- occurring two or
    more times in a row is reduced to one ("!!!" -> "!").

  Returns the shortened text.
  """
  letters_collapsed = re.sub(r"([A-Za-z])\1{2,}", r"\1\1", text, flags=re.DOTALL)
  return re.sub(r'([.,/#!$%^&*?;:{}=_`~()+-])\1{1,}', r'\1', letters_collapsed)

# Shorten repeated characters in every tweet and preview the result.
df['Text'] = df['Text'].map(remove_repetition)
print(df['Text'].head())

0    haters can cry in their rage. 🔥 #qatar2022 htt...
1    a story i can proudly narrate to my grandkids ...
2    world cup agenda: more hypocrisy from those ⁦@...
3    messi’s championship speech #qatar2022 #messi𓃵...
4    despite conceding 3️⃣ goals apiece, @emimartin...
Name: Text, dtype: object


## Removing streams

We search for the word *stream* in order to remove the tweets that advertise streams of the matches.

In [None]:
# Drop every tweet whose text contains "stream", using a single boolean mask
# instead of dropping rows one at a time inside a Python loop (each in-place
# drop rebuilds the frame, which is needlessly slow on ~150k rows).
# NOTE: this is a plain substring test, so words such as "mainstream" match too.
print(len(df))
has_stream = df['Text'].str.contains('stream', regex=False)
df = df[~has_stream].reset_index(drop=True)
print(df['Text'].head())
print(len(df))

151012
0    haters can cry in their rage. 🔥 #qatar2022 htt...
1    a story i can proudly narrate to my grandkids ...
2    world cup agenda: more hypocrisy from those ⁦@...
3    messi’s championship speech #qatar2022 #messi𓃵...
4    despite conceding 3️⃣ goals apiece, @emimartin...
Name: Text, dtype: object
150225


## Removing less than 10 likes

We keep only those tweets with at least 10 likes. These are probably the most informative, and this also allows us to reduce the computational cost.

In [None]:
# Keep only the tweets with at least MIN_LIKES likes, using a single boolean
# mask instead of dropping rows one at a time inside a Python loop.
MIN_LIKES = 10

print(len(df))
df = df[df['Likes'] >= MIN_LIKES].reset_index(drop=True)
print(df['Text'].head())
print(len(df))

150225
0    haters can cry in their rage. 🔥 #qatar2022 htt...
1    world cup agenda: more hypocrisy from those ⁦@...
2    despite conceding 3️⃣ goals apiece, @emimartin...
3    📸 a picture is worth a thousand words. #qatar2...
4    why politicians should steer clear of proper s...
Name: Text, dtype: object
120382


## Removing symbols

We remove only those symbols which are not used in natural language (i.e. we keep punctuation).

In [None]:
# Symbols to strip from the tweet text; passed to correct() below.
# Most entries are single characters, a few are longer sequences.
custom = ['|', '\\', '⏱', '↯', ' ⃣0⃣ ', ' ⃣', '↓', '•', "'⃣", '◘', '^', '_', '{', '}', '~', '[', ']']

def correct(text, custom):
  """Normalise typographic punctuation in `text` and strip unwanted symbols.

  Args:
    text: the tweet text to clean.
    custom: iterable of symbols to remove. Entries may be single characters
      or longer sequences; every occurrence of each entry is deleted.

  Returns:
    The cleaned text.
  """
  # Normalizing punctuations: curly quotes, ellipsis and backtick -> ASCII.
  text = text.replace('‘', "'").replace('’', "'").replace('“', '"').replace('”', '"')
  text = text.replace('…', '...').replace('`', "'")
  # Removing symbols.
  # Multi-character entries have to be removed as substrings: a per-character
  # membership test can never match them, so they would be silently ignored.
  # Longest first, so that an entry is not broken up by a shorter one it contains.
  multi_char = sorted((s for s in custom if len(s) > 1), key=len, reverse=True)
  for sequence in multi_char:
    text = text.replace(sequence, '')
  single_chars = {s for s in custom if len(s) == 1}
  text = ''.join(ch for ch in text if ch not in single_chars)
  return text

# Clean every tweet with the `custom` symbol list and preview the result.
df['Text'] = df['Text'].apply(correct, args=(custom,))
print(df['Text'].head())

0    haters can cry in their rage. 🔥 #qatar2022 htt...
1    world cup agenda: more hypocrisy from those ⁦@...
2    despite conceding 3️⃣ goals apiece, @emimartin...
3    📸 a picture is worth a thousand words. #qatar2...
4    why politicians should steer clear of proper s...
Name: Text, dtype: object


## Checking language

Even though we specified the English language at query time, we perform a further check with the *langdetect* library.

In [None]:
# langdetect is non-deterministic unless a seed is set; fixing it makes the
# set of retained tweets reproducible across runs.
DetectorFactory.seed = 0


def is_english(text):
  """Return True when langdetect classifies `text` as English.

  Texts for which no language can be detected (LangDetectException) are
  treated as non-English. Only that exception is caught, so interrupts and
  genuine bugs are no longer swallowed by a bare `except:`.
  """
  try:
    return detect(text) == 'en'
  except LangDetectException:
    return False


# Keep the English tweets with one boolean mask instead of dropping rows one
# at a time inside a Python loop.
print(len(df))
df = df[df['Text'].apply(is_english)].reset_index(drop=True)
print(df['Text'].head())
print(len(df))

120382
0    haters can cry in their rage. 🔥 #qatar2022 htt...
1    world cup agenda: more hypocrisy from those ⁦@...
2    despite conceding 3️⃣ goals apiece, @emimartin...
3    📸 a picture is worth a thousand words. #qatar2...
4    why politicians should steer clear of proper s...
Name: Text, dtype: object
117504


## Tweet-preprocessor

Now we use the tweet-preprocessor library, which allows us to automatically clean Twitter-specific content from our data. Specifically, we remove URLs, mentions, hashtags and reserved words (e.g. RT, FAV). We remove hashtags because we found that they are not very informative: most of them share the same meaning and refer to the same event (e.g. #FifaWorldCup, #Qatar2022, #WorldCup2022).

In [None]:
# Configure tweet-preprocessor with the URL, MENTION, HASHTAG and RESERVED
# options, run its clean() on every tweet and preview the result.
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.RESERVED)
df['Text'] = df['Text'].map(p.clean)
print(df['Text'].head())

0                      haters can cry in their rage. 🔥
1    world cup agenda: more hypocrisy from those ⁦⁩...
2    despite conceding 3️⃣ goals apiece, &amp; hugo...
3               📸 a picture is worth a thousand words.
4    why politicians should steer clear of proper s...
Name: Text, dtype: object


## Removing extra-spaces

The steps above may have introduced some extra spaces. Let's check for them and remove them.

In [None]:
# remove_extra_spaces() is already defined in the first "Removing extra-spaces"
# section; it is reused here rather than defined a second time, so the two
# copies cannot drift apart.
df['Text'] = df['Text'].apply(remove_extra_spaces)

# Sanity check: count the whitespace runs that survived the clean-up and
# measure the total number of characters over all tweets.
tweets = df['Text'].tolist()
spaces = sum(len(re.findall(r'\s\s+', tweet)) for tweet in tweets)
newlines = sum(len(re.findall(r'\n\n+', tweet)) for tweet in tweets)
tabs = sum(len(re.findall(r'\t+', tweet)) for tweet in tweets)
length = sum(len(tweet) for tweet in tweets)

print("Number of multiple spaces:", spaces)
print("Number of multiple newlines:", newlines)
print("Number of multiple tabs:", tabs)
print('Overall tweets length:', length)

Number of multiple spaces: 0
Number of multiple newlines: 0
Number of multiple tabs: 0
Overall tweets length: 13098937


## Removing less than 10 words

We filter the tweets again, removing those with fewer than 10 words.

In [None]:
# Keep only the tweets with at least MIN_WORDS whitespace-separated words,
# using a single boolean mask instead of dropping rows one at a time inside a
# Python loop.
MIN_WORDS = 10

print(len(df))
word_counts = df['Text'].str.split().str.len()
df = df[word_counts >= MIN_WORDS].reset_index(drop=True)
print(df['Text'].head())
print(len(df))

117504
0    despite conceding 3️⃣ goals apiece, &amp; hugo...
1    12/18/2022 - 📍times square, nyc argentina beat...
2    and at 2:52am on monday, december 19, it's a w...
3    would the question be even asked if it was ano...
4    "it is a dress for an official occasion, worn ...
Name: Text, dtype: object
91051


## Remove duplicated tweets

In [None]:
# Keep the first occurrence of every distinct tweet text and renumber the rows.
print(len(df))
df = df.drop_duplicates(subset = ['Text']).reset_index(drop = True)
print(df['Text'].head())
print(len(df))

## Saving to CSV

In [None]:
# Summary of the pre-processed corpus: number of tweets and total characters.
lemm_length = 0  # initialised here but not used in this cell
tweet_length = sum(len(tweet) for tweet in df['Text'])

print('Final number of tweets:' + str(len(df)))
print('Overall tweets length:' + str(tweet_length))

Now we save the pre-processed tweets in a CSV file.

In [None]:
# Write the pre-processed tweets back to Drive, without the row index.
# The path is absolute so that it always points inside the Drive mount
# (/content/gdrive), whatever the current working directory is.
df.to_csv('/content/gdrive/My Drive/Text Mining & Search/preprocessed.csv', index = False)