# Pre-Processing Twitter Corpus

## étape 0: Load necessary packages 

In [None]:
# punkt package
import nltk
nltk.download('punkt')

In [None]:
# for expanding contraction words e.g. isn't --> is not
!pip install contractions

In [None]:
# stopwords
import nltk
# Fetch the NLTK stopword lists (needed once per environment).
nltk.download('stopwords')
from nltk.corpus import stopwords
# French stopwords as a set; reused later in the notebook as the WordCloud stopword list.
stop_words = set(stopwords.words('french'))

In [None]:
# stopwords
stopwords.words('english')[0:10]

In [None]:
# stopwords
stopwords.words('french')[0:10]

In [None]:
# wordnet lemmatizer
import nltk
nltk.download('punkt')
nltk.download('wordnet')

## étape 1 : Basic Cleaning - à adapter à votre corpus/imagination !

- Remove Unicode Strings and Noise
- Remove/Replace URLs, User Mentions and Hashtags
- Non-Letter characters: numbers, emojis, or hash marks.
- Remove/Replace Slang and Abbreviations
- Remove/Replace Contractions
- Remove/Replace Numbers
- Remove/Replace Repetitions of Punctuation
- Remove Punctuation
- Handling Capitalized Words / Lowercase
- Replace Elongated Words (e.g. "hahahaaaa", "Duuuuude, that's awful")

Reference for regex substitution with `re.sub`: https://pynative.com/python-regex-replace-re-sub/

In [None]:
#the following pre-tokenization receives string as input parameter
#and returns string as output
import re
import contractions

def remove_links(tweet):
    """Remove web links and ``[link]`` placeholders from a tweet.

    Args:
        tweet: Raw tweet text.

    Returns:
        The text with ``http``/``https`` URLs, ``bit.ly`` short links,
        ``pic.twitter`` links and literal ``[link]`` markers deleted.
        Whitespace around the removed parts is left as it was.
    """
    tweet = re.sub(r'http\S+', '', tweet)  # http:// and https:// URLs
    # The dot is escaped so that only a literal "bit.ly/" prefix matches.
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)
    # str.strip('[link]') strips any of the characters "[", "l", "i", "n",
    # "k", "]" from both ends of the tweet (eating real words such as "kiln")
    # and never removes a marker in the middle; replace the marker itself.
    tweet = tweet.replace('[link]', '')
    tweet = re.sub(r'pic\.twitter\S+', '', tweet)  # Twitter picture links
    return tweet

def remove_tags(tweet):
    """Remove media labels, hashtags and the ``RT @user:`` retweet label.

    Args:
        tweet: Tweet text.

    Returns:
        The text without ``VIDEO:`` / ``AUDIO:`` labels (removed wherever
        they occur, not only at the start), without hashtags and without
        ``RT @user:`` prefixes. Surrounding whitespace is left untouched.
    """
    tweet = re.sub(r'VIDEO:', '', tweet)
    tweet = re.sub(r'AUDIO:', '', tweet)
    # A hashtag starts with a letter followed by word characters or hyphens.
    # Unicode-aware classes are used so accented hashtags (e.g. "#Grève") are
    # removed as a whole instead of being cut at the first accented letter.
    tweet = re.sub(r'#[^\W\d_][\w-]*', '', tweet)
    tweet = re.sub(r'RT @\w+:', '', tweet)  # retweet label
    return tweet

def remove_users(tweet):
    """Remove retweet markers and ``@user`` mentions from a tweet.

    Args:
        tweet: Tweet text.

    Returns:
        The text with every ``RT @user`` and ``@user`` occurrence deleted.
        Surrounding whitespace is left untouched.
    """
    # A handle may start with a letter, a digit or an underscore; requiring a
    # leading letter left handles such as "@_jean_" or "@2cv" in the text.
    # The hyphen is still accepted after the first character, as before.
    handle = r'@[A-Za-z0-9_][A-Za-z0-9_-]*'
    tweet = re.sub(r'RT\s' + handle, '', tweet)  # "RT @user"
    tweet = re.sub(handle, '', tweet)  # plain mentions
    return tweet

# Characters treated as punctuation. The ASCII apostrophe and '#' are not in
# the set, so forms such as "c'est" keep their apostrophe.
punctuation = '!”$%&\"’()*+,-./:;<=>?[\\]^_`{|}~•@'


def remove_nonText(tweet):
    """Replace punctuation with spaces, drop digits and normalise whitespace.

    Args:
        tweet: Tweet text.

    Returns:
        The text where each run of ``punctuation`` characters has become a
        single space, digits and the sample "📝 …" marker are removed, and
        every run of whitespace (newlines included) is collapsed to one
        space. Leading/trailing spaces are not stripped.
    """
    # re.escape makes every character literal inside the class. Unescaped,
    # the backslash acted as an escape for "]" (so backslashes were never
    # stripped) and ",-." only worked because it happens to be a valid range.
    tweet = re.sub('[' + re.escape(punctuation) + ']+', ' ', tweet)
    # Collapse whitespace first so the marker pattern below sees one space.
    tweet = re.sub(r'\s+', ' ', tweet)
    tweet = re.sub(r'[0-9]+', '', tweet)  # remove numbers
    tweet = re.sub('📝 …', '', tweet)  # sample emoji marker; extend as needed
    # Removing numbers/markers can leave two adjacent spaces; collapse again.
    tweet = re.sub(r'\s+', ' ', tweet)
    return tweet

def remove_contraction(tweet):
    """Expand a few French contractions/abbreviations, word by word.

    The tweet is split on whitespace, each word that is a key of the
    expansion table is replaced by its expansion, and the words are joined
    back with single spaces. Matching is exact and case-sensitive.
    """
    # Expansion table; add further entries as needed.
    expansions = {"t'es" : "tu es", "c'est": "ce est", "c":"ce est"}
    expanded_words = []
    for word in tweet.split():
        expanded_words.append(expansions.get(word, word))
    return " ".join(expanded_words)

def pretokenization_cleaning(tweet):
    """Clean a raw tweet without tokenizing it or removing stopwords.

    Applies, in order: remove_links, remove_tags, remove_users,
    remove_nonText, lower-casing and remove_contraction. Takes a string and
    returns a string.
    """
    for cleaning_step in (remove_links, remove_tags, remove_users, remove_nonText):
        tweet = cleaning_step(tweet)
    # Lower-case before expanding contractions: the expansion table is
    # matched case-sensitively.
    return remove_contraction(tweet.lower())

In [None]:
import pandas as pd
# NOTE: change this path so that it points to your own CSV file.
df = pd.read_csv('../Semaine6/tweets_Greve_2023.csv',encoding='utf-8')
# Keep only the rows whose 'Langue' column equals 'fr'. The .copy() makes
# df_fr an independent frame, so columns added to it later do not touch df.
df_fr = df[df['Langue'] == 'fr'].copy() 

In [None]:
# Pick a sample tweet by position. df_fr keeps the row labels of df, so a
# label lookup such as df_fr['Tweet'][4] raises KeyError whenever row 4 of
# the original file is not a French tweet.
Text = df_fr['Tweet'].iloc[4]
Text

In [None]:
pretokenization_cleaning(Text)

In [None]:
df_fr.head()

In [None]:
# Add a 'Clean' column: every tweet passed through pretokenization_cleaning.
df_fr['Clean'] = df_fr['Tweet'].apply(pretokenization_cleaning)
df_fr.head()

## étape 2 : Normalising data  - à adapter à votre corpus 
- Spelling Correction
- Replace Negations with Antonyms
- Handling Capitalized Words
- Lowercase
- Tokenization
- Remove Stopwords (e.g. the, and, …)
- Stemming
- Lemmatizing

In [None]:
from nltk.tokenize import TweetTokenizer # la différence avec word_tokenize est que tweettokenizer garde les hashtags tandis que word_tokenize ne le permet pas.
from nltk.corpus import stopwords

def tokenize(text):
    """Split text into tokens with NLTK's TweetTokenizer.

    The tokenizer is created with reduce_len=True, which shortens runs of
    three or more repeated characters to exactly three, e.g. in
    'This is waaaaayyyy too much for you!!!!!!' the word 'waaaaayyyy'
    becomes 'waaayyy'.
    """
    return TweetTokenizer(reduce_len=True).tokenize(text)

# language name -> frozenset of stopwords, filled on first use so the NLTK
# list is read once per language instead of once per tweet.
_STOPWORD_SETS = {}


def remove_stopwords(text, language='french'):
    """Drop stopwords from a sequence of tokens and join the rest.

    Args:
        text: Iterable of tokens (e.g. the output of ``tokenize``). A plain
            string is also accepted and is split on whitespace first, rather
            than being iterated character by character.
        language: Name of the NLTK stopword list to use.

    Returns:
        A single string made of the remaining tokens joined by spaces.
        Tokens are compared to the stopword list in lower case.
    """
    if isinstance(text, str):
        text = text.split()
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    language_stop_words = _STOPWORD_SETS[language]
    return " ".join(
        token for token in text if token.lower() not in language_stop_words
    )

In [None]:
def preprocess_tweet(tweet):
    """Clean, tokenize and stopword-filter a raw tweet.

    The cleaning stage is delegated to ``pretokenization_cleaning`` so both
    pipelines always apply the same steps. No stemming or lemmatization is
    done here.

    Args:
        tweet: Raw tweet text.

    Returns:
        A string containing the remaining tokens joined by single spaces.
    """
    cleaned = pretokenization_cleaning(tweet)
    tokens = tokenize(cleaned)
    return remove_stopwords(tokens)

In [None]:
# Add a 'Normalized' column: every tweet passed through preprocess_tweet
# (cleaning, tokenization, stopword removal).
df_fr['Normalized'] = df_fr['Tweet'].apply(preprocess_tweet)
df_fr.head()

In [None]:
# Concatenate each text column into one long, space-separated string.
all_words = ' '.join(df_fr['Tweet'])
all_Clean_words = ' '.join(df_fr['Clean'])
all_Normalized_words = ' '.join(df_fr['Normalized'])

In [None]:
from nltk.tokenize import word_tokenize

# Tokenize the three concatenated texts (raw, cleaned, normalized) with
# NLTK's word_tokenize.
# NOTE: the next cell assigns the same three names again using tokenize().
tokenized_words = word_tokenize(all_words)
tokenized_Clean_words = word_tokenize(all_Clean_words)
tokenized_Normalized_words = word_tokenize(all_Normalized_words)

In [None]:
# Tokenize the same three texts with tokenize() (TweetTokenizer with
# reduce_len=True). This rebinds the names set by the word_tokenize cell, so
# when the cells are run in order the later frequency counts use these tokens.
tokenized_words = tokenize(all_words)
tokenized_Clean_words = tokenize(all_Clean_words)
tokenized_Normalized_words = tokenize(all_Normalized_words)

In [None]:
# Count token frequencies for the raw, cleaned and normalized corpora.
from nltk.probability import FreqDist

fdist_all, fdist_clean, fdist_normalized = (
    FreqDist(tokens)
    for tokens in (tokenized_words, tokenized_Clean_words, tokenized_Normalized_words)
)

In [None]:
# Print the 20 most frequent tokens of each version of the corpus.
for heading, distribution in (
    ("The most frequent words in the corpus:", fdist_all),
    ("The most frequent words in the corpus---- after cleaning the data:", fdist_clean),
    ("The most frequent words in the corpus---- after normalizing the data!", fdist_normalized),
):
    print(heading)
    print(distribution.most_common(20))

In [None]:
fdist_normalized.plot(50)

In [None]:
!pip install seaborn

In [None]:
# Import seaborn
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
## The 20 most common normalized tokens as a pandas Series
## (index = token, value = count), which is convenient for plotting
all_fdist = pd.Series(dict(fdist_normalized.most_common(20)))

## Figure and axes for the bar chart
fig, ax = plt.subplots(figsize=(10, 10))

## Bar chart of the counts; x tick labels are rotated for readability
all_plot = sns.barplot(x=all_fdist.index, y=all_fdist.values, ax=ax)
plt.xticks(rotation=30);

In [None]:
!pip install wordcloud
from wordcloud import WordCloud

In [None]:
# Build a word cloud (at most 200 words) from the concatenated normalized
# text. stop_words is the French stopword set defined earlier in the
# notebook; random_state is fixed so the layout is reproducible.
wordcloud = WordCloud(width= 1000, height = 600, max_words=200,
                      random_state=1, background_color='White',
                      collocations=False, stopwords = stop_words).generate(all_Normalized_words)
# Show the generated image without axes.
plt.figure(figsize=(5, 5))
plt.imshow(wordcloud) 
plt.axis("off")
plt.show()

In [None]:
#Lemmatization
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
from nltk.stem import WordNetLemmatizer

def lemmatize(sentence):
    """Tokenize a sentence and lemmatize each token as a verb.

    Args:
        sentence: Text to process (a string).

    Returns:
        A list with one lemma per token produced by ``word_tokenize``.

    NOTE(review): WordNetLemmatizer is backed by the English WordNet, so on
    French text most tokens presumably come back unchanged; consider a
    French-aware lemmatizer if real French lemmas are needed.
    """
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.tokenize.word_tokenize(sentence)
    return [lemmatizer.lemmatize(token, pos='v') for token in tokens]

#WordNetLemmatizer().lemmatize(token, pos='v')

In [None]:
# Add a 'Lemmatized' column: the list of lemmas of each normalized tweet.
df_fr['Lemmatized'] = df_fr['Normalized'].apply(lemmatize)

In [None]:
df_fr.head()

In [None]:
# Join each list of lemmas back into one space-separated string.
# The whole column is assigned at once: the previous chained assignment
# df_fr['Lemmatized_bis'][i] = ... used a 0..n-1 counter as a row *label*,
# which does not match the labels of the filtered frame, and chained
# assignment does not update the frame when pandas copy-on-write is enabled.
df_fr['Lemmatized_bis'] = [' '.join(tokens) for tokens in df_fr['Lemmatized']]

In [None]:
df_fr.head()

In [None]:
# Concatenate all lemmatized tweets, tokenize the result and count lemmas.
all_Lemmatized_words = ' '.join(df_fr['Lemmatized_bis'])
tokenized_Lemmatized_words = tokenize(all_Lemmatized_words)
fdist_Lemmatized = FreqDist(tokenized_Lemmatized_words)

# Report the 20 most frequent lemmas.
print("The most frequent lemma in the corpus---- after normalizing the data!")
print(fdist_Lemmatized.most_common(20))

In [None]:
import seaborn as sns
sns.set_theme(style="ticks")

# Obtain the 20 most frequent lemmas as (lemma, count) pairs
top_20 = fdist_Lemmatized.most_common(20)

# Create a pandas Series (index = lemma, value = count) to make plotting easier
fdist = pd.Series(dict(top_20))

# Horizontal bar chart: lemmas on the y axis, counts on the x axis
sns.barplot(y=fdist.index, x=fdist.values, color='blue');