# Imports


In [None]:
import csv
import os
import numpy as np
import string
import matplotlib.pyplot as plt
import pickle
import random
import dask.dataframe as dd
from collections import defaultdict
from zipfile import ZipFile
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import multiprocessing
import spacy
import tensorflow as tf

In [None]:
# Report how many CPU cores the runtime exposes.
cpu_total = multiprocessing.cpu_count()
print('Max cpu detected: {}'.format(cpu_total))
#npartitions = multiprocessing.cpu_count()

# TensorFlow returns an empty list when it cannot see any GPU device.
gpu_devices = tf.config.list_physical_devices('GPU')
if not gpu_devices:
    print('No GPU detected...when it comes to training please set the accelerator to use a GPU')
else:
    print('GPU available')

Max cpu detected: 2
GPU available


# Load Data

In [None]:
# Excel workbook holding the tweets to analyse.
data_path = '/content/drive/MyDrive/TFG/MarcusRashford_user_tweets.xlsx'
data = pd.read_excel(data_path)

# Sanity check: row count plus a preview of the first five rows.
row_count = len(data)
preview = data.head()
print('Dataset size {}'.format(row_count))
print('Dataset first five rows:\n{}'.format(preview))

Dataset size 3168
Dataset first five rows:
              Tweet Id                                               Text  \
0  1667979053511368704      Rash X Bolt ⚡️👉🏾🙎🏾‍♂️ https://t.co/5Jfw90obiW   
1  1667975927572168704  RT @England : 😀 @MarcusRashford https://t.co/h...   
2  1666795353964703746                         ❤️ https://t.co/jDBwvlyLnt   
3  1666441219365322760  RT @ManUtd : This has to be @MarcusRashford's ...   
4  1664633386734874625                            https://t.co/rYdCZweVTQ   

              Name     Screen Name                       UTC  \
0  Marcus Rashford  MarcusRashford  2023-06-11T19:36:10.000Z   
1  Marcus Rashford  MarcusRashford  2023-06-11T19:23:45.000Z   
2  Marcus Rashford  MarcusRashford  2023-06-08T13:12:34.000Z   
3  Marcus Rashford  MarcusRashford  2023-06-07T13:45:22.000Z   
4  Marcus Rashford  MarcusRashford  2023-06-02T14:01:41.000Z   

                       Created At  Favorites  Retweets Language  \
0  Sun Jun 11 19:36:10 +0000 2023      494

# Preprocess

In [None]:
# Shared NLP resources used by the preprocessing pipeline.
nlp = spacy.load('en_core_web_sm')  # spaCy pipeline loaded by name
porter = nltk.stem.PorterStemmer()  # Porter stemmer applied after lemmatisation

# spaCy's default stop words, plus a variant extended with ASCII punctuation.
stop_words = nlp.Defaults.stop_words
stop_list = stop_words.union(string.punctuation)

# preprocessing pipeline
def preprocess(text):
    """Clean a raw tweet and return its normalised tokens.

    The text is stripped of links, digit runs, ellipses, @-mentions and
    square-bracketed fragments, then tokenised with the module-level spaCy
    pipeline ``nlp``. Tokens flagged as stop words, punctuation, whitespace
    or digits are discarded; the remaining tokens are lemmatised,
    lower-cased and finally stemmed with the module-level ``porter`` stemmer.

    Args:
        text: Raw tweet text. ``None`` and float NaN (e.g. an empty
            spreadsheet cell) are treated as "no text"; any other
            non-string value is converted with ``str()``.

    Returns:
        list: Stemmed tokens in document order; ``[]`` when nothing is left
        after cleaning.
    """
    # Missing values would make re.sub raise TypeError, so bail out early.
    if text is None or (isinstance(text, float) and text != text):
        return []
    text = str(text)

    # Raw strings keep the regex escapes (\S, \s, \., \[) away from Python's
    # own string-escape processing.
    text = re.sub(r"https?:\S+|[0-9]+", '', text)  # links and digit runs
    text = re.sub(r"\.{2,}", '', text)             # ellipses ("..", "...", ...)
    text = re.sub(r"@\S+", '', text)               # @usernames
    text = re.sub(r"\[.*?\]", '', text)            # square brackets and contents
    text = re.sub(r"\s{2,}", ' ', text)            # collapse repeated whitespace

    # Nothing but whitespace left: every token would be filtered out anyway,
    # so skip the spaCy call.
    if not text.strip():
        return []

    doc = nlp(text)  # spaCy Doc: the processed, tokenised text
    lemmas = [
        token.lemma_.lower()
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space or token.is_digit)
    ]
    # Reduce each lemma to its stem.
    return [porter.stem(lemma) for lemma in lemmas]

In [None]:
# Run the pipeline on the first five tweets and show raw vs. tokenised text.
sample_raw = data['Text'].head()
test = sample_raw.apply(preprocess)
print('Before: {}'.format(list(sample_raw)))
print('After: {}'.format(list(test)))

Before: ['Rash X Bolt ⚡️👉🏾🙎🏾\u200d♂️ https://t.co/5Jfw90obiW', 'RT @England : 😀 @MarcusRashford https://t.co/hzJkbrVIN7', '❤️ https://t.co/jDBwvlyLnt', "RT @ManUtd : This has to be @MarcusRashford's best goal in a  #ThreeLions shirt 🚀\n\nOur no.10 is part of the @England squad to face North Macedonia at Old Trafford on Monday 19 June – get your tickets ➡️ https://t.co/efTeQkZTIn\n\n#MUFC https://t.co/tavLq5RyCD", 'https://t.co/rYdCZweVTQ']
After: [['rash', 'x', 'bolt', '⚡', '️', '👉', '🏾', '🙎', '🏾\u200d', '♂', '️'], ['rt', '😀'], ['❤', '️'], ['rt', 'good', 'goal', 'threelion', 'shirt', '🚀', 'squad', 'face', 'north', 'macedonia', 'old', 'trafford', 'monday', 'june', 'ticket', '➡', '️', 'mufc'], []]


In [None]:
def preprocess_df(dd:object):
    """Replace the 'Text' column of ``dd`` with its preprocessed tokens.

    Every value of ``dd['Text']`` is passed through ``preprocess`` and the
    result is written back to the same column, so the argument itself is
    modified. Inside this function the parameter name shadows the
    ``dask.dataframe`` alias ``dd`` imported at the top of the notebook.

    Args:
        dd: Frame-like object with a 'Text' column that supports ``.map``.

    Returns:
        The same ``dd`` object, with 'Text' holding token lists.
    """
    tokenised = dd['Text'].map(preprocess)
    dd['Text'] = tokenised
    return dd

In [None]:
# Keep only the rows whose 'Language' column equals 'en'.
is_english = data['Language'] == 'en'
data = data[is_english]

# Count the rows of the resulting DataFrame and display that number.
num_filas = len(data)
print("Número de filas:", num_filas)

Número de filas: 2688


In [None]:
# Discard every column except 'Text'.
unused_columns = data.columns.difference(['Text'])
data = data.drop(columns=unused_columns)

In [None]:
# Print the frame to check what is left after the column drop.
print(data)

                                                   Text
3     RT @ManUtd : This has to be @MarcusRashford's ...
5     RT @EuropaLeague : ⚽ Boniface 🤝 Rashford ⚽ \n\...
6     RT @ManUtd : Backing one of our own 👊\n\n🏆 @Ma...
8     @TyrellMalaciia @ManUtd My brother to another ...
9     It’s been a long time since an academy player ...
...                                                 ...
3163      @goodyers_end Thanks Year 5 looks brilliant x
3164  @WillJee1 @ManUtd This is incredible bro, than...
3165  Ah this is really lovely. Do you want to send ...
3166      @CardwellSE18 @foodingreenwich Hi Year 4 👋🏾♥️
3167  You are welcome Year 1 ♥️ https://t.co/Xsom87owqd

[2688 rows x 1 columns]


In [None]:
# Materialise the 'Text' column as a plain Python list.
X = list(data['Text'])

In [None]:
# Tokenise every tweet and store the result in X.
#
# The raw texts are re-read from data['Text'] and the tokens are collected in
# a separate list, so this cell can be re-run safely: X is only rebound once
# every document has been processed and never holds a mix of raw strings and
# token lists.
PROGRESS_EVERY = 500  # report progress every N documents instead of every one

raw_texts = data['Text'].tolist()
total = len(raw_texts)
processed = []
for i, text in enumerate(raw_texts, start=1):
    processed.append(preprocess(text))
    # Throttled progress report; the last document is always reported.
    if i % PROGRESS_EVERY == 0 or i == total:
        print('Processed {} out of {} documents ({:.1%})'.format(i, total, i/total))
X = processed

Processed 1 out of 2688 documents (0.0%)
Processed 2 out of 2688 documents (0.1%)
Processed 3 out of 2688 documents (0.1%)
Processed 4 out of 2688 documents (0.1%)
Processed 5 out of 2688 documents (0.2%)
Processed 6 out of 2688 documents (0.2%)
Processed 7 out of 2688 documents (0.3%)
Processed 8 out of 2688 documents (0.3%)
Processed 9 out of 2688 documents (0.3%)
Processed 10 out of 2688 documents (0.4%)
Processed 11 out of 2688 documents (0.4%)
Processed 12 out of 2688 documents (0.4%)
Processed 13 out of 2688 documents (0.5%)
Processed 14 out of 2688 documents (0.5%)
Processed 15 out of 2688 documents (0.6%)
Processed 16 out of 2688 documents (0.6%)
Processed 17 out of 2688 documents (0.6%)
Processed 18 out of 2688 documents (0.7%)
Processed 19 out of 2688 documents (0.7%)
Processed 20 out of 2688 documents (0.7%)
Processed 21 out of 2688 documents (0.8%)
Processed 22 out of 2688 documents (0.8%)
Processed 23 out of 2688 documents (0.9%)
Processed 24 out of 2688 documents (0.9%)
P

In [None]:
# Show the first five processed documents.
first_docs = X[:5]
print('Post preprocessing check: {}'.format(first_docs))

Post preprocessing check: [['rt', 'good', 'goal', 'threelion', 'shirt', '🚀', 'squad', 'face', 'north', 'macedonia', 'old', 'trafford', 'monday', 'june', 'ticket', '➡', '️', 'mufc'], ['rt', '⚽', 'bonifac', '🤝', 'rashford', '⚽', 'uel', 'season', 'end', 'joint', 'scorer', '👏', '👏', '👏'], ['rt', 'back', '👊', '🏆', 'mufc'], ['brother', 'mother', '❤', '️'], ['long', 'time', 'academi', 'player', 'win', 'award', 'feel', 'huge', 'sens', 'pride', 'hope', 'academi', 'player', 'win', 'award', 'feel', 'feel', '🏆']]


In [None]:
# Pickle X to the output file, opened in binary write mode.
with open('/content/drive/MyDrive/TFG/processed_RF.pkl', 'wb') as out_file:
    pickle.dump(X, out_file)