In [None]:
#!pip install spacy

In [None]:
# Download the small Spanish spaCy pipeline that spacy.load('es_core_news_sm') loads later on.
! python -m spacy download es_core_news_sm

Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.9/12.9 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.7.0
[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m‚ö† Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import pandas as pd
import os
import re
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [None]:
# Download NLTK data files (only needed if not already downloaded):
# 'punkt' provides the tokenizer models and 'stopwords' the stopword lists
# that the preprocessing code in this notebook relies on.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Widen displayed text columns to 200 characters so tweet text is readable in previews.
pd.set_option("display.max_colwidth", 200)

In [None]:
# Load the spaCy model for Spanish. This notebook only reads token lemmas from
# it, so the dependency parser and the named-entity recognizer are disabled to
# avoid running them on every tweet.
nlp = spacy.load('es_core_news_sm', disable=['parser', 'ner'])

# Initialize the Snowball stemmer for Spanish
stemmer = SnowballStemmer('spanish')

# Spanish stopwords from NLTK, stored as a set for fast membership tests
stop_words = set(stopwords.words('spanish'))



In [None]:
def preprocess_text(text, strip_urls=True):
    """Normalize a raw tweet into a lowercase, stopword-free string.

    Steps, in order: optionally drop URLs, replace every character that is not
    an ASCII letter, a Spanish accented letter (á é í ó ú ñ ü, either case) or
    a digit with a space, collapse whitespace, lowercase, tokenize with
    ``word_tokenize`` and remove tokens found in ``stop_words``.

    Args:
        text: The raw text. Missing values (None / NaN / pd.NA) produce an
            empty string; any other non-string value is converted with
            ``str()`` first.
        strip_urls: When True (default), ``http(s)://...`` and ``www....``
            links are removed before cleaning. Otherwise the character filter
            breaks a link into meaningless tokens such as
            ``https t co y5jx6okjwt``.

    Returns:
        The remaining tokens joined by single spaces (possibly ``''``).
    """
    # pandas hands missing cells to .apply() as NaN/None, which re.sub rejects.
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)
    if strip_urls:
        # Must run before the character filter, while the URL is still intact.
        text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # Replace special characters with spaces (letters, accented letters and
    # digits are kept). Lowercasing happens in a later step.
    text = re.sub(r'[^a-zA-ZáéíóúÁÉÍÓÚñÑüÜ0-9]', ' ', text)
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)
    # Convert text to lowercase so tokens match the lowercase stopword list
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # Reconstruct the text
    return ' '.join(filtered_tokens)

In [None]:
def lemmatize_and_stem(text):
    """Lemmatize ``text`` with the spaCy pipeline, then stem every lemma.

    The text is run through ``nlp``; each token's lemma is passed to the
    Snowball ``stemmer`` and the stems are returned as one space-separated
    string.
    """
    # spaCy supplies the lemmas, NLTK's stemmer reduces each one further.
    lemmas = (tok.lemma_ for tok in nlp(text))
    return ' '.join(map(stemmer.stem, lemmas))

In [None]:
def clean_pipeline(df, excluded_langs=("qme",)):
    """Filter a raw tweet DataFrame and add a ``processed_text`` column.

    Rows whose ``lang`` value is in ``excluded_langs`` are dropped, only the
    columns listed in ``desired_columns`` are kept (in their original order),
    and ``processed_text`` is computed from ``text`` by applying
    ``preprocess_text`` followed by ``lemmatize_and_stem``.

    Args:
        df: Raw tweets; must contain at least the ``lang`` and ``text`` columns.
        excluded_langs: Language code, or iterable of codes, to drop.
            Defaults to ``("qme",)``, the only code excluded so far.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # Series.isin does not accept a bare string, so wrap a single code.
    if isinstance(excluded_langs, str):
        excluded_langs = [excluded_langs]
    desired_columns = ['url', 'createdAt', 'id', 'viewCount', 'lang',
                       'author__createdAt', 'author__location', 'text']
    keep_rows = ~df['lang'].isin(list(excluded_langs))
    keep_cols = [col for col in df.columns if col in desired_columns]
    # Explicit copy: the new column is added to an independent frame rather
    # than to a filtered slice of the caller's data.
    cleaned = df.loc[keep_rows, keep_cols].copy()
    # Missing or non-string text cells would make the regex-based
    # preprocessing fail, so normalize them to strings first.
    raw_text = cleaned['text'].fillna('').astype(str)
    # Apply preprocessing, lemmatization, and stemming to the 'text' column
    cleaned['processed_text'] = raw_text.apply(preprocess_text).apply(lemmatize_and_stem)
    return cleaned



In [None]:
# Load a single raw replies export to use as a sample for trying out the pipeline.
test_df = pd.read_excel("/content/drive/MyDrive/IREX_ES/Twitter/Comments/PulsoCiudadanos_X_V3.2_replies_1740923091436613794.xlsx")

In [None]:
# Preview the first rows of the raw sample.
test_df.head()

Unnamed: 0,url,createdAt,id,isReply,inReplyToId,isRetweet,isQuote,viewCount,retweetCount,likeCount,replyCount,lang,author__createdAt,author__location,text
0,https://x.com/1_prefer_not_to/status/1741077093159428335,Sat Dec 30 12:41:41 +0000 2023,1741077093159428335,True,1.740923e+18,False,False,115,0,3,0,und,2012-10-16,,@PulsoCiudadanos @fulloa51 @nuevasideas ... https://t.co/y5jx6OKJwt
1,https://x.com/GeoMartinezH/status/1741014382958576094,Sat Dec 30 08:32:30 +0000 2023,1741014382958576094,True,1.740923e+18,False,False,174,0,4,0,es,2011-03-11,Alacant,"@PulsoCiudadanos @fulloa51 @nuevasideas Pero si el que no est√° a la altura es ud @fulloa51 deje de llamar ‚Äúcaballito de batalla‚Äù a una aberrante serie de inconstitucionalidades, creadas a su antoj..."
2,https://x.com/Aragornius1/status/1741003742777061463,Sat Dec 30 07:50:13 +0000 2023,1741003742777061463,True,1.740923e+18,False,False,318,1,12,1,es,2019-02-23,üåé,"@PulsoCiudadanos @fulloa51 @nuevasideas Lo dice el cobarde que bloquea cuando se le habla con hechos, cuando se le dice lo hip√≥crita que es con su discurso demagogo. \n\n@fulloa51 eres un traidor ..."
3,https://x.com/JuanMaz62302560/status/1740926933062603138,Sat Dec 30 02:45:00 +0000 2023,1740926933062603138,True,1.740923e+18,False,False,474,0,7,3,es,2021-12-17,"Metapan, El Salvador",@PulsoCiudadanos @fulloa51 @nuevasideas Me gustan las respuestas que da nuestro candidato a vicepresidente \nTap√°ndole la boca a este medio chayotero
4,https://x.com/MarceloSolSol/status/1740937998248886342,Sat Dec 30 03:28:58 +0000 2023,1740937998248886342,True,1.740923e+18,False,False,643,1,18,1,es,2022-09-24,,"@PulsoCiudadanos @fulloa51 @nuevasideas No hay con quien debatir imag√≠nese Beto Cuma , El sopa de patas , El de apellido parada , etc el vicepresidente solo el los les ganar√≠a en un debate , pe..."


In [None]:
# Count the sample rows per value of the `lang` column.
test_df.lang.value_counts()

lang
es     31
en      3
qme     2
und     1
tl      1
pt      1
et      1
qam     1
in      1
Name: count, dtype: int64

In [None]:
# Count the sample rows per value of the `author__location` column.
test_df.author__location.value_counts()

author__location
New York                      3
El Salvador                   3
Alacant                       1
üåé                             1
Metapan, El Salvador          1
Chalatenango, El Salvador     1
Skara, Sverige                1
El Salvador, La libertad.     1
San Francisco, CA             1
‚öì                             1
Washington, DC                1
los angeles                   1
Unknown                       1
San Salvador, El Salvador     1
Madre Teresa de Calcuta       1
Name: count, dtype: int64

In [None]:
# Run the cleaning pipeline on the sample file.
test_df_preprocessed = clean_pipeline(test_df)

In [None]:
# Inspect the cleaned sample, including the new `processed_text` column.
test_df_preprocessed.head()

Unnamed: 0,url,createdAt,id,viewCount,lang,author__createdAt,author__location,text,processed_text
0,https://x.com/1_prefer_not_to/status/1741077093159428335,Sat Dec 30 12:41:41 +0000 2023,1741077093159428335,115,und,2012-10-16,,@PulsoCiudadanos @fulloa51 @nuevasideas ... https://t.co/y5jx6OKJwt,pulsociudadan fulloa51 nuevaside https t co y5jx6okjwt
1,https://x.com/GeoMartinezH/status/1741014382958576094,Sat Dec 30 08:32:30 +0000 2023,1741014382958576094,174,es,2011-03-11,Alacant,"@PulsoCiudadanos @fulloa51 @nuevasideas Pero si el que no est√° a la altura es ud @fulloa51 deje de llamar ‚Äúcaballito de batalla‚Äù a una aberrante serie de inconstitucionalidades, creadas a su antoj...",pulsociudadan fulloa51 nuevaside si altur ud fulloa51 dej llam caballit batall aberr seri inconstitucional cre antoj pod esper cabr aplaud person supuest estudi derech
2,https://x.com/Aragornius1/status/1741003742777061463,Sat Dec 30 07:50:13 +0000 2023,1741003742777061463,318,es,2019-02-23,üåé,"@PulsoCiudadanos @fulloa51 @nuevasideas Lo dice el cobarde que bloquea cuando se le habla con hechos, cuando se le dice lo hip√≥crita que es con su discurso demagogo. \n\n@fulloa51 eres un traidor ...",pulsociudadan fulloa51 nuevaside dec cobard bloque habl hech dec hipocrit discurs demagog fulloa51 traidor peor gusan arrastr
3,https://x.com/JuanMaz62302560/status/1740926933062603138,Sat Dec 30 02:45:00 +0000 2023,1740926933062603138,474,es,2021-12-17,"Metapan, El Salvador",@PulsoCiudadanos @fulloa51 @nuevasideas Me gustan las respuestas que da nuestro candidato a vicepresidente \nTap√°ndole la boca a este medio chayotero,pulsociudadan fulloa51 nuevaside gust respuest dar candidat vicepresident tapandol boc medi chayoter
4,https://x.com/MarceloSolSol/status/1740937998248886342,Sat Dec 30 03:28:58 +0000 2023,1740937998248886342,643,es,2022-09-24,,"@PulsoCiudadanos @fulloa51 @nuevasideas No hay con quien debatir imag√≠nese Beto Cuma , El sopa de patas , El de apellido parada , etc el vicepresidente solo el los les ganar√≠a en un debate , pe...",pulsociudadan fulloa51 nuevaside debat imagines bet cum sop pat apell par etc vicepresident sol gan debat perd tiemp val pen


In [None]:
# Folder the raw comment exports are read from.
drive = "/content/drive/MyDrive/IREX_ES/Twitter/Comments/"
# Folder the preprocessed CSV files are written to.
preprocessed_drive_loc = "/content/drive/MyDrive/IREX_ES/preprocessed/Twitter"

In [None]:
# Run the cleaning pipeline over every raw "Pulso*" export found in `drive`
# and write one CSV per input file into `preprocessed_drive_loc`.
# Make sure the output folder exists before the first write.
os.makedirs(preprocessed_drive_loc, exist_ok=True)
for file in sorted(os.listdir(drive)):
  if file.startswith("Pulso"):
    # splitext removes exactly the extension. str.rstrip('.xlsx') instead
    # strips any trailing '.', 'x', 'l' or 's' characters, so a name such as
    # "Pulso_replies.xlsx" would be cut down to "Pulso_replie".
    file_name = os.path.splitext(file)[0]
    df = pd.read_excel(os.path.join(drive, file))
    df = clean_pipeline(df)
    write_path = os.path.join(preprocessed_drive_loc, file_name + ".csv")
    df.to_csv(write_path)





Remove tweets in languages other than Spanish (`es`); English (`en`) tweets are to be translated only if relevant.

Filter by author location: El Salvador.

Process the text.

Run the code in a loop to read the raw files.

Save the output in the preprocessed folder.

Remove the less important columns.






