In [17]:

# If necessary, install the required libraries
%pip install pandas nltk scikit-learn openpyxl matplotlib seaborn


Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-1.26.4-cp312-cp312-win_amd64.whl.metadata (61 kB)
     ---------------------------------------- 0.0/61.0 kB ? eta -:--:--
     ------ --------------------------------- 10.2/61.0 kB ? eta -:--:--
     ------------ ------------------------- 20.5/61.0 kB 165.2 kB/s eta 0:00:01
     ------------------- ------------------ 30.7/61.0 kB 187.9 kB/s eta 0:00:01
     ------------------------------- ------ 51.2/61.0 kB 262.6 kB/s eta 0:00:01
     -------------------------------------- 61.0/61.0 kB 295.3 kB/s eta 0:00:00
Downloading numpy-1.26.4-cp312-cp312-win_amd64.whl (15.5 MB)
   ---------------------------------------- 0.0/15.5 MB ? eta -:--:--
   ---------------------------------------- 0.1/15.5 MB 3.2 MB/s eta 0:00:05
   ---------------------------------------- 0.2/15.5 MB 2.1 MB/s eta 0:00:08
    --------------------------------------- 0.3/15.5 MB 2.6 MB/s eta 0:00:06
   - -------------------------------------- 0.5/15.5 MB 2.7 MB/s

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
blis 1.0.1 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
thinc 8.3.2 requires numpy<2.1.0,>=2.0.0; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.


In [2]:

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Download necessary NLTK data files (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gerar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gerar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gerar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:

# Combine Multiple Excel Files
directory = "./datasets"  # Replace with the path to your Excel files

# Create an empty list to hold the dataframes
dataframes = []

# Iterate through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".xlsx"):
        file_path = os.path.join(directory, filename)
        # Read each Excel file and append it to the list
        df = pd.read_excel(file_path)
        dataframes.append(df)

# Concatenate all the dataframes into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Export the combined dataframe to a new Excel file
combined_df.to_excel("combined_tweets.xlsx", index=False)
print("All files have been combined successfully!")


All files have been combined successfully!


In [9]:
print(combined_df)

    bookmarkCount       conversationId                       createdAt  \
0               0  1843054927339483464  Sun Oct 06 22:25:11 +0000 2024   
1               0  1843053407990698186  Sun Oct 06 22:19:09 +0000 2024   
2               0  1843026753016807813  Sun Oct 06 20:50:37 +0000 2024   
3               0  1842995428897538332  Sun Oct 06 18:28:46 +0000 2024   
4               1  1842988481137492342  Sun Oct 06 18:01:09 +0000 2024   
5               0  1842988313897992279  Sun Oct 06 18:00:29 +0000 2024   
6               0  1842221123095998617  Sun Oct 06 17:46:19 +0000 2024   
7               0  1842862145056219265  Sun Oct 06 15:53:20 +0000 2024   
8               0  1842938219169542144  Sun Oct 06 15:52:04 +0000 2024   
9               0  1842938219169542144  Sun Oct 06 15:26:35 +0000 2024   
10              0  1843091498751217932  Mon Oct 07 11:07:37 +0000 2024   
11              0  1842957845945593936  Mon Oct 07 11:00:09 +0000 2024   
12              0  1843234305428144414

In [18]:
# Load the combined Excel file
df = pd.read_excel("combined_tweets.xlsx")

# Data Cleaning Function - Remove URLs, special characters, and usernames
def clean_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions (@username)
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = text.lower()  # Convert to lowercase
    return text

# Clean the text data
df['cleaned_text'] = df['text'].apply(clean_text)  # Replace 'tweet_text' with your column name

# Tokenization
df['tokens'] = df['cleaned_text'].apply(word_tokenize)

# Removing Stopwords
stop_words = set(stopwords.words('spanish'))
df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word not in stop_words])

# Lemmatization
lemmatizer = WordNetLemmatizer()
df['lemmatized_tokens'] = df['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Join tokens back into a single string
df['final_text'] = df['lemmatized_tokens'].apply(lambda x: ' '.join(x))

# Vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features
X = vectorizer.fit_transform(df['final_text'])

# Encode Sentiment Labels (if applicable)
if 'sentiment' in df.columns:
    df['sentiment_label'] = df['sentiment'].map({'positive': 1, 'negative': 0, 'neutral': 2})
    y = df['sentiment_label']
else:
    y = None

# Save preprocessed data to Excel
processed_df = df[['final_text', 'sentiment_label']] if y is not None else df[['final_text']]
processed_df.to_excel("preprocessed_tweets.xlsx", index=False)
print("Data Preprocessing Complete!")


Data Preprocessing Complete!
