In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


def extract_and_save_titles(input_file, output_file, num_rows=2000):
    """Copy the leading rows of the 'Title' column into a new CSV file.

    Args:
        input_file: Path of the source CSV; it is decoded as latin1.
        output_file: Path of the CSV to write. It contains a single
            'Title' column and no index column.
        num_rows: Number of leading rows to keep (passed to
            ``DataFrame.head``).
    """
    source = pd.read_csv(input_file, encoding="latin1")
    leading_rows = source.head(num_rows)
    # Double brackets keep the selection a one-column DataFrame.
    titles = leading_rows[["Title"]]
    titles.to_csv(output_file, index=False)

    message = f"Finished extracting the first {num_rows} rows of the column 'Title' and saved to '{output_file}'."
    print(message)


class TextAnalysis:
    """IDF-based vocabulary analysis of the ``Translated_Texts`` CSV column.

    Constructing an instance runs the whole pipeline: load the column,
    remove stop words and lemmatize the remaining tokens, fit a TF-IDF
    vectorizer to obtain one IDF weight per term, and finally strip from
    every text the terms whose IDF is below the vocabulary mean.

    Attributes populated by the pipeline:
        df: pandas Series holding the (progressively cleaned) texts.
        idf_dict: mapping of vocabulary term -> IDF weight.
        average_idf: mean of all IDF weights in the vocabulary.

    NOTE(review): the stop-word list is Spanish while ``WordNetLemmatizer``
    is presumably backed by the English WordNet -- confirm which language
    the ``Translated_Texts`` column actually contains.
    """

    def __init__(self, filepath):
        """Load ``filepath`` and immediately run every analysis step.

        Args:
            filepath: Path of a CSV file with a ``Translated_Texts`` column.
        """
        self.filepath = filepath
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = nltk.corpus.stopwords.words("spanish")
        self.vectorizer = TfidfVectorizer(use_idf=True)
        self.read_data()
        self.preprocess_and_translate_text()
        self.calculate_idf()
        self.filter_below_average_idf()

    def read_data(self):
        """Read the ``Translated_Texts`` column of the CSV into ``self.df``."""
        self.df = pd.read_csv(self.filepath)["Translated_Texts"]

    def preprocess_and_translate_text(self):
        """Tokenize, lower-case, drop stop words and lemmatize every text.

        NOTE(review): despite the method name, nothing is translated here;
        the input column is presumably translated upstream -- confirm.
        """
        # Build the set once: membership tests on the raw list are O(n) per
        # token. ``self.stop_words`` itself stays a list for any caller that
        # relies on it.
        stop_words = set(self.stop_words)
        lemmatize = self.lemmatizer.lemmatize

        def process(text):
            # Empty CSV cells are read as NaN (a float), which the tokenizer
            # cannot handle; treat any non-string cell as empty text.
            if not isinstance(text, str):
                return ""
            lowered = (token.lower() for token in nltk.word_tokenize(text))
            return " ".join(
                lemmatize(token) for token in lowered if token not in stop_words
            )

        self.df = self.df.apply(process)

    def calculate_idf(self):
        """Fit the vectorizer and record per-term and mean IDF weights.

        Sets ``self.idf_dict`` (term -> IDF) and ``self.average_idf``.
        """
        # Only the learned vocabulary/IDF weights are needed, so the TF-IDF
        # matrix itself is never materialised.
        self.vectorizer.fit(self.df)
        features = self.vectorizer.get_feature_names_out()
        idf = self.vectorizer.idf_
        self.idf_dict = dict(zip(features, idf))
        self.average_idf = np.mean(idf)

    def filter_below_average_idf(self):
        """Remove from every text the words whose IDF is below the mean.

        Words are obtained by whitespace splitting; a word that is not a
        key of ``self.idf_dict`` counts as IDF 0.
        """
        idf_of = self.idf_dict.get
        threshold = self.average_idf

        def filter_words(text):
            return " ".join(
                word for word in text.split() if idf_of(word, 0) >= threshold
            )

        self.df = self.df.apply(filter_words)

    def top_bottom_words(self, top_n=10, bottom_n=20):
        """Return the terms with the highest and the lowest IDF weights.

        Args:
            top_n: How many highest-IDF terms to return.
            bottom_n: How many lowest-IDF terms to return.

        Returns:
            A ``(top_words, bottom_words)`` tuple of ``(term, idf)`` lists,
            both ordered by descending IDF. A non-positive count yields an
            empty list.
        """
        sorted_idf = sorted(self.idf_dict.items(), key=lambda x: x[1], reverse=True)
        top_words = sorted_idf[:top_n] if top_n > 0 else []
        # ``sorted_idf[-0:]`` would be the *whole* list, so a zero count
        # needs explicit handling.
        bottom_words = sorted_idf[-bottom_n:] if bottom_n > 0 else []
        return top_words, bottom_words


if __name__ == "__main__":
    # Step 1: write the first rows of the 'Title' column to a smaller CSV.
    extract_and_save_titles("Questions.csv", "Questions_2000.csv")
    # Step 2: constructing the object runs the full analysis pipeline.
    analysis = TextAnalysis("TranslatedQuestions.csv")

    # Full vocabulary, ordered from highest to lowest IDF.
    print("Initial IDF Table:")
    idf_table = list(analysis.idf_dict.items())
    idf_table.sort(key=lambda entry: entry[1], reverse=True)
    print(idf_table)

    # Default call: 10 highest-IDF and 20 lowest-IDF terms.
    highest_idf, lowest_idf = analysis.top_bottom_words()
    print("\nTop 10 palabras más relevantes:")
    print(highest_idf)

    print("\nTop 20 palabras menos relevantes:")
    print(lowest_idf)

Finished extracting the first 2000 rows of the column 'Title' and saved to 'Questions_2000.csv'.
Initial IDF Table:
[('000z', 7.908255154023788), ('04', 7.908255154023788), ('0x1a', 7.908255154023788), ('100', 7.908255154023788), ('1000', 7.908255154023788), ('104', 7.908255154023788), ('1123', 7.908255154023788), ('20', 7.908255154023788), ('2006', 7.908255154023788), ('2009', 7.908255154023788), ('23', 7.908255154023788), ('301', 7.908255154023788), ('303', 7.908255154023788), ('404', 7.908255154023788), ('50', 7.908255154023788), ('500', 7.908255154023788), ('6rc2', 7.908255154023788), ('8601', 7.908255154023788), ('96', 7.908255154023788), ('__all__', 7.908255154023788), ('__dict__', 7.908255154023788), ('__future__', 7.908255154023788), ('__getattribute__', 7.908255154023788), ('__init', 7.908255154023788), ('__iter__', 7.908255154023788), ('__new__', 7.908255154023788), ('__reduce__', 7.908255154023788), ('__reduce_ex__', 7.908255154023788), ('__unicode', 7.908255154023788), ('__