In [29]:
import os
import json
import numpy as np
from datetime import datetime
import shutil
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import librosa
import cv2
from PIL import Image
import pandas as pd
from sklearn.preprocessing import StandardScaler
import re
from textblob import TextBlob
from langdetect import detect
import pickle
import hashlib
from concurrent.futures import ThreadPoolExecutor
import logging
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import soundfile as sf
import threading
import traceback

# Download the NLTK resources this notebook needs
def download_nltk_resources():
    """Download the NLTK data packages used by this notebook.

    Requests 'punkt', 'stopwords', 'wordnet' and 'omw-1.4', in that order,
    with ``quiet=True`` so the downloader prints no progress output.

    Returns:
        None. The value returned by each ``nltk.download`` call is ignored.
    """
    for package_id in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
        nltk.download(package_id, quiet=True)
# Fetch the NLTK data as soon as this cell runs, before any later cell uses it.
download_nltk_resources()

In [30]:
class Config:
    """Filesystem layout and logging setup for the preprocessing pipeline.

    Instantiating the class computes every path, creates the directories on
    disk and configures the root logger.

    Attributes:
        DESKTOP_PATH: Base directory that contains ``jarvis_data``.
        JARVIS_DATA_PATH: ``<base>/jarvis_data``; parent of everything below.
        DATA_BD_PATH: ``jarvis_data/databd``.
        PREPROCESSED_PATH: ``jarvis_data/preprocessed``.
        CACHE_PATH: ``jarvis_data/cache``.
        CURRENT_DATE: Date of instantiation formatted as ``YYYY-MM-DD``.
        CURRENT_PREPROCESSED_PATH: ``preprocessed/<CURRENT_DATE>``.
    """

    def __init__(self, base_path=None):
        """Compute the paths, create the directories and set up logging.

        Args:
            base_path: Directory under which ``jarvis_data`` is created. A
                leading ``~`` is expanded. Defaults to ``~/Desktop``, which is
                the location this class used before the parameter existed.
        """
        if base_path is None:
            base_path = "~/Desktop"
        # Attribute name kept as DESKTOP_PATH so existing readers keep working.
        self.DESKTOP_PATH = os.path.expanduser(os.fspath(base_path))
        self.JARVIS_DATA_PATH = os.path.join(self.DESKTOP_PATH, "jarvis_data")
        self.DATA_BD_PATH = os.path.join(self.JARVIS_DATA_PATH, "databd")
        self.PREPROCESSED_PATH = os.path.join(self.JARVIS_DATA_PATH, "preprocessed")
        self.CACHE_PATH = os.path.join(self.JARVIS_DATA_PATH, "cache")
        # The date is captured once, so the dated folder and the log file name
        # stay in agreement for the lifetime of this instance.
        self.CURRENT_DATE = datetime.now().strftime("%Y-%m-%d")
        self.CURRENT_PREPROCESSED_PATH = os.path.join(self.PREPROCESSED_PATH, self.CURRENT_DATE)
        self.create_directories()
        self.setup_logging()

    def create_directories(self):
        """Create every directory of the layout; existing ones are left alone."""
        directories = [
            self.JARVIS_DATA_PATH,
            self.DATA_BD_PATH,
            self.PREPROCESSED_PATH,
            self.CURRENT_PREPROCESSED_PATH,
            self.CACHE_PATH
        ]
        for directory in directories:
            os.makedirs(directory, exist_ok=True)

    def setup_logging(self):
        """Send DEBUG-and-above records to a dated log file and to the console.

        The log file lives in ``jarvis_data/logs``. ``logging.basicConfig``
        does nothing when the root logger already has handlers, so creating a
        second ``Config`` in the same process does not duplicate handlers.
        """
        logs_dir = os.path.join(self.JARVIS_DATA_PATH, "logs")
        os.makedirs(logs_dir, exist_ok=True)
        log_file = os.path.join(logs_dir, f'jarvis_preprocessing_{self.CURRENT_DATE}.log')
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file, encoding='utf-8'),
                logging.StreamHandler()
            ]
        )


In [31]:
class CacheManager:
    """Pickle-backed dictionary cache keyed by the MD5 of ``str(data)``.

    The whole cache is read from ``<config.CACHE_PATH>/preprocessing_cache.pkl``
    on construction and rewritten to that file after every ``add_to_cache``.
    """

    def __init__(self, config):
        """
        Args:
            config: Object exposing a ``CACHE_PATH`` attribute (the directory
                that holds the cache file).
        """
        self.config = config
        self.cache_dir = config.CACHE_PATH
        self.cache = {}
        self.load_cache()

    def _cache_file(self):
        """Return the full path of the pickle file that stores the cache."""
        return os.path.join(self.cache_dir, 'preprocessing_cache.pkl')

    def get_hash(self, data):
        """Return the hex MD5 digest of ``str(data)``; used as the cache key."""
        return hashlib.md5(str(data).encode()).hexdigest()

    def load_cache(self):
        """Load the cache from disk if the cache file exists.

        A file that cannot be unpickled (for example one truncated by an
        interrupted write) or that does not contain a dict is logged and
        ignored; ``self.cache`` then keeps its current content instead of the
        error propagating to the caller. I/O errors such as a permission
        problem still propagate.
        """
        cache_file = self._cache_file()
        if not os.path.exists(cache_file):
            return
        try:
            with open(cache_file, 'rb') as f:
                # SECURITY: pickle.load can execute arbitrary code. Only point
                # CACHE_PATH at a directory whose contents you trust.
                loaded = pickle.load(f)
        except (pickle.UnpicklingError, EOFError, AttributeError,
                ImportError, IndexError, ValueError) as exc:
            logging.warning("Ignoring unreadable cache file %s: %s", cache_file, exc)
            return
        if not isinstance(loaded, dict):
            logging.warning(
                "Ignoring cache file %s: expected a dict, found %s",
                cache_file, type(loaded).__name__,
            )
            return
        self.cache = loaded

    def save_cache(self):
        """Write the whole cache to disk.

        The data is first written to a sibling ``.tmp`` file and then moved
        over the real file with ``os.replace``, so a crash in the middle of
        the write leaves the previous cache file intact.
        """
        cache_file = self._cache_file()
        os.makedirs(self.cache_dir, exist_ok=True)
        tmp_file = cache_file + '.tmp'
        try:
            with open(tmp_file, 'wb') as f:
                pickle.dump(self.cache, f)
            os.replace(tmp_file, cache_file)
        finally:
            # Only still present when the dump or the replace failed.
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

    def get_cached(self, data):
        """Return the value cached for ``data``, or ``None`` when absent."""
        data_hash = self.get_hash(data)
        return self.cache.get(data_hash)

    def add_to_cache(self, data, processed_data):
        """Store ``processed_data`` under the key of ``data`` and persist.

        Note that every call rewrites the complete cache file.
        """
        data_hash = self.get_hash(data)
        self.cache[data_hash] = processed_data
        self.save_cache()


In [32]:
class EmotionalFeatureExtractor:
    """Collects sentiment-related features for a piece of text.

    Uses TextBlob for a polarity score and the spaCy model
    ``es_core_news_lg`` to parse the text. The entity and context analyses
    are placeholders that currently return empty dicts.
    """

    def __init__(self):
        """Load the spaCy model, downloading it first if it is not installed.

        Raises:
            OSError: If the model still cannot be loaded after the download
                attempt.
        """
        self.nlp = self._load_spacy_model('es_core_news_lg')
        self.context_memory = {}

    @staticmethod
    def _load_spacy_model(model_name):
        """Return ``spacy.load(model_name)``, installing the model on demand."""
        # Local imports: only needed on the rare "model missing" path.
        import subprocess
        import sys

        try:
            return spacy.load(model_name)
        except OSError:
            # Use the interpreter that is running this code. A bare "python"
            # may resolve to a different environment than the notebook kernel,
            # which would install the model where spaCy cannot see it.
            completed = subprocess.run(
                [sys.executable, '-m', 'spacy', 'download', model_name]
            )
            if completed.returncode != 0:
                logging.error(
                    "Downloading spaCy model %s failed with exit code %s",
                    model_name, completed.returncode,
                )
            # Raises OSError again if the download did not help.
            return spacy.load(model_name)

    def extract_emotional_features(self, text):
        """Build the feature dict for ``text``.

        Args:
            text: The text to analyse.

        Returns:
            dict with ``"basic_analysis"`` (TextBlob polarity) and, when a
            spaCy pipeline is available, ``"entity_emotions"`` and
            ``"emotional_context"``.
        """
        features = {"basic_analysis": TextBlob(text).sentiment.polarity}
        if self.nlp:
            doc = self.nlp(text)
            features.update({
                "entity_emotions": self._analyze_entity_emotions(doc),
                "emotional_context": self._analyze_emotional_context(text)
            })
        return features

    def _analyze_entity_emotions(self, doc):
        """Placeholder: returns an empty dict for any parsed document."""
        return {}

    def _analyze_emotional_context(self, text):
        """Placeholder: returns an empty dict for any text."""
        return {}


In [33]:
class AdvancedDataPreprocessor:
    """Cleans and lemmatizes the ``.txt`` / ``.json`` files of the input folders.

    Attributes:
        config: The configuration dict passed in (``{}`` when omitted).
        input_folders: ``config['data_paths']``; folders scanned recursively.
        vectorizer: A ``TfidfVectorizer`` instance (created but not used by
            the methods of this class).
        processed_data: Result of the last ``process_files_parallel`` call,
            ``None`` before the first call.
        stop_words: Union of NLTK's Spanish and English stop words.
        nlp: The spaCy pipeline ``es_core_news_sm``.
    """

    def __init__(self, config=None):
        """
        Args:
            config: Optional dict; ``config['data_paths']`` lists the folders
                to process.
        """
        self.config = config or {}
        self.input_folders = self.config.get('data_paths', [])
        self.vectorizer = TfidfVectorizer()
        self.processed_data = None
        self.setup_tools()

    def setup_tools(self):
        """Prepare the NLTK stop-word set and load the spaCy model.

        The spaCy model is downloaded on demand when it is not installed.

        Raises:
            OSError: If the model still cannot be loaded after the download
                attempt.
        """
        # Local imports: only needed on the rare "model missing" path.
        import subprocess
        import sys

        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        self.stop_words = set(stopwords.words('spanish') + stopwords.words('english'))
        model_name = 'es_core_news_sm'
        try:
            self.nlp = spacy.load(model_name)
        except OSError:
            # Use the interpreter that is running this code. A bare "python"
            # may resolve to a different environment than the notebook kernel.
            completed = subprocess.run(
                [sys.executable, '-m', 'spacy', 'download', model_name]
            )
            if completed.returncode != 0:
                logging.error(
                    "Downloading spaCy model %s failed with exit code %s",
                    model_name, completed.returncode,
                )
            # Raises OSError again if the download did not help.
            self.nlp = spacy.load(model_name)

    def preprocess_text(self, text):
        """Normalize ``text`` and return its lemmas joined by single spaces.

        Steps: lowercase, drop every character that is neither a word
        character nor whitespace, drop digit runs, tokenize with NLTK, remove
        stop words, lemmatize with spaCy.
        """
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)
        tokens = word_tokenize(text)
        tokens = [token for token in tokens if token not in self.stop_words]
        doc = self.nlp(' '.join(tokens))
        return ' '.join([token.lemma_ for token in doc])

    def process_files_parallel(self):
        """Preprocess the matching files of every input folder in a thread pool.

        All folders are processed (earlier versions returned after the first
        folder). Results follow the order of ``input_folders`` and, within a
        folder, the order in which ``os.walk`` yields the files.

        Returns:
            list[str]: One preprocessed text per file; ``[]`` when there is
            nothing to process. The same list is stored in
            ``self.processed_data``.

        Raises:
            Whatever ``_process_file`` raises for a file (for example
            ``UnicodeDecodeError`` or ``OSError``); the error is not
            swallowed.
        """
        files = [
            file_path
            for input_folder in self.input_folders
            for file_path in self._get_files_to_process(input_folder)
        ]
        if files:
            with ThreadPoolExecutor(max_workers=os.cpu_count() or 4) as executor:
                results = list(executor.map(self._process_file, files))
        else:
            results = []
        self.processed_data = results
        return results

    def _get_files_to_process(self, folder):
        """Return the paths of all ``.txt`` and ``.json`` files under ``folder``."""
        return [
            os.path.join(root, file)
            for root, _, files in os.walk(folder)
            for file in files if file.endswith(('.txt', '.json'))
        ]

    def _process_file(self, file_path):
        """Read ``file_path`` as UTF-8 text and return its preprocessed form.

        ``.json`` files are read as raw text, not parsed.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            return self.preprocess_text(f.read())


In [34]:
# Entry point: runs when the cell/script is executed as the main module.
if __name__ == "__main__":
    # Config() computes the paths, creates the directories and sets up logging.
    config_instance = Config()
    # The databd folder is the only input folder handed to the preprocessor.
    preprocessor = AdvancedDataPreprocessor({'data_paths': [config_instance.DATA_BD_PATH]})
    # NOTE(review): the return value is discarded here — confirm whether the
    # processed texts should be kept in a variable or written to disk.
    preprocessor.process_files_parallel()
