In [None]:
!pip install spacy textblob vaderSentiment scikit-learn pandas
!python -m spacy download es_core_news_sm
!pip install git+https://github.com/openai/whisper.git
!apt update && apt install -y ffmpeg
!pip install spacy sklearn
!python -m spacy download es_core_news_sm
!pip install deep-translator --upgrade

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restar

In [None]:
!pip show deep-translator

Name: deep-translator
Version: 1.11.4
Summary: A flexible free and unlimited python tool to translate between different languages in a simple way using multiple translators
Home-page: https://github.com/nidhaloff/deep_translator
Author: Nidhal Baccouri
Author-email: nidhalbacc@gmail.com
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: beautifulsoup4, requests
Required-by: 


In [None]:
import whisper
import spacy
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import re
from collections import Counter
from google.colab import drive
from deep_translator import GoogleTranslator, PonsTranslator, LingueeTranslator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import spacy

In [None]:
# Paso 1: Transcripción con Whisper
def transcribe_audio(file_path):
    model = whisper.load_model("base")
    result = model.transcribe(file_path, language="es")
    return result["text"]

# Paso 2: Preprocesamiento con spaCy
def preprocess_text(text):
    nlp = spacy.load("es_core_news_sm")
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Paso 3: Análisis de sentimientos con TextBlob
def analyze_sentiment_textblob(text):
    blob = TextBlob(text)
    sentiment = blob.sentiment.polarity  # Rango de -1 a 1
    return "positivo" if sentiment > 0 else "negativo" if sentiment < 0 else "neutral"

# Paso 4: Análisis de sentimientos con VADER
def analyze_sentiment_vader(text):
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(text)
    return "positivo" if scores["compound"] > 0 else "negativo" if scores["compound"] < 0 else "neutral"

# Paso 5: Análisis de temas con LDA
def analyze_topics(text, num_topics=3):
    vectorizer = CountVectorizer(stop_words="spanish")
    doc_term_matrix = vectorizer.fit_transform([text])
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(doc_term_matrix)

    terms = vectorizer.get_feature_names_out()
    topics = []
    for idx, topic in enumerate(lda.components_):
        top_terms = [terms[i] for i in topic.argsort()[-5:]]
        topics.append(f"Tema {idx+1}: {', '.join(top_terms)}")
    return topics

# Paso 6: Resumen de ideas
def summarize_text(text, num_sentences=3):
    nlp = spacy.load("es_core_news_sm")
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]
    sentence_scores = Counter()

    for sent in sentences:
        for word in preprocess_text(sent).split():
            sentence_scores[sent] += 1

    summary = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
    return "\n".join(summary)

# Paso 7: Entrenar modelo SVM para análisis de sentimientos
def train_svm_model():
    data = pd.DataFrame({
        "text": ["Me encanta este producto", "Odio este servicio", "Es simplemente aceptable", "Muy bueno", "Terrible"],
        "sentiment": ["positivo", "negativo", "neutral", "positivo", "negativo"]
    })
    data["processed_text"] = data["text"].apply(preprocess_text)

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data["processed_text"])
    y = data["sentiment"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = SVC(kernel="linear")
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    print(classification_report(y_test, predictions))

    return model, vectorizer

# Cargar el modelo de spaCy para español
nlp = spacy.load("es_core_news_sm")

# Lista personalizada de stop words en español
spanish_stop_words = nlp.Defaults.stop_words

# Función para preprocesar texto y eliminar stop words en español
def preprocess_text_spanish(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.lemma_ not in spanish_stop_words and not token.is_punct]
    return " ".join(tokens)

# Análisis de temas en español
def analyze_topics_spanish(text, num_topics=3):
    vectorizer = CountVectorizer(stop_words=spanish_stop_words)
    doc_term_matrix = vectorizer.fit_transform([text])
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(doc_term_matrix)

    terms = vectorizer.get_feature_names_out()
    topics = []
    for idx, topic in enumerate(lda.components_):
        top_terms = [terms[i] for i in topic.argsort()[-5:]]
        topics.append(f"Tema {idx+1}: {', '.join(top_terms)}")
    return topics

# Traducción y análisis de temas en inglés
def translate_and_analyze_topics(text, num_topics=3):
    vectorizer = CountVectorizer(stop_words="english")
    doc_term_matrix = vectorizer.fit_transform([text])
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=0)
    lda.fit(doc_term_matrix)

    terms = vectorizer.get_feature_names_out()
    topics = []
    for idx, topic in enumerate(lda.components_):
        top_terms = [terms[i] for i in topic.argsort()[-5:]]
        topics.append(f"Tema {idx+1}: {', '.join(top_terms)}")
    return topics, translations

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
audio_file_path = '/content/drive/MyDrive/YouTube Audios/CHINOLO44_1_ffmpeg.mp3'

In [None]:
# Transcripción
text = transcribe_audio(audio_file_path)
print(f"Transcripción:\n{text}")

# Preprocesamiento
processed_text = preprocess_text(text)

100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 183MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Transcripción:
 que pasa chavales bienvenidos a chinolo bueno noticia triste no noticia triste me acabo de enterar y digo joder yo tengo que hacer un vídeo de Antonio no todo improvisación no tengo nada no tengo nada notado porque me acabo de enterar como ya os digo y la verdad me ha puesto hasta un poco triste no la noticia porque para los que llevamos siguiendo en los muchos años etcétera Antonio siempre ha sido un tío que estaba ahí en el sistema digamos no hasta o en la hasta o en la pomada y me ha puesto hasta triste no vamos a dedicar unas palabras ha improvisado un poco he pensado de cosas buenas y cosas malas no porque ya os digo que me ha puesto hasta un poco triste se van los mejores no entonces bueno lo primero que hay que yo el titular no el titular lo diría de ganar wols a una prejuvilación anticipada no o sea hace de un mes estaba diciendo que iba a ganar la lequí y va a ganar wols bueno al final la realidad es la que es no se puede luchar contra esos chavales supongo que

In [None]:
# Análisis de sentimientos
sentiment_textblob = analyze_sentiment_textblob(processed_text)
sentiment_vader = analyze_sentiment_vader(processed_text)

# Análisis de temas
topics = analyze_topics(processed_text)

# Resumen de ideas
summary = summarize_text(text)

# Entrenamiento del modelo SVM
svm_model, vectorizer = train_svm_model()
sentiment_svm = svm_model.predict(vectorizer.transform([processed_text]))[0]

# Resultados
print("\nResultados del análisis de sentimientos:")
print(f"TextBlob: {sentiment_textblob}")
print(f"VADER: {sentiment_vader}")
print(f"SVM: {sentiment_svm}")

print("\nTemas principales:")
for topic in topics:
  print(topic)

print("\nResumen del texto:")
print(summary)

InvalidParameterError: The 'stop_words' parameter of CountVectorizer must be a str among {'english'}, an instance of 'list' or None. Got 'spanish' instead.

In [None]:
# Dividir texto en fragmentos
def split_text(text, chunk_size=100):
    words = text.split()
    return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

# Traducir fragmentos y unir
def translate_text_in_chunks(text, translator="Google"):
    chunks = split_text(text, chunk_size=50)
    translated_chunks = []

    if translator == "Google":
        for chunk in chunks:
            translated_chunks.append(GoogleTranslator(source="es", target="en").translate(chunk))
    elif translator == "Pons":
        for chunk in chunks:
            translated_chunks.append(PonsTranslator(source="es", target="en").translate(chunk))
    elif translator == "Linguee":
        for chunk in chunks:
            translated_chunks.append(LingueeTranslator(source="es", target="en").translate(chunk))
    else:
        raise ValueError("Translator not supported")

    return ' '.join(translated_chunks)

# Uso
translated_text_google = translate_text_in_chunks(text, translator="Google")
translated_text_pons = translate_text_in_chunks(text, translator="Pons")
translated_text_linguee = translate_text_in_chunks(text, translator="Linguee")

# Comparar traducciones
print("\nGoogle Translator:\n", translated_text_google)
print("\nPons Translator:\n", translated_text_pons)
print("\nLinguee Translator:\n", translated_text_linguee)


In [None]:
# Opción 2: Traducción y análisis en inglés
print("\nAnálisis de temas en inglés:")
topics_english, translations = translate_and_analyze_topics(audio_file_path)
for topic in topics_english:
    print(topic)


Análisis de temas en inglés:


NotValidLength: /content/drive/MyDrive/YouTube Audios/CHINOLO44_1_ffmpeg.mp3 --> Text length need to be between 0 and 50 characters

In [None]:
# Opción 1: Análisis en español
print("Análisis de temas en español:")
processed_text = preprocess_text_spanish(audio_file_path)
topics_spanish = analyze_topics_spanish(processed_text)
for topic in topics_spanish:
    print(topic)

Análisis de temas en español:
Tema 1: content, drive, mp3, mydrive, youtube
Tema 2: content, drive, mp3, mydrive, youtube
Tema 3: content, drive, mp3, mydrive, youtube
Tema 4: content, drive, mp3, mydrive, youtube
Tema 5: content, drive, mp3, mydrive, youtube


In [None]:
# Dividir texto en fragmentos
def split_text(text, chunk_size=100):
    words = text.split()
    return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

# Traducir fragmentos y unir
def translate_text_in_chunks(text, translator):
    chunks = split_text(text, chunk_size=200)
    translated_chunks = []

    if translator == "Google":
        for chunk in chunks:
            translated_chunks.append(GoogleTranslator(source="es", target="en").translate(chunk))
    elif translator == "Pons":
        for chunk in chunks:
            translated_chunks.append(PonsTranslator(source="es", target="en").translate(chunk))
    elif translator == "Linguee":
        for chunk in chunks:
            translated_chunks.append(LingueeTranslator(source="es", target="en").translate(chunk))
    else:
        raise ValueError("Translator not supported")

    return ' '.join(translated_chunks)

texto = split_text(processed_text)
texto

['pasar chaval bienvenido chinolo noticia triste noticia triste acabar enterar decir joder vídeo Antonio improvisación notado acabar enterar decir poner triste noticia llevar seguir año etcétera Antonio tío sistema decir pomada poner triste dedicar palabra improvisar pensar cosa cosa mala decir poner triste mejor titular titular decir ganar wols prejuvilación anticipado mes decir ir ganar lequí ganar wols realidad luchar chaval suponer vídeo contar pena chaval pena tiempo ser joven año salir tío peluquilla internet hacer gracia faltar cocida chavalo tontico caer gracia caer gracia comunidad querer comunidad comunidad acabar queriéndolo faltar amor genuino comunidad salir personaje personaje metido calzador',
 'gente gente Antonio nocent chavarín chavarín decir metro metro sesenta faltar cocida mágico caer gracia hombre olvidar olvidar aneth dota aneth dota antonio sabéis aneth dota famoso antonio limpiar cara toalla yizú limpiar culo tipo aneth dota regar alguien antonio lavar cara cog

In [None]:
translate_and_analyze_topics

In [None]:
# Uso
translated_text_google = translate_text_in_chunks(text, translator="Google")


# Comparar traducciones
print("\nGoogle Translator:\n", translated_text_google)


Google Translator:
 What's up guys welcome to Chinolo well sad news no sad news I just found out and I say fuck I have to make a video of Antonio not all improvisation I have nothing I have nothing noticed because I just found out as I already told you and the truth is I It has even made me a little sad, the news because for those of us who have been following Antonio for many years etcetera, he has always been a guy who was there in the system, let's say, not until or in the middle of it, and it has even made me sad, no. We are going to dedicate a few words, I have improvised a little, I have thought of good things and bad things, no, because as I told you, it has even made me a little sad, the best are leaving, no, well, the first thing is that I would say the headline, no, the headline of winning wols to an early retirement no, that is, a month ago I was saying that I was going to win the lequí and wols is going to win, well in the end the reality is what it is, you can't fight aga

In [None]:
# Convertir las stop words a lista
spanish_stop_words = list(nlp.Defaults.stop_words)

# Reemplazar en el análisis de temas
def analyze_topics_spanish(text, num_topics=5):
    vectorizer = CountVectorizer(stop_words=spanish_stop_words)
    doc_term_matrix = vectorizer.fit_transform([text])
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=0)
    lda.fit(doc_term_matrix)

    terms = vectorizer.get_feature_names_out()
    topics = []
    for idx, topic in enumerate(lda.components_):
        top_terms = [terms[i] for i in topic.argsort()[-5:]]
        topics.append(f"Tema {idx+1}: {', '.join(top_terms)}")
    return topics

In [None]:
analyze_topics_spanish(processed_text)

['Tema 1: furbolista, gachiesa, gapéada, genuino, jubilar',
 'Tema 2: furbolista, gachiesa, gapéada, genuino, jubilar',
 'Tema 3: furbolista, gachiesa, gapéada, genuino, jubilar',
 'Tema 4: furbolista, gachiesa, gapéada, genuino, jubilar',
 'Tema 5: persona, jugar, acabar, chaval, antonio']