In [9]:
import re
import pandas as pd
import nltk
nltk.download('punkt')

# Load the dataset (make sure the CSV file is available at this path)
data = pd.read_csv('test.csv')

# Function to clean the subtitles
def clean_subtitles(subtitles):
    """Normalise one raw subtitle track into lowercase, punctuation-free text.

    Steps, in order: drop bracketed tags such as '[Aplausos]', drop the
    'WEBVTT ... Language: es' header, lowercase, strip punctuation.

    Parameters
    ----------
    subtitles : str
        Raw subtitle text. Any non-string value (e.g. the float NaN pandas
        uses for an empty CSV cell, or None) is treated as "no subtitles".

    Returns
    -------
    str
        The cleaned text; '' when the input is not a string.
    """
    # Empty CSV cells arrive as float NaN; re.sub would raise TypeError on them.
    if not isinstance(subtitles, str):
        return ''

    clean_text = re.sub(r'\[.*?\]', '', subtitles)  # Remove tags like '[Aplausos]'
    # Remove the metadata header. DOTALL lets '.' cross the line breaks between
    # 'WEBVTT' and 'Language: es'; the non-greedy '.*?' stops at the first
    # 'Language: es' so real subtitle text is never swallowed.
    clean_text = re.sub(r'WEBVTT.*?Language: es', '', clean_text, flags=re.DOTALL)
    clean_text = clean_text.lower()  # Convert to lowercase
    # Remove punctuation; \w is Unicode-aware, so accented letters are kept.
    clean_text = re.sub(r'[^\w\s]', '', clean_text)
    return clean_text

# Regular expressions used to extract key match events.
# They are applied to cleaned text (lowercase, punctuation removed, accents
# kept), so words that carry an accent in Spanish accept both the accented and
# the unaccented spelling. Each pattern keeps exactly one capturing group so
# re.findall returns one string per match. Key order matters: downstream code
# turns the per-event counts into a positional label vector.
event_patterns = {
    'goal': r'\b(gol|marca|anota)\b',
    'yellow_card': r'\b(tarjeta amarilla|amonestado|amonestar)\b',
    'red_card': r'\b(tarjeta roja|expulsado|expulsi[oó]n|expulsar)\b',
    'corner': r'\b(tiro de esquina|c[oó]rner)\b',
    'offside': r'\b(fuera de juego|offside)\b',
    'foul': r'\b(falta|infracci[oó]n)\b',
    # 'tiro de esquina' is already counted as a corner; do not count it as a shot too.
    'shot': r'\b(tiro(?! de esquina)|disparo)\b',
    'penalty': r'\b(penal(?:ty|ti)?)\b'
}

# Function to detect events
def detect_events(cleaned_text, event_patterns):
    """Count how many times each event pattern occurs in a text.

    Parameters
    ----------
    cleaned_text : str
        Text to scan.
    event_patterns : dict
        Maps an event name to a regular-expression pattern.

    Returns
    -------
    dict
        Maps each event name (same order as ``event_patterns``) to the number
        of non-overlapping matches of its pattern in ``cleaned_text``.
    """
    return {
        name: sum(1 for _ in re.finditer(regex, cleaned_text))
        for name, regex in event_patterns.items()
    }

# Apply the cleaning step to every raw subtitle track, then count the events
# found in each cleaned track (one dict of counts per row).
data['Cleaned_Subtitles'] = data['Subtitles'].apply(clean_subtitles)
data['Expanded_Event_Counts'] = data['Cleaned_Subtitles'].apply(
    detect_events, args=(event_patterns,)
)

# Show the results
data[['Video URL', 'Expanded_Event_Counts']].head()


Unnamed: 0,Video URL,Expanded_Event_Counts
0,https://www.youtube.com/watch?v=iyVECkH86Mw,"{'goal': 66, 'yellow_card': 1, 'red_card': 1, ..."
1,https://www.youtube.com/watch?v=Iel1cl5aefk,"{'goal': 13, 'yellow_card': 0, 'red_card': 0, ..."
2,https://www.youtube.com/watch?v=mUYahKeNf7E,"{'goal': 67, 'yellow_card': 7, 'red_card': 1, ..."
3,https://www.youtube.com/watch?v=USNfMwo1i30,"{'goal': 36, 'yellow_card': 4, 'red_card': 0, ..."
4,https://www.youtube.com/watch?v=8klVIK4Lwbc,"{'goal': 24, 'yellow_card': 0, 'red_card': 0, ..."


In [10]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# and batch shuffling repeat across runs (the train/test split was already seeded).
random_seed = 42
set_random_seed(random_seed)

# Prepare text data and labels.
# Each label row is the list of counts taken from that row's event dict, in the
# dict's own key order.
texts = data['Cleaned_Subtitles'].tolist()
labels = data['Expanded_Event_Counts'].apply(lambda x: list(x.values())).tolist()

# Tokenize the text data. vocab_size is shared by the tokenizer and the
# embedding layer so the two values cannot drift apart.
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Pad/truncate sequences to a uniform input size.
# NOTE(review): pad_sequences truncates from the front by default, so a
# transcript longer than max_sequence_length keeps only its last tokens while
# the labels were counted over the whole transcript — confirm this is intended.
max_sequence_length = 1000
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Convert labels to a 2-D table: one row per video, one column per event.
Y = pd.DataFrame(labels)

# Split the dataset
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=random_seed
)

# Build the LSTM model.
# `input_length` is intentionally not passed to Embedding: it is deprecated in
# Keras 3 and the sequence length is inferred from the data on the first call.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(event_patterns), activation='linear'))  # Multi-output regression

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model.
# NOTE(review): the test split doubles as validation data here, so the
# evaluation below reports the same numbers as the last epoch's val_* metrics
# and is not an independent estimate.
history = model.fit(X_train, Y_train, epochs=5, batch_size=32, validation_data=(X_test, Y_test))

# Evaluate the model
loss, mae = model.evaluate(X_test, Y_test)

# Predictions
predictions = model.predict(X_test)

# Display predictions and actual values side by side for comparison
comparison_df = pd.DataFrame({'Actual': Y_test.values.tolist(), 'Predicted': predictions.tolist()})
comparison_df.head()


Epoch 1/5




[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 375ms/step - loss: 309.4113 - mae: 9.7861 - val_loss: 272.9648 - val_mae: 9.1799
Epoch 2/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 269ms/step - loss: 315.0133 - mae: 9.5821 - val_loss: 271.6274 - val_mae: 9.1418
Epoch 3/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 256ms/step - loss: 287.6452 - mae: 9.3312 - val_loss: 264.0396 - val_mae: 8.9233
Epoch 4/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 258ms/step - loss: 310.1055 - mae: 9.4119 - val_loss: 229.6618 - val_mae: 8.0625
Epoch 5/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 252ms/step - loss: 237.8437 - mae: 8.2245 - val_loss: 211.1994 - val_mae: 7.7645
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - loss: 211.1994 - mae: 7.7645
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step
                         Actual  \
0    [11, 3, 0, 0, 5, 0, 13, 5]   