In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import libraries
import re, os
import math
import nltk
import json
from nltk.corpus import stopwords
from wordcloud import WordCloud

from helper_prabowo_ml import clean_html, remove_links, non_ascii, lower, email_address, removeStopWords, punct, remove_
import re

from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from transformers import AutoTokenizer, TFBertModel 

: 

In [None]:
def remove_nan_rows(dataframe, columns=['messages', 'sender_labels', 'receiver_labels']):
    """
    Keep only the rows that have a value in every one of the given columns.

    Parameters:
    - dataframe: pd.DataFrame
        Frame to filter. It is not modified; a new frame is returned.
    - columns: list
        Column labels that must all be non-null for a row to be kept.

    Returns:
    - pd.DataFrame
        The rows of `dataframe` in which none of `columns` is NaN/None.
        The surviving rows keep their original index labels.
    """
    required_columns = columns
    # axis=0 / how='any' are the pandas defaults, spelled out for the reader:
    # a row is discarded as soon as one required column is missing.
    filtered = dataframe.dropna(axis=0, how='any', subset=required_columns)
    return filtered


# Stop-word sets loaded so far, keyed by language. Filled lazily so the NLTK
# corpus is read once per kernel session instead of once per cleaned message.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop-word set, reading the corpus only on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def remove_tags(string, remove_special_chars=False, remove_stopwords=False, remove_newlines=False):
    """
    Normalise a raw message into lower-cased, space-separated tokens.

    Always applied, in this order: HTML tags, URLs, @mentions, #hashtags and
    digit runs are stripped; whitespace is collapsed; one-character tokens are
    dropped; the text is lower-cased.

    Parameters:
    - string: str
        Text to clean.
    - remove_special_chars: bool
        If True, replace every character that is neither a word character nor
        whitespace with a space.
    - remove_stopwords: bool
        If True, drop NLTK English stop words (case-insensitive match).
    - remove_newlines: bool
        If True, collapse all whitespace (newlines included) early. The final
        token step collapses whitespace regardless, so this flag does not
        change the returned value; it is kept for backward compatibility.

    Returns:
    - str
        The cleaned text.
    """
    result = re.sub(r'<.*?>', '', string)  # Remove HTML tags
    # URLs are removed BEFORE digits/mentions/hashtags: otherwise a URL such as
    # http://host/123/abc is cut at the digits first and '/abc' is left behind.
    result = re.sub(r'http\S+', '', result)  # Remove URLs
    result = re.sub(r'@\w+', '', result)  # Remove Twitter usernames
    result = re.sub(r'#\w+', '', result)  # Remove hashtags
    result = re.sub(r'\d+', ' ', result)  # Replace digit runs with a space

    if remove_special_chars:
        result = re.sub(r'[^\w\s]', ' ', result)  # Punctuation/symbols -> space

    if remove_newlines:
        # split() breaks on any whitespace, so this also covers '\n\n'.
        result = ' '.join(result.split())

    if remove_stopwords:
        stop_words = _english_stopwords()
        result = ' '.join(w for w in result.split() if w.lower() not in stop_words)

    # Drop one-character tokens (this also normalises whitespace).
    result = ' '.join(word for word in result.split() if len(word) > 1)

    return result.lower()

def compute_class_weight(train_y):
    """
    Compute class weights for imbalanced training labels.

    Typically passed to a neural-network training routine to weight the loss
    so that rare classes contribute more.

    Parameters:
    - train_y: array-like of numeric class labels
        Training labels, one per sample.

    Returns:
    - dict
        Maps each label present in `train_y` to its scikit-learn 'balanced'
        weight. Every integer in ``range(int(max_label))`` that does not occur
        in `train_y` is mapped to weight 1.

    Raises:
    - ValueError
        If `train_y` is empty.
    """
    # Imported locally under an alias: this function has the same name as the
    # scikit-learn function it wraps.
    import sklearn.utils.class_weight as scikit_class_weight

    # scikit-learn documents `classes` as an ndarray (as given by np.unique(y));
    # recent releases validate the parameter type and reject a plain list.
    classes = np.unique(train_y)
    if classes.size == 0:
        raise ValueError("train_y must contain at least one label")

    weights = scikit_class_weight.compute_class_weight(
        class_weight='balanced', classes=classes, y=train_y
    )

    # Start with weight 1 for every integer id below the largest label, so ids
    # that never occur in train_y still have an entry.
    class_weight = {i: 1 for i in range(int(classes.max()))}

    # Overwrite with the computed weights; `weights` is aligned with `classes`.
    class_weight.update(zip(classes.tolist(), weights))

    return class_weight

: 

In [None]:
# List-valued, per-message columns of each game record; they are exploded
# together so that every output row describes a single message.
EXPLODE_COLUMNS = [
    'messages',
    'sender_labels',
    'receiver_labels',
    'speakers',
    'receivers',
    'absolute_message_index',
    'relative_message_index',
    'seasons',
    'years',
    'game_score',
    'game_score_delta',
]


def _load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    A line that is not valid JSON is reported on stdout and skipped.
    """
    records = []
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as archivo:
        for line in archivo:
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
    return records


def _explode_messages(frame):
    """Return a frame with one row per message (fresh 0..n-1 index)."""
    return frame.explode(EXPLODE_COLUMNS, ignore_index=True)


def _clean_messages(exploded):
    """Drop rows lacking a message or label, then add the `messages_clean` column.

    `messages` is cast to str and `messages_clean` holds the output of
    `remove_tags` with special characters, stop words and newlines removed.
    `assign` builds a new frame, so the input is left untouched and no
    assignment is made onto the result of `dropna`.
    """
    kept = remove_nan_rows(exploded)
    as_text = kept['messages'].astype(str)
    return kept.assign(
        messages=as_text,
        messages_clean=as_text.apply(
            lambda cw: remove_tags(cw, remove_special_chars=True, remove_stopwords=True, remove_newlines=True)
        ),
    )


# --- Load the three splits -------------------------------------------------
data_list = _load_jsonl('NLP_Diplomacy/train.jsonl')
validation_list = _load_jsonl('NLP_Diplomacy/validation.jsonl')
# Fix: the test loop previously appended the last *validation* record for
# every test line, so the test split never contained test data.
test_list = _load_jsonl('NLP_Diplomacy/test.jsonl')

# The original code reset `messages` before each split, so only the last
# (test) split's raw messages remained; that final state is preserved here.
messages = [message for record in test_list for message in record['messages']]

# --- One row per message ---------------------------------------------------
df = pd.DataFrame(data_list)
df_explode = _explode_messages(df)
df_val = _explode_messages(pd.DataFrame(validation_list))
df_test = _explode_messages(pd.DataFrame(test_list))

# --- Drop incomplete rows and add cleaned text -----------------------------
# A second remove_nan_rows pass is unnecessary: the checked columns cannot
# become NaN again after the first pass and the str cast.
df_explode_cleaned = _clean_messages(df_explode)
df_val_cleaned = _clean_messages(df_val)
df_test_cleaned = _clean_messages(df_test)
