In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/europarl-es/europarl-v7.es-en.es
/kaggle/input/jrc-es/JRC.txt


# DATASETS

In [2]:
!pip install datasets



# LOAD SPACY

In [3]:
!python -m spacy download es_core_news_sm

Collecting es-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')


In [4]:
import spacy
from tqdm import tqdm
from spacy.lang.es.stop_words import STOP_WORDS
import time

nlp = spacy.load("es_core_news_sm")

# TOKENIZATION

In [5]:
!pip install tqdm



In [6]:
import re
import string

# CLEAN SENTENCES
def clean_sentence(sent):
    # StopWords
    new_sentence = [token.text for token in sent if not token.is_stop]
    # Digitos
    new_sentence = [w for w in new_sentence if not w.isdigit()]
    new_sentence = [re.sub(r'[0-9]', '', w) for w in new_sentence]
    # puntuacion y vacíos
    new_sentence = [re.sub("\!|\'|\?|\¿|\¡|\«|\»|\—", "", w) for w in new_sentence]
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    new_sentence = [re_punc.sub('', w) for w in new_sentence]
    new_sentence = [w for w in new_sentence if w.strip()]
    return new_sentence

In [7]:
from tqdm import tqdm

europarl_file_path="/kaggle/input/europarl-es/europarl-v7.es-en.es"
jrc_file_path="/kaggle/input/jrc-es/JRC.txt"

def unique_words_sentences():
    watched_words = set()  # Conjunto para palabras ya vistas
    result = []  # Lista de listas para oraciones únicas
    # Procesar europarl
    with open(europarl_file_path, 'r', encoding='utf-8') as europar_file:
        num_lines_europarl = 100#sum(1 for _ in europar_file)
    with open(europarl_file_path, 'r', encoding='utf-8') as europar_file:
        for i, line in enumerate(tqdm(europar_file, desc="Procesando europarl", total=num_lines_europarl)):
            if i > num_lines_europarl:
                break
            sent_tokens = nlp(line.strip())  # Procesar la línea con spacy
            clean_sent = clean_sentence(sent_tokens)  # Limpiar los tokens
            sublist = []
            for word in clean_sent:
                if word not in watched_words:  # Añadir solo si no ha sido vista
                    watched_words.add(word)
                    sublist.append(word)
            if sublist:  # Solo añadir si la sublista no está vacía
                result.append(sublist)

    # Procesar jrc
    with open(jrc_file_path, 'r', encoding='utf-8') as jrc_file:
        num_lines_jrc = 100#sum(1 for _ in jrc_file)
    with open(jrc_file_path, 'r', encoding='utf-8') as jrc_file:
        for i, line in enumerate(tqdm(jrc_file, desc="Procesando jrc", total=num_lines_jrc)):
            if i > num_lines_jrc:
                break
            sent_tokens = nlp(line.strip())  # Procesar la línea con spacy
            clean_sent = clean_sentence(sent_tokens)  # Limpiar los tokens
            sublist = []
            for word in clean_sent:
                if word not in watched_words:  # Añadir solo si no ha sido vista
                    watched_words.add(word)
                    sublist.append(word)
            if sublist:  # Solo añadir si la sublista no está vacía
                result.append(sublist)
                
    return result

# FASTTEXT

In [8]:
from gensim.models import FastText
from gensim.test.utils import common_texts  # some example sentences
from gensim.models.word2vec import Word2Vec
import os
import sys
import multiprocessing
import pickle
from gensim.models import FastText
num_features = [300]                      #Dimensionality of the resulting word vectors
min_word_count = 1
sg=1                      #Minimum word count threshold
num_workers = multiprocessing.cpu_count() #Number of threads to run in parallel
context_size = 5                          #Context window length
seed = 1
for p in num_features:
    fasttext_model = FastText(
        sentences=unique_words_sentences(),
        vector_size=300,
        window=context_size,
        min_count=min_word_count,
        workers=num_workers,
        sg=1                              #skipgram
    )

    fasttext_model.wv.save_word2vec_format('/kaggle/working/Europarl_fasttext_skip_model11_' + str(p) +  '.txt', binary=False)

Procesando europarl: 101it [00:01, 84.88it/s]                        
Procesando jrc: 101it [00:00, 106.73it/s]                        
