[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/NahuelCostaCortez/InteligeciaNegocio/blob/main/scrapping%2Btokenization.ipynb)

In [None]:
import random

## Obtener datos

1.   Elegir fuente de datos (sitio web)
2.   Acceder a la URL del sitio web y descargar todo el contenido HTML
3.   Formatear el contenido descargado en un formato legible



In [None]:
from bs4 import BeautifulSoup
import requests

# Request the page and download its HTML content.
url = 'https://vas3k.com/blog/machine_learning/#scroll20'
# requests has no default timeout; bound the wait so this cell cannot hang forever.
req = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page further down.
req.raise_for_status()
content = req.text

In [None]:
# Show only the beginning of the downloaded HTML; the full document is too long to read as cell output.
content[:1000]

'\n<!DOCTYPE html>\n<html>\n    <head>\n        <title>\n    Machine Learning for Everyone :: In simple words. With real-world examples. Yes, again :: vas3k.com\n</title>\n        <meta charset="UTF-8"/>\n        <meta name="description" content="Blog about survival in the world of technology and all this cyberpunk around"/>\n        <meta name="keywords" content=""/>\n        <meta name="author" content=""/>\n        <meta name="viewport" content="width=device-width, height=device-height, initial-scale=1.0" />\n        \n    <meta name="robots" content="index, follow">\n    <meta property="author" content="vas3k">\n    <meta property="article:publisher" content="https://vas3k.com">\n    <meta property="article:author" content="https://vas3k.com">\n    <meta property="article:published_time" content="2018-11-21 16:23:29">\n\n    <meta property="og:type" content="article">\n    <meta property="og:title" content="Machine Learning for Everyone" />\n    <meta property="og:description" cont

## Preprocesar datos

Formatear el contenido descargado en un formato legible

In [None]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the parse tree can differ between machines.
soup = BeautifulSoup(content, "html.parser")

# Collect every <div class="block-text"> element.
# find_all is the current name of the deprecated findAll alias.
raw = soup.find_all("div", class_="block-text")

In [None]:
# Keep only the text of each selected block, dropping the HTML markup
posts = [block.text for block in raw]

In [None]:
# Pick the sample post with a dedicated, seeded generator so that every run of
# the notebook works on the same text and the later cells stay reproducible.
# Change POST_SEED to explore a different post.
POST_SEED = 42
selected_post = random.Random(POST_SEED).choice(posts)
selected_post

"\nIf you are too lazy for long reads, take a look at the picture below to get some understanding.\n\n\n\nAlways important to remember — there is never a sole way to solve a problem in the machine learning world. There are always several algorithms that fit, and you have to choose which one fits better. Everything can be solved with a neural network, of course, but who will pay for all these GeForces?\nLet's start with a basic overview. Nowadays there are four main directions in machine learning.\n\n\n\n"

## Tokenizar los datos

In [None]:
import re
import nltk
from nltk.tokenize import WordPunctTokenizer
# Download the stop word corpus read through nltk.corpus.stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE(review): wordnet is not used in the cells visible here; presumably a later step needs it (confirm)
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Tokenize the selected post; punctuation marks come out as separate tokens
word_punct_tokenizer = WordPunctTokenizer()
tokenized_post = word_punct_tokenizer.tokenize(selected_post)
tokenized_post

['If',
 'you',
 'are',
 'too',
 'lazy',
 'for',
 'long',
 'reads',
 ',',
 'take',
 'a',
 'look',
 'at',
 'the',
 'picture',
 'below',
 'to',
 'get',
 'some',
 'understanding',
 '.',
 'Always',
 'important',
 'to',
 'remember',
 '—',
 'there',
 'is',
 'never',
 'a',
 'sole',
 'way',
 'to',
 'solve',
 'a',
 'problem',
 'in',
 'the',
 'machine',
 'learning',
 'world',
 '.',
 'There',
 'are',
 'always',
 'several',
 'algorithms',
 'that',
 'fit',
 ',',
 'and',
 'you',
 'have',
 'to',
 'choose',
 'which',
 'one',
 'fits',
 'better',
 '.',
 'Everything',
 'can',
 'be',
 'solved',
 'with',
 'a',
 'neural',
 'network',
 ',',
 'of',
 'course',
 ',',
 'but',
 'who',
 'will',
 'pay',
 'for',
 'all',
 'these',
 'GeForces',
 '?',
 'Let',
 "'",
 's',
 'start',
 'with',
 'a',
 'basic',
 'overview',
 '.',
 'Nowadays',
 'there',
 'are',
 'four',
 'main',
 'directions',
 'in',
 'machine',
 'learning',
 '.']

## Normalizar los datos

In [None]:
# Drop tokens that are not words
def clear_tokens(tokens, min_length=2):
  """Normalise raw tokens and keep only the ones that look like words.

  Each token is lower-cased and stripped of every character that is not an
  ASCII letter (digits, punctuation, whitespace and non-ASCII characters are
  removed, not transliterated). The cleaned token is kept only if it has at
  least ``min_length`` letters and contains at least one of the vowels
  "aeiou" ("y" does not count as a vowel).

  Args:
    tokens: iterable of strings, e.g. the output of a tokenizer.
    min_length: minimum number of letters a cleaned token needs in order to
      be kept. The default of 2 discards single letters.

  Returns:
    A new list with the cleaned tokens, in their original order.
  """
  clean_tokens = []
  for token in tokens:
    # Remove everything that is not an alphabetic ASCII character
    word = re.sub(r'[^a-zA-Z]+', '', token.lower())
    # Too short (this also covers tokens that became empty after cleaning)
    if len(word) < min_length:
      continue
    # Drop tokens made only of consonants
    if any(letter in "aeiou" for letter in word):
      clean_tokens.append(word)
  return clean_tokens

In [None]:
# Normalise the tokenized post with clear_tokens and display the result
post = clear_tokens(tokenized_post)
post

['if',
 'you',
 'are',
 'too',
 'lazy',
 'for',
 'long',
 'reads',
 'take',
 'look',
 'at',
 'the',
 'picture',
 'below',
 'to',
 'get',
 'some',
 'understanding',
 'always',
 'important',
 'to',
 'remember',
 'there',
 'is',
 'never',
 'sole',
 'way',
 'to',
 'solve',
 'problem',
 'in',
 'the',
 'machine',
 'learning',
 'world',
 'there',
 'are',
 'always',
 'several',
 'algorithms',
 'that',
 'fit',
 'and',
 'you',
 'have',
 'to',
 'choose',
 'which',
 'one',
 'fits',
 'better',
 'everything',
 'can',
 'be',
 'solved',
 'with',
 'neural',
 'network',
 'of',
 'course',
 'but',
 'who',
 'will',
 'pay',
 'for',
 'all',
 'these',
 'geforces',
 'let',
 'start',
 'with',
 'basic',
 'overview',
 'nowadays',
 'there',
 'are',
 'four',
 'main',
 'directions',
 'in',
 'machine',
 'learning']

## Eliminar stop words

In [None]:
# Get the list of English stop words
stop_words = stopwords.words('english')
# Add more words to the list
stop_words.extend(["could","though","would","also","many",'much'])
print(stop_words)
# Remove the stop words from the token list. Membership is tested against a
# set so each lookup does not scan the whole list; stop_words itself stays a
# list for any later code that expects one.
stop_word_set = set(stop_words)
tokens = [word for word in post if word not in stop_word_set]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# Display the tokens that remain after removing the stop words
tokens

['lazy',
 'long',
 'reads',
 'take',
 'look',
 'picture',
 'get',
 'understanding',
 'always',
 'important',
 'remember',
 'never',
 'sole',
 'way',
 'solve',
 'problem',
 'machine',
 'learning',
 'world',
 'always',
 'several',
 'algorithms',
 'fit',
 'choose',
 'one',
 'fits',
 'better',
 'everything',
 'solved',
 'neural',
 'network',
 'course',
 'pay',
 'geforces',
 'let',
 'start',
 'basic',
 'overview',
 'nowadays',
 'four',
 'main',
 'directions',
 'machine',
 'learning']