In [None]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from pprint import pprint
import os
import nltk
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

## Скачиваем wordnet и omw

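`wordnet` нужен, чтобы на его основе строить леммы (`WordNetLemmatizer`).

`omw-1.4` (Open Multilingual Wordnet) — дополнительные данные, которые в новых версиях NLTK требуются для работы с корпусом WordNet. Стеммеру `SnowballStemmer` скачанные корпуса не нужны.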

In [None]:
# Fetch the NLTK data packages this notebook relies on, in the original order.
for resource_name in ('wordnet', 'omw-1.4'):
    nltk.download(resource_name)

# Shared instances: an English Snowball stemmer and a WordNet lemmatizer.
stemmer = SnowballStemmer(language="english")
lemmatizer = WordNetLemmatizer()

## Регулярки

Эти регулярные выражения используются для разбиения текста на предложения и на токены.

In [None]:
import re
# Regex fragments used to build the patterns for sentence splitting and
# tokenizing. Raw strings keep backslash classes such as `\s` intact without
# triggering Python's "invalid escape sequence" warning.
alphabets = r"([A-Za-z])"  # one ASCII letter
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"  # title abbreviation followed by a period
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"  # abbreviations that may end a name
# Words that may open the next sentence (most require trailing whitespace).
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"  # two or three dotted capitals
websites = r"[.](com|net|org|io|gov|me|edu|ru|ua)"  # period + domain suffix
digits = r"([0-9])"  # one decimal digit
word = r"([A-Za-z0-9][A-Za-z0-9]*)"  # one or more ASCII letters/digits
number = r"([1-9][0-9]*)"  # digit run that does not start with 0


## Бьём на предложения (sentences)

Суть в том, чтобы разбить текст на предложения по знакам конца предложения (`.`, `?`, `!`).

Внутри предложения могут встречаться и другие точки, например в адресе электронной почты. Такие точки нужно экранировать, заменив их на маркер вроде `<dot>`, а в самом конце вернуть точки обратно через `replace`.
У меня в коде роль `<dot>` играет `<prd>`.

In [None]:
def split_into_sentences(text):
    """Split raw message text into a list of sentences.

    Periods that do not end a sentence (e-mail addresses, titles such as
    ``Dr.``, domain suffixes, decimals, initials, acronyms, ellipses) are
    temporarily rewritten to a ``<prd>`` marker. Every remaining ``.``, ``?``
    and ``!`` then gets a ``<stop>`` marker appended, the ``<prd>`` markers are
    turned back into periods and the text is cut at the ``<stop>`` markers.

    Relies on the module-level pattern fragments ``word``, ``prefixes``,
    ``websites``, ``digits``, ``alphabets``, ``acronyms``, ``starters`` and
    ``suffixes``.

    Args:
        text: Text to split (str). Newlines, including newlines followed by
            up to four ``>`` quote markers, are flattened to single spaces.

    Returns:
        list[str]: Whitespace-stripped sentences in order of appearance.
        Trailing text without terminal punctuation is returned as the last
        sentence; empty fragments are omitted.
    """
    text = " " + text + "  "

    # Flatten quoted-reply prefixes and line breaks (deepest quoting first so
    # that all ">" characters of a marker are consumed).
    for quote_marker in ("\n>>>>", "\n>>>", "\n>>", "\n>", "\n"):
        text = text.replace(quote_marker, " ")

    # Protect the dots inside bracketed ids / e-mail addresses.
    text = re.sub(f"<{word}[.]{word}[.]{word}[.]{word}@{word}[.]{word}[.]{word}>", "<\\1<prd>\\2<prd>\\3<prd>\\4@\\5<prd>\\6<prd>\\7>", text)
    text = re.sub(f"<{word}[.]{word}@{word}[.]{word}[.]{word}>", "<\\1<prd>\\2@\\3<prd>\\4<prd>\\5>", text)
    text = re.sub(f"{word}@{word}[.]{word}[.]{word}", "\\1@\\2<prd>\\3<prd>\\4", text)

    # Protect dots after titles, before domain suffixes and between digits.
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)
    text = text.replace("...", "<prd><prd><prd>")
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")

    # Single-letter initials, acronyms and company/name suffixes.
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)

    # Move terminal punctuation outside a closing quote so the quote stays
    # attached to its sentence.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Mark sentence ends, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    # Only the fragment after the last <stop> can be empty; dropping empty
    # fragments (instead of always dropping the last one) keeps trailing text
    # that has no terminal punctuation.
    fragments = (fragment.strip() for fragment in text.split("<stop>"))
    return [fragment for fragment in fragments if fragment]

# Try the splitter on a quoted Usenet reply that mixes nested ">" quote
# markers, e-mail addresses and ellipses; the result is the cell output.
sample_post = """
In article <1993Mar25.161909.8110@wuecl.wustl.edu> dp@cec1.wustl.edu (David Prutchi) writes:
>In article <C4CntG.Jv4@spk.hp.com> long@spk.hp.com (Jerry Long) writes:
>>Fred W. Culpepper (fculpepp@norfolk.vak12ed.edu) wrote:
>>[...]
>>A couple of years ago I put together a Tesla circuit which
>>was published in an electronics magazine and could have been
>>the circuit which is referred to here. This one used a
>>flyback transformer from a tv onto which you wound your own
>>primary windings. It also used 2 power transistors in a TO 3
>[...]
>10 years ago I built a 1'000,000 volt Tesla, and the thing was VERY
>spectacular, but besides scaring/amazing friends (depending on their
>knowledge of Science), and generating strong EMI, I never found anything
>useful that could be done with it ...  Is there any real-world application
>for Tesla coils today ?
>
>David Prutchi
"""
split_into_sentences(sample_post)

## Функция для сплита письма на заголовок и тело письма

In [None]:
def split_mail(text):
    """Separate a message into its header and its body.

    The cut is made at the first blank line (``"\\n\\n"``).

    Args:
        text: Full message text (str).

    Returns:
        tuple[str, str]: ``(head, body)``. ``head`` is everything before the
        first blank line; ``body`` is everything after it, later blank lines
        included. ``body`` is ``""`` when there is no blank line.
    """
    header, _separator, remainder = text.partition("\n\n")
    return header, remainder

## Пример письма

In [None]:
# Read one sample message; the context manager closes the file handle as soon
# as the text has been read.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm that it suits the corpus files.
with open("20news-bydate-train/sci.electronics/52434", "r") as mail_file:
    mail = mail_file.read()
print(mail)


## Результат сплита

In [None]:
# Cut the sample message at its first blank line: `head` gets the text before
# it, `body` everything after it.
head, body = split_mail(mail)

In [None]:
# Show the header part of the sample message as the cell output.
head

In [None]:
# Show the body part of the sample message as the cell output.
body

In [None]:
# Split the body of the sample message into sentences.
items = split_into_sentences(body)
# The list is shown through the cell output below rather than being printed
# item by item.

items

## Бьём на токены

`tokenize` принимает на вход предложение и внутри него ищет токены.

Сначала снова ищем email и т. п. и заменяем в них точки и запятые на `<dot>` и `<coma>`.


После того как их зареплейсили, можно бить по точкам, запятым и прочему мусору.

После разбиения внутри каждого токена заменяем `<dot>` обратно на точку, а `<coma>` — обратно на запятую.

В конце функции считаем стемму по токену, по стемме считаем лемму

Функция с `interact` рисует слайдер, чтобы можно было листать предложения и смотреть результат разбиения в интерактиве.

In [None]:
from re import S


# os.listdir returns names in arbitrary order; sorting makes `files[0]` refer
# to the same message on every run.
files = sorted(os.listdir("20news-bydate-train/alt.atheism/"))
file = 0
print(files[file])
# Read the selected message and close the handle right away.
with open("20news-bydate-train/alt.atheism/" + files[file], "r") as mail_file:
    mail = mail_file.read()
head, body = split_mail(mail)

def tokenize(sentence):
    """Split one sentence into tokens and attach a stem and a lemma to each.

    Dots and commas that belong inside a token (numbers, e-mail addresses,
    ellipses) are first replaced with ``<dot>`` / ``<coma>`` placeholders and
    token boundaries are marked with ``<split>``. Once punctuation has been
    detached from neighbouring words, the placeholders are restored and the
    text is cut at the ``<split>`` markers.

    Relies on the module-level names ``word``, ``number``, ``stemmer`` and
    ``lemmatizer``.

    Args:
        sentence: A single sentence (str), e.g. one item returned by
            ``split_into_sentences``.

    Returns:
        list[tuple[str, str, str]]: ``(token, stem, lemma)`` triples in order
        of appearance; the lemma is computed from the stem. Empty tokens are
        skipped.
    """
    # TODO: emoticons could be supported too, but they would all have to be
    #       collected into a dictionary first.
    # TODO: times such as 22:11 could be handled as well.

    # Detach quote markers and parentheses from the word they touch.
    sentence = re.sub(f">{word}", "><split>\\1", sentence)
    sentence = re.sub(f"\\({word}", "(<split>\\1", sentence)
    sentence = re.sub(f"{word}\\)", "\\1<split>)", sentence)

    # Expand contractions into two tokens. The "'d" rule has no leading "$"
    # (a regex end-of-string anchor, which would make it unmatchable) and ends
    # on a word boundary so it does not fire inside longer words.
    sentence = re.sub("I'm", "I<split>am", sentence)
    sentence = re.sub(rf"{word}'d\b", "\\1<split>would", sentence)

    # Protect separators inside numbers. The three-part patterns run before
    # the two-part ones: otherwise the shorter pattern rewrites the first
    # separator and the longer pattern can never match.
    sentence = re.sub(f"{number},{number},{number}", "\\1<coma>\\2<coma>\\3", sentence)
    # The apostrophe is kept literal here, so the punctuation loop below still
    # detaches it.
    sentence = re.sub(f"{number}'{number},{number}", "\\1'\\2<coma>\\3", sentence)
    sentence = re.sub(f"{number},{number}", "\\1<coma>\\2", sentence)
    sentence = re.sub(f"{number}[.]{number}[.]{number}", "\\1<dot>\\2<dot>\\3", sentence)
    sentence = re.sub(f"{number}[.]{number}", "\\1<dot>\\2", sentence)

    # Protect the dots inside e-mail addresses, most specific shape first.
    sentence = re.sub(f"{word}[.]{word}[.]{word}@{word}[.]{word}[.]{word}", "<split>\\1<dot>\\2<dot>\\3@\\4<dot>\\5<dot>\\6<split>", sentence)
    sentence = re.sub(f"<{word}[.]{word}@{word}[.]{word}[.]{word}>", "<split>\\1<dot>\\2@\\3<dot>\\4<dot>\\5<split>", sentence)
    sentence = re.sub(f"{word}@{word}[.]{word}[.]{word}", "\\1@\\2<dot>\\3<dot>\\4<split>", sentence)
    sentence = re.sub(f"<{word}@{word}[.]{word}>", "<<split>\\1@\\2<dot>\\3<split>><split>", sentence)
    sentence = re.sub(f"{word}@{word}[.]{word}", "\\1@\\2<dot>\\3<split>", sentence)

    # Keep an ellipsis together, then cut at spaces.
    sentence = sentence.replace("...", "<dot><dot><dot>")
    sentence = sentence.replace(" ", "<split>")

    # Detach the remaining punctuation marks from adjacent words.
    for mark in [".", ",", ":", ";", "?", "!", '"', "'", "/", "*", "$"]:
        sentence = re.sub(f"\\{mark}{word}", f"{mark}<split>\\1", sentence)
        sentence = re.sub(f"{word}\\{mark}", f"\\1<split>{mark}", sentence)

    # Restore the protected characters.
    sentence = sentence.replace("<dot>", ".").replace("<coma>", ",")

    triples = []
    for token in sentence.split("<split>"):
        if not token:
            continue
        stem = stemmer.stem(token)  # computed once, reused for the lemma
        triples.append((token, stem, lemmatizer.lemmatize(stem)))
    return triples

# Split the sample body once and let the callback read this snapshot. The
# global `body` is rebound by later cells, so reading it inside the callback
# could leave the slider range out of step with the sentence list.
demo_sentences = split_into_sentences(body)
# Clamp at 0 so the slider range stays valid (min <= max) when no sentence
# was found.
maxI = max(len(demo_sentences) - 1, 0)

@interact
def test(i=widgets.IntSlider(min=0,max=maxI,step=1,value=0)):
    """Pretty-print sentence number ``i`` of the sample body and its tokens.

    Args:
        i: Index into the sentence list, driven by the slider.
    """
    if not demo_sentences:
        print("No sentences found in the sample message.")
        return
    sentence = demo_sentences[i]
    pprint(sentence)
    pprint(tokenize(sentence))





## Бьём на токены все файлы в директории

In [None]:
# Tokenize every message of the training set and write one TSV per message to
# out/train/<newsgroup>/<message>.tsv (columns: token, stem, lemma; a blank
# line closes each sentence).
# os.listdir returns names in arbitrary order; sorting gives a stable order.
folders = sorted(os.listdir("20news-bydate-train/"))
print(folders)

for folder in folders:
    files = sorted(os.listdir(f"20news-bydate-train/{folder}/"))

    print(folder)

    # Create the output directory once per newsgroup instead of checking for
    # it before every file.
    os.makedirs(f"out/train/{folder}", exist_ok=True)

    for file in files:
        # NOTE(review): no encoding is passed, so the platform default is
        # used for reading and writing — confirm that it suits the corpus.
        with open(f"20news-bydate-train/{folder}/{file}", "r") as mail_file:
            mail = mail_file.read()
        head, body = split_mail(mail)

        # `with` closes the output even if tokenizing raises.
        with open(f"out/train/{folder}/{file}.tsv", "w") as out:
            # Header first, then body, exactly the same processing for both.
            for part in (head, body):
                for sentence in split_into_sentences(part):
                    for token, stem, lem in tokenize(sentence):
                        out.write(f"{token}\t{stem}\t{lem}\n")
                    out.write("\n")

In [None]:
# Tokenize every message of the training set and write one TSV per newsgroup
# to out/train/<newsgroup>.tsv (columns: token, stem, lemma; a blank line
# closes each sentence).
# os.listdir returns names in arbitrary order; sorting makes the order in
# which messages are appended to each TSV stable.
folders = sorted(os.listdir("20news-bydate-train/"))
print(folders)

for folder in folders:
    files = sorted(os.listdir(f"20news-bydate-train/{folder}/"))

    print(folder)

    # Create the directory before opening the TSV: this also creates
    # out/train itself, without which the open() below fails.
    os.makedirs(f"out/train/{folder}", exist_ok=True)

    # `with` closes the output even if reading or tokenizing raises.
    with open(f"out/train/{folder}.tsv", "w") as out:
        for file in files:
            # NOTE(review): no encoding is passed, so the platform default is
            # used for reading and writing — confirm that it suits the corpus.
            with open(f"20news-bydate-train/{folder}/{file}", "r") as mail_file:
                mail = mail_file.read()
            head, body = split_mail(mail)

            # Header first, then body, exactly the same processing for both.
            for part in (head, body):
                for sentence in split_into_sentences(part):
                    for token, stem, lem in tokenize(sentence):
                        out.write(f"{token}\t{stem}\t{lem}\n")
                    out.write("\n")