In [1]:
import os
import re
import codecs
import string

import emot.core
import unicodedata
import multiprocessing as mp
from multiprocessing import Pool

In [2]:
from bs4 import BeautifulSoup

In [None]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

In [4]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [186]:
import contractions
import emot
from emot import EMOTICONS_EMO
from spellchecker import SpellChecker
from chardet.universaldetector import UniversalDetector

In [6]:
def read_document(filepath, encoding):
    with codecs.open(filepath, 'r', encoding) as file:
        text = file.read()
    return text

In [7]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'lxml')
    text = soup.get_text()
    return text

In [8]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [9]:
def to_lower(text):
    return text.lower()

In [10]:
def remove_accented_characters(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [11]:
remove_accented_characters('Sómě Áccěñtěd téxt')

'Some Accented text'

In [12]:
def expand_contractions(text):
    expanded_text = []
    for word in text.split():
        expanded_text.append(contractions.fix(word))
    expanded_text = ' '.join(expanded_text)
    return expanded_text

In [13]:
print(expand_contractions("ain't"))
print(expand_contractions("Y'all can't expand contractions I'd think"))
print(expand_contractions("can't've"))
print(expand_contractions("you'll've"))

are not
You all cannot expand contractions I would think
cannot have
you shall have


In [14]:
def detect_encoding(filepath):
    detector = UniversalDetector()
    with open(filepath, 'rb') as file:
        for line in file:
            detector.feed(line)
            if detector.done:
                break
    detector.close()

    encoding = detector.result['encoding']
    confidence = detector.result['confidence']

    return encoding, confidence

In [15]:
# Fajl 2624_7 ima emotikone

In [16]:
encoding_, confidence_ = detect_encoding('data/train/pos/2624_7.txt')
print(f'{encoding_} {confidence_}')

ascii 1.0


In [17]:
review_1 = read_document('data/train/pos/2624_7.txt', encoding=encoding_)
print(review_1)

In this film we have the fabulous opportunity to see what happened to Timon and Pumbaa in the film when they are not shown - which is a lot! This film even goes back to before Simba and (presumbably) just after the birth of Kiara. <br /><br />Quite true to the first film, "Lion King 1/2 (or Lion King 3 in other places)" is a funny, entertaining, exciting and surprising film (or sequel if that's what you want to call it). A bundle of surprises and hilarity await for you!<br /><br />While Timon and Pumbaa are watching a film at the cinema (with a remote control), Timon and Pumbaa have an argument of what point of "The Lion King" they are going to start watching, as Timon wants to go to the part when he and Pumbaa come in and Pumbaa wants to go back to the beginning. They have a very fair compromise of watching the film of their own story, which is what awaits... It starts with Timon's first home...<br /><br />For anyone with a good sense of humour who liked the first films of just about 

In [18]:
print(remove_html_tags(review_1))

In this film we have the fabulous opportunity to see what happened to Timon and Pumbaa in the film when they are not shown - which is a lot! This film even goes back to before Simba and (presumbably) just after the birth of Kiara. Quite true to the first film, "Lion King 1/2 (or Lion King 3 in other places)" is a funny, entertaining, exciting and surprising film (or sequel if that's what you want to call it). A bundle of surprises and hilarity await for you!While Timon and Pumbaa are watching a film at the cinema (with a remote control), Timon and Pumbaa have an argument of what point of "The Lion King" they are going to start watching, as Timon wants to go to the part when he and Pumbaa come in and Pumbaa wants to go back to the beginning. They have a very fair compromise of watching the film of their own story, which is what awaits... It starts with Timon's first home...For anyone with a good sense of humour who liked the first films of just about any age, enjoy "Lion King 1/2"! :-)


In [19]:
encoding_2, confidence_2 = detect_encoding('data/train/pos/15_7.txt')
review_2 = read_document('data/train/pos/15_7.txt', encoding=encoding_2)
print(review_2)

I guess if a film has magic, I don't need it to be fluid or seamless. It can skip background information, go too fast in some places, too slow in others, etc. Magic in this film: the scene in the library. There are many minor flaws in Stanley & Iris, yet they don't detract from the overall positive impact of watching people help each other in areas of life that seem the most incomprehensible, the hardest to fix. Both characters are smart. Yet Stanley can't understand enough to function because he can't read; he can't read because he's had too much adventure in his childhood. Iris, although well-educated, hasn't had enough adventure and so can't understand how to move past the U-turn her life took. In both their faults and strengths, the characters compliment each other. It may be a bit of a stretch to accept that an Iris would wind up working year after year in a factory, or that a Stanley never hid his illiteracy enough to work in construction or some other better-paying job. And whil

In [20]:
print(expand_contractions(review_2))

I guess if a film has magic, I do not need it to be fluid or seamless. It can skip background information, go too fast in some places, too slow in others, etc. Magic in this film: the scene in the library. There are many minor flaws in Stanley & Iris, yet they do not detract from the overall positive impact of watching people help each other in areas of life that seem the most incomprehensible, the hardest to fix. Both characters are smart. Yet Stanley cannot understand enough to function because he cannot read; he cannot read because he is had too much adventure in his childhood. Iris, although well-educated, has not had enough adventure and so cannot understand how to move past the YOU-turn her life took. In both their faults and strengths, the characters compliment each other. It may be a bit of a stretch to accept that an Iris would wind up working year after year in a factory, or that a Stanley never hid his illiteracy enough to work in construction or some other better-paying job

In [21]:
print(remove_accented_characters(review_2))

I guess if a film has magic, I don't need it to be fluid or seamless. It can skip background information, go too fast in some places, too slow in others, etc. Magic in this film: the scene in the library. There are many minor flaws in Stanley & Iris, yet they don't detract from the overall positive impact of watching people help each other in areas of life that seem the most incomprehensible, the hardest to fix. Both characters are smart. Yet Stanley can't understand enough to function because he can't read; he can't read because he's had too much adventure in his childhood. Iris, although well-educated, hasn't had enough adventure and so can't understand how to move past the U-turn her life took. In both their faults and strengths, the characters compliment each other. It may be a bit of a stretch to accept that an Iris would wind up working year after year in a factory, or that a Stanley never hid his illiteracy enough to work in construction or some other better-paying job. And whil

In [22]:
def tokenize(text):
    tokens = word_tokenize(text)
    return tokens

In [23]:
review_3 = read_document('data/train/neg/3_4.txt', 'utf-8')
print(review_3)

If I had not read Pat Barker's 'Union Street' before seeing this film, I would have liked it. Unfortuntately this is not the case. It is actually my kind of film, it is well made, and in no way do I want to say otherwise, but as an adaptation, it fails from every angle.<br /><br />The harrowing novel about the reality of living in a northern England working-class area grabbed hold of my heartstrings and refused to let go for weeks after I had finished. I was put through tears, repulsion, shock, anger, sympathy and misery when reading about the women of Union Street. Excellent. A novel that at times I felt I could not read any more of, but I novel I simply couldn't put down. Depressing yes, but utterly gripping.<br /><br />The film. Oh dear. Hollywood took Barker's truth and reality, and showered a layer of sweet icing sugar over the top of it. A beautiful film, an inspiring soundtrack, excellent performances, a tale of hope and romance...yes. An adaptation of 'Union Street'...no.<br />

In [24]:
for token in tokenize(review_2):
    print(token, end=' | ')

I | guess | if | a | film | has | magic | , | I | do | n't | need | it | to | be | fluid | or | seamless | . | It | can | skip | background | information | , | go | too | fast | in | some | places | , | too | slow | in | others | , | etc | . | Magic | in | this | film | : | the | scene | in | the | library | . | There | are | many | minor | flaws | in | Stanley | & | Iris | , | yet | they | do | n't | detract | from | the | overall | positive | impact | of | watching | people | help | each | other | in | areas | of | life | that | seem | the | most | incomprehensible | , | the | hardest | to | fix | . | Both | characters | are | smart | . | Yet | Stanley | ca | n't | understand | enough | to | function | because | he | ca | n't | read | ; | he | ca | n't | read | because | he | 's | had | too | much | adventure | in | his | childhood | . | Iris | , | although | well-educated | , | has | n't | had | enough | adventure | and | so | ca | n't | understand | how | to | move | past | the | U

In [25]:
doc = nlp(review_3)
for entity in doc.ents:
    print(f'{entity} - {entity.label_}')
    print(str(spacy.explain(entity.label_)))
    print('\n')

Pat Barker's ' - PERSON
People, including fictional


Union Street' - FAC
Buildings, airports, highways, bridges, etc.


England - GPE
Countries, cities, states


weeks - DATE
Absolute or relative dates or periods


Union Street - FAC
Buildings, airports, highways, bridges, etc.


Hollywood - GPE
Countries, cities, states


Barker - ORG
Companies, agencies, institutions, etc.


Union Street - FAC
Buildings, airports, highways, bridges, etc.


Fonda - PERSON
People, including fictional


Barker - ORG
Companies, agencies, institutions, etc.


7 - CARDINAL
Numerals that do not fall under another type


William - PERSON
People, including fictional


first - ORDINAL
"first", "second", etc.


Schindler - PERSON
People, including fictional




In [230]:
def remove_special_characters(text, remove_digits=False):
    special_chars_pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    compiled_pattern = re.compile(special_chars_pattern)
    return compiled_pattern.sub(r'', text)

In [228]:
print(review_2)

I guess if a film has magic, I don't need it to be fluid or seamless. It can skip background information, go too fast in some places, too slow in others, etc. Magic in this film: the scene in the library. There are many minor flaws in Stanley & Iris, yet they don't detract from the overall positive impact of watching people help each other in areas of life that seem the most incomprehensible, the hardest to fix. Both characters are smart. Yet Stanley can't understand enough to function because he can't read; he can't read because he's had too much adventure in his childhood. Iris, although well-educated, hasn't had enough adventure and so can't understand how to move past the U-turn her life took. In both their faults and strengths, the characters compliment each other. It may be a bit of a stretch to accept that an Iris would wind up working year after year in a factory, or that a Stanley never hid his illiteracy enough to work in construction or some other better-paying job. And whil

In [229]:
print(remove_special_characters(review_2, remove_digits=True))

I guess if a film has magic  I don t need it to be fluid or seamless  It can skip background information  go too fast in some places  too slow in others  etc  Magic in this film  the scene in the library  There are many minor flaws in Stanley   Iris  yet they don t detract from the overall positive impact of watching people help each other in areas of life that seem the most incomprehensible  the hardest to fix  Both characters are smart  Yet Stanley can t understand enough to function because he can t read  he can t read because he s had too much adventure in his childhood  Iris  although well educated  hasn t had enough adventure and so can t understand how to move past the U turn her life took  In both their faults and strengths  the characters compliment each other  It may be a bit of a stretch to accept that an Iris would wind up working year after year in a factory  or that a Stanley never hid his illiteracy enough to work in construction or some other better paying job  And whil

In [75]:
def correct_spelling(text):
    spell = SpellChecker()

    corrected_text = []
    misspelled_words = spell.unknown(text.split())

    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)

    return ' '.join(corrected_text)

In [83]:
# def emoticon_to_words(text):
#     for emoticon in EMOTICONS_EMO:
#         text = re.sub(u'(' + emoticon + ')', "_".join(EMOTICONS_EMO[emoticon].replace(",", "").split()), text)
#     return text

In [121]:
# def emoticon_to_words(text):
#     for emoticon in EMOTICONS_EMO:
#         text = re.sub(re.escape(emoticon), EMOTICONS_EMO[emoticon], text)
#     return text

In [120]:
expr = '(a^b)'
print(re.escape(expr))

eqn = 'f*(a^b) - 3*(a^b)'
re.sub(expr, 'c', eqn)
print(eqn)
print(re.sub(re.escape(expr), 'c', eqn))

\(a\^b\)
f*(a^b) - 3*(a^b)
f*c - 3*c


In [233]:
def convert_emojis_and_emoticons(text):
    emot_obj = emot.core.emot()
    emoji_info = emot_obj.emoji(text)
    emoticon_info = emot_obj.emoticons(text)

    if emoji_info['flag']:
        for emoji, meaning in zip(emoji_info['value'], emoji_info['mean']):
            meaning = ' '.join(re.split(r'[_-]', meaning))
            text = re.sub(emoji, ' ' + meaning + ' ', text)

    if emoticon_info['flag']:
        for emoticon, meaning in zip(emoticon_info['value'], emoticon_info['mean']):
            text = re.sub(re.escape(emoticon), ' ' + meaning + ' ', text)

    return text

In [219]:
s1 = 'couple_with_heart_man_man_medium-light_skin_tone_dark_skin_tone'
s2 = 'dolphin'
s = ' '.join(re.split(r'[_-]', s1))
s

'couple with heart man man medium light skin tone dark skin tone'

In [182]:
text_with_emoji = read_document('data/train/pos/11572_8.txt', 'utf-8')

In [183]:
print(text_with_emoji)

"In April 1946, the University of Chicago agreed to operate Argonne National Laboratory, with an association of Midwestern universities offering to sponsor the research. Argonne thereby became the first "national" laboratory. It did not, however, remain at its original location in the Argonne forest. In 1947, it moved farther west from the "Windy City" to a new site on Illinois farmland. When Alvin Weinberg visited Argonne's director, Walter Zinn, in 1947, he asked him what kind of reactor was to be built at the new site. When Zinn described a heavy-water reactor operating at one-tenth the power of the Materials Testing Reactor under design at Oak Ridge, Weinberg joked it would be simpler if Zinn took the Oak Ridge design and operated the Materials Testing Reactor at one-tenth capacity. The joke proved unintentionally prophetic."<br /><br />The S-50 plant used convection to separate the isotopes in thousands of tall columns. It was built next to the K-25 power plant, which provided the

In [232]:
emot_obj_ = emot.core.emot()
tmp = emot_obj_.emoticons(review_1)
print(tmp)

{'value': [':-)'], 'location': [[1032, 1035]], 'mean': ['Happy face smiley'], 'flag': True}


In [180]:
s = "hello 😇"
c = '😇'
print(s)

txt = re.sub(c, '', s)
print(txt)

hello 😇
hello 


In [189]:
review_1

'In this film we have the fabulous opportunity to see what happened to Timon and Pumbaa in the film when they are not shown - which is a lot! This film even goes back to before Simba and (presumbably) just after the birth of Kiara. <br /><br />Quite true to the first film, "Lion King 1/2 (or Lion King 3 in other places)" is a funny, entertaining, exciting and surprising film (or sequel if that\'s what you want to call it). A bundle of surprises and hilarity await for you!<br /><br />While Timon and Pumbaa are watching a film at the cinema (with a remote control), Timon and Pumbaa have an argument of what point of "The Lion King" they are going to start watching, as Timon wants to go to the part when he and Pumbaa come in and Pumbaa wants to go back to the beginning. They have a very fair compromise of watching the film of their own story, which is what awaits... It starts with Timon\'s first home...<br /><br />For anyone with a good sense of humour who liked the first films of just abo

In [231]:
print(convert_emojis_and_emoticons(review_1))

In this film we have the fabulous opportunity to see what happened to Timon and Pumbaa in the film when they are not shown - which is a lot! This film even goes back to before Simba and (presumbably) just after the birth of Kiara. <br /><br />Quite true to the first film, "Lion King 1/2 (or Lion King 3 in other places)" is a funny, entertaining, exciting and surprising film (or sequel if that's what you want to call it). A bundle of surprises and hilarity await for you!<br /><br />While Timon and Pumbaa are watching a film at the cinema (with a remote control), Timon and Pumbaa have an argument of what point of "The Lion King" they are going to start watching, as Timon wants to go to the part when he and Pumbaa come in and Pumbaa wants to go back to the beginning. They have a very fair compromise of watching the film of their own story, which is what awaits... It starts with Timon's first home...<br /><br />For anyone with a good sense of humour who liked the first films of just about 