In [1]:
import os
import re
import codecs
import string

import emot.core
import unicodedata
import multiprocessing as mp
from multiprocessing import Pool

In [2]:
from bs4 import BeautifulSoup
import json

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Darko\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Darko\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Darko\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Darko\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [5]:
import contractions
import emot
from emot import EMOTICONS_EMO
from spellchecker import SpellChecker
from chardet.universaldetector import UniversalDetector

In [99]:
def read_document(filepath, encoding='utf-8'):
    with codecs.open(filepath, 'r', encoding) as file:
        text = file.read()
    return text

In [7]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'lxml')
    text = soup.get_text()
    return text

In [8]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [9]:
def to_lower(text):
    return text.lower()

In [10]:
def remove_accented_characters(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [11]:
remove_accented_characters('Sómě Áccěñtěd téxt')

'Some Accented text'

In [12]:
def expand_contractions(text):
    expanded_text = []
    for word in text.split():
        expanded_text.append(contractions.fix(word))
    expanded_text = ' '.join(expanded_text)
    return expanded_text

In [13]:
print(expand_contractions("ain't"))
print(expand_contractions("Y'all can't expand contractions I'd think"))
print(expand_contractions("can't've"))
print(expand_contractions("you'll've"))

are not
You all cannot expand contractions I would think
cannot have
you shall have


In [14]:
def detect_encoding(filepath):
    detector = UniversalDetector()
    with open(filepath, 'rb') as file:
        for line in file:
            detector.feed(line)
            if detector.done:
                break
    detector.close()

    encoding = detector.result['encoding']
    confidence = detector.result['confidence']

    return encoding, confidence

In [15]:
# Fajl 2624_7 ima emotikone

In [16]:
encoding_, confidence_ = detect_encoding('data/train/pos/2624_7.txt')
print(f'{encoding_} {confidence_}')

ascii 1.0


In [17]:
review_1 = read_document('data/train/pos/2624_7.txt', encoding=encoding_)
print(review_1)

In this film we have the fabulous opportunity to see what happened to Timon and Pumbaa in the film when they are not shown - which is a lot! This film even goes back to before Simba and (presumbably) just after the birth of Kiara. <br /><br />Quite true to the first film, "Lion King 1/2 (or Lion King 3 in other places)" is a funny, entertaining, exciting and surprising film (or sequel if that's what you want to call it). A bundle of surprises and hilarity await for you!<br /><br />While Timon and Pumbaa are watching a film at the cinema (with a remote control), Timon and Pumbaa have an argument of what point of "The Lion King" they are going to start watching, as Timon wants to go to the part when he and Pumbaa come in and Pumbaa wants to go back to the beginning. They have a very fair compromise of watching the film of their own story, which is what awaits... It starts with Timon's first home...<br /><br />For anyone with a good sense of humour who liked the first films of just about 

In [18]:
print(remove_html_tags(review_1))

In this film we have the fabulous opportunity to see what happened to Timon and Pumbaa in the film when they are not shown - which is a lot! This film even goes back to before Simba and (presumbably) just after the birth of Kiara. Quite true to the first film, "Lion King 1/2 (or Lion King 3 in other places)" is a funny, entertaining, exciting and surprising film (or sequel if that's what you want to call it). A bundle of surprises and hilarity await for you!While Timon and Pumbaa are watching a film at the cinema (with a remote control), Timon and Pumbaa have an argument of what point of "The Lion King" they are going to start watching, as Timon wants to go to the part when he and Pumbaa come in and Pumbaa wants to go back to the beginning. They have a very fair compromise of watching the film of their own story, which is what awaits... It starts with Timon's first home...For anyone with a good sense of humour who liked the first films of just about any age, enjoy "Lion King 1/2"! :-)


In [19]:
encoding_2, confidence_2 = detect_encoding('data/train/pos/15_7.txt')
review_2 = read_document('data/train/pos/15_7.txt', encoding=encoding_2)
print(review_2)

I guess if a film has magic, I don't need it to be fluid or seamless. It can skip background information, go too fast in some places, too slow in others, etc. Magic in this film: the scene in the library. There are many minor flaws in Stanley & Iris, yet they don't detract from the overall positive impact of watching people help each other in areas of life that seem the most incomprehensible, the hardest to fix. Both characters are smart. Yet Stanley can't understand enough to function because he can't read; he can't read because he's had too much adventure in his childhood. Iris, although well-educated, hasn't had enough adventure and so can't understand how to move past the U-turn her life took. In both their faults and strengths, the characters compliment each other. It may be a bit of a stretch to accept that an Iris would wind up working year after year in a factory, or that a Stanley never hid his illiteracy enough to work in construction or some other better-paying job. And whil

In [20]:
print(expand_contractions(review_2))

I guess if a film has magic, I do not need it to be fluid or seamless. It can skip background information, go too fast in some places, too slow in others, etc. Magic in this film: the scene in the library. There are many minor flaws in Stanley & Iris, yet they do not detract from the overall positive impact of watching people help each other in areas of life that seem the most incomprehensible, the hardest to fix. Both characters are smart. Yet Stanley cannot understand enough to function because he cannot read; he cannot read because he is had too much adventure in his childhood. Iris, although well-educated, has not had enough adventure and so cannot understand how to move past the YOU-turn her life took. In both their faults and strengths, the characters compliment each other. It may be a bit of a stretch to accept that an Iris would wind up working year after year in a factory, or that a Stanley never hid his illiteracy enough to work in construction or some other better-paying job

In [21]:
print(remove_accented_characters(review_2))

I guess if a film has magic, I don't need it to be fluid or seamless. It can skip background information, go too fast in some places, too slow in others, etc. Magic in this film: the scene in the library. There are many minor flaws in Stanley & Iris, yet they don't detract from the overall positive impact of watching people help each other in areas of life that seem the most incomprehensible, the hardest to fix. Both characters are smart. Yet Stanley can't understand enough to function because he can't read; he can't read because he's had too much adventure in his childhood. Iris, although well-educated, hasn't had enough adventure and so can't understand how to move past the U-turn her life took. In both their faults and strengths, the characters compliment each other. It may be a bit of a stretch to accept that an Iris would wind up working year after year in a factory, or that a Stanley never hid his illiteracy enough to work in construction or some other better-paying job. And whil

In [22]:
def tokenize(text):
    tokens = word_tokenize(text)
    return tokens

In [23]:
review_3 = read_document('data/train/neg/3_4.txt', 'utf-8')
print(review_3)

If I had not read Pat Barker's 'Union Street' before seeing this film, I would have liked it. Unfortuntately this is not the case. It is actually my kind of film, it is well made, and in no way do I want to say otherwise, but as an adaptation, it fails from every angle.<br /><br />The harrowing novel about the reality of living in a northern England working-class area grabbed hold of my heartstrings and refused to let go for weeks after I had finished. I was put through tears, repulsion, shock, anger, sympathy and misery when reading about the women of Union Street. Excellent. A novel that at times I felt I could not read any more of, but I novel I simply couldn't put down. Depressing yes, but utterly gripping.<br /><br />The film. Oh dear. Hollywood took Barker's truth and reality, and showered a layer of sweet icing sugar over the top of it. A beautiful film, an inspiring soundtrack, excellent performances, a tale of hope and romance...yes. An adaptation of 'Union Street'...no.<br />

In [24]:
for token in tokenize(review_2):
    print(token, end=' | ')

I | guess | if | a | film | has | magic | , | I | do | n't | need | it | to | be | fluid | or | seamless | . | It | can | skip | background | information | , | go | too | fast | in | some | places | , | too | slow | in | others | , | etc | . | Magic | in | this | film | : | the | scene | in | the | library | . | There | are | many | minor | flaws | in | Stanley | & | Iris | , | yet | they | do | n't | detract | from | the | overall | positive | impact | of | watching | people | help | each | other | in | areas | of | life | that | seem | the | most | incomprehensible | , | the | hardest | to | fix | . | Both | characters | are | smart | . | Yet | Stanley | ca | n't | understand | enough | to | function | because | he | ca | n't | read | ; | he | ca | n't | read | because | he | 's | had | too | much | adventure | in | his | childhood | . | Iris | , | although | well-educated | , | has | n't | had | enough | adventure | and | so | ca | n't | understand | how | to | move | past | the | U

In [25]:
doc = nlp(review_3)
for entity in doc.ents:
    print(f'{entity} - {entity.label_}')
    print(str(spacy.explain(entity.label_)))
    print('\n')

Pat Barker's ' - PERSON
People, including fictional


Union Street' - FAC
Buildings, airports, highways, bridges, etc.


England - GPE
Countries, cities, states


weeks - DATE
Absolute or relative dates or periods


Union Street - FAC
Buildings, airports, highways, bridges, etc.


Hollywood - GPE
Countries, cities, states


Barker - ORG
Companies, agencies, institutions, etc.


Union Street - FAC
Buildings, airports, highways, bridges, etc.


Fonda - PERSON
People, including fictional


Barker - ORG
Companies, agencies, institutions, etc.


7 - CARDINAL
Numerals that do not fall under another type


William - PERSON
People, including fictional


first - ORDINAL
"first", "second", etc.


Schindler - PERSON
People, including fictional




In [26]:
def remove_special_characters(text, remove_digits=False):
    special_chars_pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    compiled_pattern = re.compile(special_chars_pattern)
    return compiled_pattern.sub(r'', text)

In [27]:
print(review_2)

I guess if a film has magic, I don't need it to be fluid or seamless. It can skip background information, go too fast in some places, too slow in others, etc. Magic in this film: the scene in the library. There are many minor flaws in Stanley & Iris, yet they don't detract from the overall positive impact of watching people help each other in areas of life that seem the most incomprehensible, the hardest to fix. Both characters are smart. Yet Stanley can't understand enough to function because he can't read; he can't read because he's had too much adventure in his childhood. Iris, although well-educated, hasn't had enough adventure and so can't understand how to move past the U-turn her life took. In both their faults and strengths, the characters compliment each other. It may be a bit of a stretch to accept that an Iris would wind up working year after year in a factory, or that a Stanley never hid his illiteracy enough to work in construction or some other better-paying job. And whil

In [28]:
print(remove_special_characters(review_2, remove_digits=True))

I guess if a film has magic I dont need it to be fluid or seamless It can skip background information go too fast in some places too slow in others etc Magic in this film the scene in the library There are many minor flaws in Stanley  Iris yet they dont detract from the overall positive impact of watching people help each other in areas of life that seem the most incomprehensible the hardest to fix Both characters are smart Yet Stanley cant understand enough to function because he cant read he cant read because hes had too much adventure in his childhood Iris although welleducated hasnt had enough adventure and so cant understand how to move past the Uturn her life took In both their faults and strengths the characters compliment each other It may be a bit of a stretch to accept that an Iris would wind up working year after year in a factory or that a Stanley never hid his illiteracy enough to work in construction or some other betterpaying job And while these mysteries are explained i

In [29]:
def correct_spelling(text):
    spell = SpellChecker()

    corrected_text = []
    misspelled_words = spell.unknown(text.split())

    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)

    return ' '.join(corrected_text)

In [30]:
# def emoticon_to_words(text):
#     for emoticon in EMOTICONS_EMO:
#         text = re.sub(u'(' + emoticon + ')', "_".join(EMOTICONS_EMO[emoticon].replace(",", "").split()), text)
#     return text

In [31]:
# def emoticon_to_words(text):
#     for emoticon in EMOTICONS_EMO:
#         text = re.sub(re.escape(emoticon), EMOTICONS_EMO[emoticon], text)
#     return text

In [32]:
expr = '(a^b)'
print(re.escape(expr))

eqn = 'f*(a^b) - 3*(a^b)'
re.sub(expr, 'c', eqn)
print(eqn)
print(re.sub(re.escape(expr), 'c', eqn))

\(a\^b\)
f*(a^b) - 3*(a^b)
f*c - 3*c


In [33]:
def convert_emojis_and_emoticons(text):
    emot_obj = emot.core.emot()
    emoji_info = emot_obj.emoji(text)
    emoticon_info = emot_obj.emoticons(text)

    if emoji_info['flag']:
        for emoji, meaning in zip(emoji_info['value'], emoji_info['mean']):
            meaning = ' '.join(re.split(r'[_-]', meaning))
            text = re.sub(emoji, ' ' + meaning + ' ', text)

    if emoticon_info['flag']:
        for emoticon, meaning in zip(emoticon_info['value'], emoticon_info['mean']):
            text = re.sub(re.escape(emoticon), ' ' + meaning + ' ', text)

    return text

In [34]:
s1 = 'couple_with_heart_man_man_medium-light_skin_tone_dark_skin_tone'
s2 = 'dolphin'
s = ' '.join(re.split(r'[_-]', s1))
s

'couple with heart man man medium light skin tone dark skin tone'

In [35]:
text_with_emoji = read_document('data/train/pos/11572_8.txt', 'utf-8')

In [36]:
print(text_with_emoji)

"In April 1946, the University of Chicago agreed to operate Argonne National Laboratory, with an association of Midwestern universities offering to sponsor the research. Argonne thereby became the first "national" laboratory. It did not, however, remain at its original location in the Argonne forest. In 1947, it moved farther west from the "Windy City" to a new site on Illinois farmland. When Alvin Weinberg visited Argonne's director, Walter Zinn, in 1947, he asked him what kind of reactor was to be built at the new site. When Zinn described a heavy-water reactor operating at one-tenth the power of the Materials Testing Reactor under design at Oak Ridge, Weinberg joked it would be simpler if Zinn took the Oak Ridge design and operated the Materials Testing Reactor at one-tenth capacity. The joke proved unintentionally prophetic."<br /><br />The S-50 plant used convection to separate the isotopes in thousands of tall columns. It was built next to the K-25 power plant, which provided the

In [37]:
emot_obj_ = emot.core.emot()
tmp = emot_obj_.emoticons(review_1)
print(tmp)

{'value': [':-)'], 'location': [[1032, 1035]], 'mean': ['Happy face smiley'], 'flag': True}


In [38]:
s = "hello 😇"
c = '😇'
print(s)

txt = re.sub(c, '', s)
print(txt)

hello 😇
hello 


In [39]:
review_1

'In this film we have the fabulous opportunity to see what happened to Timon and Pumbaa in the film when they are not shown - which is a lot! This film even goes back to before Simba and (presumbably) just after the birth of Kiara. <br /><br />Quite true to the first film, "Lion King 1/2 (or Lion King 3 in other places)" is a funny, entertaining, exciting and surprising film (or sequel if that\'s what you want to call it). A bundle of surprises and hilarity await for you!<br /><br />While Timon and Pumbaa are watching a film at the cinema (with a remote control), Timon and Pumbaa have an argument of what point of "The Lion King" they are going to start watching, as Timon wants to go to the part when he and Pumbaa come in and Pumbaa wants to go back to the beginning. They have a very fair compromise of watching the film of their own story, which is what awaits... It starts with Timon\'s first home...<br /><br />For anyone with a good sense of humour who liked the first films of just abo

In [40]:
print(convert_emojis_and_emoticons(review_1))

In this film we have the fabulous opportunity to see what happened to Timon and Pumbaa in the film when they are not shown - which is a lot! This film even goes back to before Simba and (presumbably) just after the birth of Kiara. <br /><br />Quite true to the first film, "Lion King 1/2 (or Lion King 3 in other places)" is a funny, entertaining, exciting and surprising film (or sequel if that's what you want to call it). A bundle of surprises and hilarity await for you!<br /><br />While Timon and Pumbaa are watching a film at the cinema (with a remote control), Timon and Pumbaa have an argument of what point of "The Lion King" they are going to start watching, as Timon wants to go to the part when he and Pumbaa come in and Pumbaa wants to go back to the beginning. They have a very fair compromise of watching the film of their own story, which is what awaits... It starts with Timon's first home...<br /><br />For anyone with a good sense of humour who liked the first films of just about 

STOPWORDS

In [41]:
from nltk.corpus import stopwords as stopwords
stopwords_nltk = stopwords.words('english')
print(stopwords_nltk)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [42]:
stopwords_spacy = nlp.Defaults.stop_words
print(sorted(stopwords_spacy))

["'d", "'ll", "'m", "'re", "'s", "'ve", 'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'both', 'bottom', 'but', 'by', 'ca', 'call', 'can', 'cannot', 'could', 'did', 'do', 'does', 'doing', 'done', 'down', 'due', 'during', 'each', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'has', 'have', 'he', 'hence', 'her', 'here', 'he

In [43]:
def remove_stopwords(text, stopword_list):
    tokens = tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [44]:
review_4 = read_document('data/train/neg/0_3.txt', 'utf-8')
print(review_4)

Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.


In [45]:
t1 = remove_stopwords(review_4.lower(), stopwords_nltk)
print(t1)

story man unnatural feelings pig . starts opening scene terrific example absurd comedy . formal orchestra audience turned insane , violent mob crazy chantings 's singers . unfortunately stays absurd whole time general narrative eventually making putting . even era turned . cryptic dialogue would make shakespeare seem easy third grader . technical level 's better might think good cinematography future great vilmos zsigmond . future stars sally kirkland frederic forrest seen briefly .


In [46]:
t2 = remove_stopwords(review_4.lower(), stopwords_spacy)
print(t2)

story man unnatural feelings pig . starts opening scene terrific example absurd comedy . formal orchestra audience turned insane , violent mob crazy chantings singers . unfortunately stays absurd time general narrative eventually making putting . era turned . cryptic dialogue shakespeare easy grader . technical level better think good cinematography future great vilmos zsigmond . future stars sally kirkland frederic forrest seen briefly .


In [47]:
print(t1 == t2)

False


STEMMING

In [48]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer

In [49]:
ps = PorterStemmer()
ls = LancasterStemmer()
ss = SnowballStemmer('english')

In [50]:
stemmers = [ps, ls, ss]
words = ['cared', 'university', 'fairly', 'easily', 'singing', 'sings', 'sung', 'singer', 'sportingly', 'program',
         'programming', 'programmer', 'programs', 'programmed']

In [51]:
print("{0:20}{1:20}".format("--Word--", "--Stem--"))
for word in words:
    for stemmer in stemmers:
        print("{0:20}{1:20}".format(word, stemmer.stem(word)))
    print("--------------------------------")

--Word--            --Stem--            
cared               care                
cared               car                 
cared               care                
--------------------------------
university          univers             
university          univers             
university          univers             
--------------------------------
fairly              fairli              
fairly              fair                
fairly              fair                
--------------------------------
easily              easili              
easily              easy                
easily              easili              
--------------------------------
singing             sing                
singing             sing                
singing             sing                
--------------------------------
sings               sing                
sings               sing                
sings               sing                
--------------------------------
sung                sun

In [98]:
def stem_text(text, stemmer='ss'):
    stemmer_map = {
        'ps': PorterStemmer(),
        'ls': LancasterStemmer(),
        'ss': SnowballStemmer('english')
    }

    if stemmer not in stemmer_map:
        raise ValueError(f"Invalid stemmer: '{stemmer}'. Choose from 'ps', 'ls', or 'ss'.")

    selected_stemmer = stemmer_map[stemmer]
    tokenized_text = word_tokenize(text)
    return ' '.join([selected_stemmer.stem(word) for word in tokenized_text])

In [95]:
print(review_4)

Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.


In [97]:
stem_text(to_lower(review_4))

"stori of a man who has unnatur feel for a pig . start out with a open scene that is a terrif exampl of absurd comedi . a formal orchestra audienc is turn into an insan , violent mob by the crazi chant of it 's singer . unfortun it stay absurd the whole time with no general narrat eventu make it just too off put . even those from the era should be turn off . the cryptic dialogu would make shakespear seem easi to a third grader . on a technic level it 's better than you might think with some good cinematographi by futur great vilmo zsigmond . futur star salli kirkland and freder forrest can be seen briefli ."

LEMMATIZATION

In [52]:
from nltk.stem import WordNetLemmatizer

In [53]:
wnl = WordNetLemmatizer()

In [54]:
print("{0:20}{1:20}".format("--Word--", "--Lemma--"))
for word in words:
    print("{0:20}{1:20}".format(word, wnl.lemmatize(word)))

--Word--            --Lemma--           
cared               cared               
university          university          
fairly              fairly              
easily              easily              
singing             singing             
sings               sings               
sung                sung                
singer              singer              
sportingly          sportingly          
program             program             
programming         programming         
programmer          programmer          
programs            program             
programmed          programmed          


In [55]:
def lemmatize(text):
    text = nlp(text)
    lemmatized_text = ' '.join([word.lemma_ for word in text])
    return lemmatized_text

In [56]:
print(review_4)

Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.


In [57]:
print(lemmatize(review_4))

story of a man who have unnatural feeling for a pig . start out with a opening scene that be a terrific example of absurd comedy . a formal orchestra audience be turn into an insane , violent mob by the crazy chanting of it 's singer . unfortunately it stay absurd the WHOLE time with no general narrative eventually make it just too off putting . even those from the era should be turn off . the cryptic dialogue would make Shakespeare seem easy to a third grader . on a technical level it be well than you might think with some good cinematography by future great Vilmos Zsigmond . future star Sally Kirkland and Frederic Forrest can be see briefly .


In [58]:
# Mora se prvo ocistiti tekst da bi se primenila ova funkcija
# Pogledati izlaz dole
def remove_repeated_characters(text):
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
    match_substitution = r'\1\2\3'
    def replace(old_word):
        if wordnet.synsets(old_word):
            return old_word
        new_word = repeat_pattern.sub(match_substitution, old_word)
        return replace(new_word) if new_word != old_word else new_word

    tokenized_text = tokenize(text)
    correct_tokens = [replace(word) for word in tokenized_text]
    return ' '.join(correct_tokens)

In [59]:
review_5 = read_document('data/train/neg/259_3.txt', 'utf-8')
print(review_5)

photography was too jumpy to follow. dark scenes hard to see.<br /><br />Had good story line too bad it got lost somewhere. Too noisy for what was really happening Bottom line is it's a baddddd movie


In [60]:
print(remove_repeated_characters(review_5))

photography was too jumpy to follow . dark scenes hard to se. < br / > < br / > Had good story line too bad it got lost somewhere . Too noisy for what was really happening Bottom line is it 's a bad movie


In [61]:
tokenize(review_5)

['photography',
 'was',
 'too',
 'jumpy',
 'to',
 'follow',
 '.',
 'dark',
 'scenes',
 'hard',
 'to',
 'see.',
 '<',
 'br',
 '/',
 '>',
 '<',
 'br',
 '/',
 '>',
 'Had',
 'good',
 'story',
 'line',
 'too',
 'bad',
 'it',
 'got',
 'lost',
 'somewhere',
 '.',
 'Too',
 'noisy',
 'for',
 'what',
 'was',
 'really',
 'happening',
 'Bottom',
 'line',
 'is',
 'it',
 "'s",
 'a',
 'baddddd',
 'movie']

In [62]:
remove_repeated_characters('dark scenes hard to see.<br /><br />')

'dark scenes hard to se. < br / > < br / >'

In [63]:
remove_repeated_characters('Aaron')

'Aaron'

In [64]:
remove_repeated_characters('eel')

'eel'

In [65]:
def convert_slang(text):
    with open('slangs.json', 'r') as file:
        slangs = json.load(file)

    new_text = []
    for word in tokenize(text):
        if word in slangs:
            new_text.append(slangs[word])
        else:
            new_text.append(word)

    return ' '.join(new_text)


In [66]:
convert_slang('lmao, wdyt abt that film? it is so gr8! just film 2d4')

'laughing my ass off , what do you think? about that film ? it is so great ! just film to die for'

In [67]:
import pandas as pd

In [68]:
# Ubio dve muve jednim udarcem: resio se duplikata i lakse
# cu pripremiti podatke za dalju analizu/obradu
def corpus_to_csv(root_directory):
    labels = {'pos': 1, 'neg': 0}
    reviews = {}

    for s in ('test', 'train'):
        for l in ('pos', 'neg'):
            path = os.path.join(root_directory, s, l)
            for file in os.listdir(path):
                text = read_document(os.path.join(path, file), encoding='utf-8')
                reviews[text] = labels[l]

    df = pd.DataFrame(list(reviews.items()))
    df = df.sample(frac=1, random_state=42, ignore_index=True)
    df.columns = ['review', 'sentiment']
    df.to_csv('imdb_movie_data.csv', index=False, encoding='utf-8')

In [69]:
%%time
corpus_to_csv('./data')

CPU times: total: 49.2 s
Wall time: 5min 19s


In [126]:
df_ = pd.read_csv('imdb_movie_data.csv')

In [127]:
df_

Unnamed: 0,review,sentiment
0,Due to reading bad reviews and being told by f...,1
1,The funniest scene of this movie is probably w...,0
2,Do not bother to waste your money on this movi...,0
3,If you like bad movies (and you must to watch ...,0
4,This is one of those games where you love it t...,1
...,...,...
49577,"Yes, this film has many gay characters. It als...",1
49578,The film is pretty confusing and ludicrous. Th...,0
49579,"This movie is like real life, by which I mean ...",0
49580,Oliver Stone hits the bull's eye with this fil...,1


In [131]:
df_.describe()

Unnamed: 0,sentiment
count,49582.0
mean,0.501876
std,0.500002
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [132]:
df_['sentiment'].isnull().count()

49582