In [340]:
import zipfile as zip
import string
import pandas as pd
import re
import chardet
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.stem import PorterStemmer

In [341]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mosta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mosta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mosta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mosta\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [342]:
DATASET_PATH='../data/comments1k.zip'
CORRUPTED_STRING_DROP_THRESHOLD=0.2

In [343]:
def clean(text):
    text = re.sub(r'[\.\,]{2,}', ' ', text)
    return text

def decode(text):
    text = text.decode(chardet.detect(text)['encoding'])
    return text

def standardize(text):
    text = text.lower()
    text = text.strip()
    return text

def is_corrupted(text, threshold=CORRUPTED_STRING_DROP_THRESHOLD):
    printable_chars = set(string.printable)
    non_printable_count = sum(
        1 for char in text if char not in printable_chars)
    total_chars = len(text)
    return (non_printable_count / total_chars) > threshold

def tokenize_core(text):
    stop_words = set(stopwords.words('english'))
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    return [token for token in tokens if token not in stop_words]


def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    

def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    pos_tagged = pos_tag(tokens)  
    return [lemmatizer.lemmatize(token, get_wordnet_pos(tag)).lower() for token, tag in pos_tagged]

def stem_tokens(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

In [344]:
dataset=[]

In [345]:
with zip.ZipFile(DATASET_PATH, 'r') as d:
    for filename in d.namelist():
        if filename.endswith('.txt'):
            with d.open(filename,'r') as file:
                dataset.append(file.read())

In [346]:
df=pd.DataFrame(dataset,columns=['comment'])

In [347]:
df['comment'] = df['comment'].apply(decode)
df['is_corrupted'] = df['comment'].apply(is_corrupted)
df = df.drop_duplicates()
df = df[~df['is_corrupted']]
df['comment'] = df['comment'].apply(clean)

In [348]:
print(df.head(3))

                                             comment  is_corrupted
0  Hello. This movie is well okay. Just kidding! ...         False
2  As Most Off You Might off Seen Star Wars: Retu...         False
4  Pierce Brosnan the newest but no longer James ...         False


In [349]:
df['sentences'] = df['comment'].apply(sent_tokenize)
df.head(3)

Unnamed: 0,comment,is_corrupted,sentences
0,Hello. This movie is well okay. Just kidding! ...,False,"[Hello., This movie is well okay., Just kiddin..."
2,As Most Off You Might off Seen Star Wars: Retu...,False,[As Most Off You Might off Seen Star Wars: Ret...
4,Pierce Brosnan the newest but no longer James ...,False,[Pierce Brosnan the newest but no longer James...


In [350]:
df['sentences_count'] = df['sentences'].apply(len)
df.head(3)

Unnamed: 0,comment,is_corrupted,sentences,sentences_count
0,Hello. This movie is well okay. Just kidding! ...,False,"[Hello., This movie is well okay., Just kiddin...",11
2,As Most Off You Might off Seen Star Wars: Retu...,False,[As Most Off You Might off Seen Star Wars: Ret...,1
4,Pierce Brosnan the newest but no longer James ...,False,[Pierce Brosnan the newest but no longer James...,7


In [351]:
# Average number of sentences per comment
round(df['sentences_count'].mean())

11

In [352]:
df['tokens'] = df['comment'].apply(word_tokenize)
df['tokens_count'] = df['tokens'].apply(len)
df.head(3)

Unnamed: 0,comment,is_corrupted,sentences,sentences_count,tokens,tokens_count
0,Hello. This movie is well okay. Just kidding! ...,False,"[Hello., This movie is well okay., Just kiddin...",11,"[Hello, ., This, movie, is, well, okay, ., Jus...",63
2,As Most Off You Might off Seen Star Wars: Retu...,False,[As Most Off You Might off Seen Star Wars: Ret...,1,"[As, Most, Off, You, Might, off, Seen, Star, W...",202
4,Pierce Brosnan the newest but no longer James ...,False,[Pierce Brosnan the newest but no longer James...,7,"[Pierce, Brosnan, the, newest, but, no, longer...",139


In [353]:
# Average number of tokens per comment
round(df['tokens_count'].mean())

282

In [354]:
df['core_tokens'] = df['comment'].apply(tokenize_core)
df['core_tokens_count'] = df['core_tokens'].apply(len)
df.head(3)

Unnamed: 0,comment,is_corrupted,sentences,sentences_count,tokens,tokens_count,core_tokens,core_tokens_count
0,Hello. This movie is well okay. Just kidding! ...,False,"[Hello., This movie is well okay., Just kiddin...",11,"[Hello, ., This, movie, is, well, okay, ., Jus...",63,"[Hello, This, movie, well, okay, Just, kidding...",33
2,As Most Off You Might off Seen Star Wars: Retu...,False,[As Most Off You Might off Seen Star Wars: Ret...,1,"[As, Most, Off, You, Might, off, Seen, Star, W...",202,"[As, Most, Off, You, Might, Seen, Star, Wars, ...",155
4,Pierce Brosnan the newest but no longer James ...,False,[Pierce Brosnan the newest but no longer James...,7,"[Pierce, Brosnan, the, newest, but, no, longer...",139,"[Pierce, Brosnan, newest, longer, James, Bond,...",69


In [355]:
# Average number of core tokens per comment
round(df['core_tokens_count'].mean())

136

In [356]:
df['lemmas'] = df['core_tokens'].apply(lemmatize_tokens)
df.head(3)

Unnamed: 0,comment,is_corrupted,sentences,sentences_count,tokens,tokens_count,core_tokens,core_tokens_count,lemmas
0,Hello. This movie is well okay. Just kidding! ...,False,"[Hello., This movie is well okay., Just kiddin...",11,"[Hello, ., This, movie, is, well, okay, ., Jus...",63,"[Hello, This, movie, well, okay, Just, kidding...",33,"[hello, this, movie, well, okay, just, kid, it..."
2,As Most Off You Might off Seen Star Wars: Retu...,False,[As Most Off You Might off Seen Star Wars: Ret...,1,"[As, Most, Off, You, Might, off, Seen, Star, W...",202,"[As, Most, Off, You, Might, Seen, Star, Wars, ...",155,"[as, most, off, you, might, seen, star, wars, ..."
4,Pierce Brosnan the newest but no longer James ...,False,[Pierce Brosnan the newest but no longer James...,7,"[Pierce, Brosnan, the, newest, but, no, longer...",139,"[Pierce, Brosnan, newest, longer, James, Bond,...",69,"[pierce, brosnan, newest, longer, james, bond,..."


In [357]:
df['stems'] = df['core_tokens'].apply(stem_tokens)
df.head(3)

Unnamed: 0,comment,is_corrupted,sentences,sentences_count,tokens,tokens_count,core_tokens,core_tokens_count,lemmas,stems
0,Hello. This movie is well okay. Just kidding! ...,False,"[Hello., This movie is well okay., Just kiddin...",11,"[Hello, ., This, movie, is, well, okay, ., Jus...",63,"[Hello, This, movie, well, okay, Just, kidding...",33,"[hello, this, movie, well, okay, just, kid, it...","[hello, thi, movi, well, okay, just, kid, it, ..."
2,As Most Off You Might off Seen Star Wars: Retu...,False,[As Most Off You Might off Seen Star Wars: Ret...,1,"[As, Most, Off, You, Might, off, Seen, Star, W...",202,"[As, Most, Off, You, Might, Seen, Star, Wars, ...",155,"[as, most, off, you, might, seen, star, wars, ...","[as, most, off, you, might, seen, star, war, r..."
4,Pierce Brosnan the newest but no longer James ...,False,[Pierce Brosnan the newest but no longer James...,7,"[Pierce, Brosnan, the, newest, but, no, longer...",139,"[Pierce, Brosnan, newest, longer, James, Bond,...",69,"[pierce, brosnan, newest, longer, james, bond,...","[pierc, brosnan, newest, longer, jame, bond, a..."


In [358]:
df[['lemmas','stems']].head(3)

Unnamed: 0,lemmas,stems
0,"[hello, this, movie, well, okay, just, kid, it...","[hello, thi, movi, well, okay, just, kid, it, ..."
2,"[as, most, off, you, might, seen, star, wars, ...","[as, most, off, you, might, seen, star, war, r..."
4,"[pierce, brosnan, newest, longer, james, bond,...","[pierc, brosnan, newest, longer, jame, bond, a..."


In [359]:
def find_lemmas_stems_intersection(row):
    return list(set(row['lemmas']) & set(row['stems']))

In [360]:
df['inner_lemmas_stems'] = df.apply(find_lemmas_stems_intersection, axis=1)
df[['lemmas', 'stems', 'inner_lemmas_stems']].head(3)

Unnamed: 0,lemmas,stems,inner_lemmas_stems
0,"[hello, this, movie, well, okay, just, kid, it...","[hello, thi, movi, well, okay, just, kid, it, ...","[me, and, just, for, not, but, kid, hello, we,..."
2,"[as, most, off, you, might, seen, star, wars, ...","[as, most, off, you, might, seen, star, war, r...","[and, tv, when, name, ben, just, are, most, st..."
4,"[pierce, brosnan, newest, longer, james, bond,...","[pierc, brosnan, newest, longer, jame, bond, a...","[drawn, see, get, now, notch, begin, event, fi..."


In [361]:
def unique_to_lemmas(row):
    return list(set(row['lemmas']) - set(row['stems']))


def unique_to_stems(row):
    return list(set(row['stems']) - set(row['lemmas']))

In [362]:
df['outer_lemmas'] = df.apply(unique_to_lemmas, axis=1)
df['outer_stems'] = df.apply(unique_to_stems, axis=1)
df[['lemmas', 'stems', 'inner_lemmas_stems',
    'outer_lemmas', 'outer_stems']].head(3)

Unnamed: 0,lemmas,stems,inner_lemmas_stems,outer_lemmas,outer_stems
0,"[hello, this, movie, well, okay, just, kid, it...","[hello, thi, movi, well, okay, just, kid, it, ...","[me, and, just, for, not, but, kid, hello, we,...","[this, ramones, movie, awesome, rollers, its]","[roller, it, ramon, awesom, thi, movi]"
2,"[as, most, off, you, might, seen, star, wars, ...","[as, most, off, you, might, seen, star, war, r...","[and, tv, when, name, ben, just, are, most, st...","[spirits, little, anything, get, producers, kn...","[it, youngl, ill, origin, smile, got, movi, li..."
4,"[pierce, brosnan, newest, longer, james, bond,...","[pierc, brosnan, newest, longer, jame, bond, a...","[drawn, see, get, now, notch, begin, event, fi...","[funny, think, separate, pierce, james, really...","[separ, funni, pierc, hilari, togeth, live, pe..."
