### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

import re

import nltk
import string

---

### Loading Datasets

In [2]:
full_df = pd.read_csv("essays.csv")

text_df = full_df[["TEXT"]]
text_df["TEXT"] = text_df["TEXT"].astype(str)

full_df.head()
text_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_df["TEXT"] = text_df["TEXT"].astype(str)


Unnamed: 0,TEXT
0,"Well, right now I just woke up from a mid-day ..."
1,"Well, here we go with the stream of consciousn..."
2,An open keyboard and buttons to push. The thin...
3,I can't believe it! It's really happening! M...
4,"Well, here I go with the good old stream of co..."


---

### Lower Casing

In [3]:
text_df["Processed Text"] = text_df["TEXT"].str.lower()
text_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_df["Processed Text"] = text_df["TEXT"].str.lower()


Unnamed: 0,TEXT,Processed Text
0,"Well, right now I just woke up from a mid-day ...","well, right now i just woke up from a mid-day ..."
1,"Well, here we go with the stream of consciousn...","well, here we go with the stream of consciousn..."
2,An open keyboard and buttons to push. The thin...,an open keyboard and buttons to push. the thin...
3,I can't believe it! It's really happening! M...,i can't believe it! it's really happening! m...
4,"Well, here I go with the good old stream of co...","well, here i go with the good old stream of co..."


---

### Removing Punctuation

#### Using Regular Expression

In [4]:
def remove_punctuation_re(text):
    return re.sub(r'[^\w\s]', '', text)

text_df["Processed Text"] = text_df["Processed Text"].apply(lambda text: remove_punctuation_re(text))
text_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_df["Processed Text"] = text_df["Processed Text"].apply(lambda text: remove_punctuation_re(text))


Unnamed: 0,TEXT,Processed Text
0,"Well, right now I just woke up from a mid-day ...",well right now i just woke up from a midday na...
1,"Well, here we go with the stream of consciousn...",well here we go with the stream of consciousne...
2,An open keyboard and buttons to push. The thin...,an open keyboard and buttons to push the thing...
3,I can't believe it! It's really happening! M...,i cant believe it its really happening my pu...
4,"Well, here I go with the good old stream of co...",well here i go with the good old stream of con...


---

#### Using String Translate

In [25]:
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation_st(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

text_df["Processed Text"] = text_df["Processed Text"].apply(lambda text: remove_punctuation_st(text))
text_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_df["Processed Text"] = text_df["Processed Text"].apply(lambda text: remove_punctuation_st(text))


Unnamed: 0,TEXT,Processed Text
0,"Well, right now I just woke up from a mid-day ...",well right now i just woke up from a midday na...
1,"Well, here we go with the stream of consciousn...",well here we go with the stream of consciousne...
2,An open keyboard and buttons to push. The thin...,an open keyboard and buttons to push the thing...
3,I can't believe it! It's really happening! M...,i cant believe it its really happening my pu...
4,"Well, here I go with the good old stream of co...",well here i go with the good old stream of con...


In [11]:
# drop the new column created in last cell
text_df.drop(["text_without_punc"], axis=1, inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


---

### Removing Stopwords

In [1]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

print(len(stop_words), "stopwords:", stop_words)

179 stopwords: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 

In [7]:
stopwords = {"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mustn't, needn, needn't, shan, shan't, shouldn, shouldn't, wasn, wasn't, weren, weren't, won, won't, wouldn, wouldn't"}

def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in stop_words])

text_df["Processed Text"] = text_df["Processed Text"].apply(lambda text: remove_stopwords(text))
text_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_df["Processed Text"] = text_df["Processed Text"].apply(lambda text: remove_stopwords(text))


Unnamed: 0,TEXT,Processed Text
0,"Well, right now I just woke up from a mid-day ...",well right woke midday nap sort weird ever sin...
1,"Well, here we go with the stream of consciousn...",well go stream consciousness essay used things...
2,An open keyboard and buttons to push. The thin...,open keyboard buttons push thing finally worke...
3,I can't believe it! It's really happening! M...,cant believe really happening pulse racing lik...
4,"Well, here I go with the good old stream of co...",well go good old stream consciousness assignme...


---

### Lemmatization

In [19]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    #print(pos_tagged_text)
    #return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])
    #for word, pos in pos_tagged_text:
        #print(pos[0])
        #print(wordnet.NOUN)
        #print(wordnet_map.get(pos[0], wordnet.NOUN))
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

text_df["text_lemmatized"] = text_df["Processed Text"].apply(lambda text: lemmatize_words(text))
text_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_df["text_lemmatized"] = text_df["Processed Text"].apply(lambda text: lemmatize_words(text))


Unnamed: 0,TEXT,Processed Text,text_lemmatized
0,"Well, right now I just woke up from a mid-day ...",well right woke midday nap sort weird ever sin...,well right wake midday nap sort weird ever sin...
1,"Well, here we go with the stream of consciousn...",well go stream consciousness essay used things...,well go stream consciousness essay use thing l...
2,An open keyboard and buttons to push. The thin...,open keyboard buttons push thing finally worke...,open keyboard button push thing finally work n...
3,I can't believe it! It's really happening! M...,cant believe really happening pulse racing lik...,cant believe really happen pulse race like mad...
4,"Well, here I go with the good old stream of co...",well go good old stream consciousness assignme...,well go good old stream consciousness assignme...
