### Part 1. Basic Regular Expressions

In [1]:
import re

# Sample sentence used by the regex exercises: it embeds two license plate
# numbers, an ID, a street address, and an email address.
sample = "I am 20 years old. My previous license plate number was 4XUI302 and my new one is 3ABC278. My ID is J987492 and my address is 123 Main street, San Jose, CA. Please email me at myemail123+spam@google.com"

In [2]:
# Extract structured tokens from `sample` with regular expressions.
# Every pattern is a raw string so that regex escapes such as \d and \s reach
# the regex engine instead of being parsed as (invalid) Python string escapes.

# One digit followed by six uppercase letters/digits. Defined once and shared
# by findall and sub so the two calls cannot drift apart.
license_pattern = r"[0-9][A-Z0-9]{6}"
license = re.findall(license_pattern, sample)
print("License plates: ",license)

# One uppercase letter followed by six digits.
IDs = re.findall(r"[A-Z][0-9]{6}", sample)
print("IDs: ",IDs)

# Local part, "@", then one or more dot-separated domain labels. The domain is
# matched label by label (with an escaped dot) rather than with a trailing
# ".*", so any text that follows the address on the same line is not captured.
email = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+", sample)
print("Email addresses: ",email)

# "<number> <word> <word>, <city word(s)>, <two-letter state>".
# [A-Za-z] is spelled out because the range [A-z] also matches the ASCII
# characters between "Z" and "a" ([, \, ], ^, _ and `).
mailing = re.findall(r"\d+\s[A-Za-z]+\s[A-Za-z]+,\s[A-Za-z]+\s*[A-Za-z]*,\s[A-Z]{2}", sample)
print("Mailing addresses: ",mailing)

# Mask every license plate in the sentence with a placeholder token.
license_sub = re.sub(license_pattern, "LP_NUM", sample)
print("Replaced sentence: \n", license_sub)

License plates:  ['4XUI302', '3ABC278']
IDs:  ['J987492']
Email addresses:  ['myemail123+spam@google.com']
Mailing addresses:  ['123 Main street, San Jose, CA']
Replaced sentence: 
 I am 20 years old. My previous license plate number was LP_NUM and my new one is LP_NUM. My ID is J987492 and my address is 123 Main street, San Jose, CA. Please email me at myemail123+spam@google.com


### Part 2. Basic Text Operations
#### Load the dataset

In [3]:
import nltk
from nltk.corpus import movie_reviews
# Download the movie_reviews corpus into the local nltk_data directory
# (the logged output shows it is a no-op when the package is already up to date).
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to C:\Users\Do-
[nltk_data]     While\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

#### Remove punctuation from corpus

In [4]:
import string
# Delete every punctuation character from each corpus token, then discard the
# tokens that end up empty (i.e. tokens that consisted only of punctuation).
punct_remover = str.maketrans('', '', string.punctuation)
no_punct = [token.translate(punct_remover) for token in movie_reviews.words()]
new_list = [token for token in no_punct if token]
new_list

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 'drink',
 'and',
 'then',
 'drive',
 'they',
 'get',
 'into',
 'an',
 'accident',
 'one',
 'of',
 'the',
 'guys',
 'dies',
 'but',
 'his',
 'girlfriend',
 'continues',
 'to',
 'see',
 'him',
 'in',
 'her',
 'life',
 'and',
 'has',
 'nightmares',
 'what',
 's',
 'the',
 'deal',
 'watch',
 'the',
 'movie',
 'and',
 'sorta',
 'find',
 'out',
 'critique',
 'a',
 'mind',
 'fuck',
 'movie',
 'for',
 'the',
 'teen',
 'generation',
 'that',
 'touches',
 'on',
 'a',
 'very',
 'cool',
 'idea',
 'but',
 'presents',
 'it',
 'in',
 'a',
 'very',
 'bad',
 'package',
 'which',
 'is',
 'what',
 'makes',
 'this',
 'review',
 'an',
 'even',
 'harder',
 'one',
 'to',
 'write',
 'since',
 'i',
 'generally',
 'applaud',
 'films',
 'which',
 'attempt',
 'to',
 'break',
 'the',
 'mold',
 'mess',
 'with',
 'your',
 'head',
 'and',
 'such',
 'lost',
 'highway',
 'memento',
 'but',
 'there',
 'are',
 'good',
 'and',
 'bad',
 'ways',

#### Number of unique words in corpus

In [5]:
# Collapse the cleaned token list into its distinct words and report how many
# there are.
unique_items = set(new_list)
unique_count = len(unique_items)
print("Number of unique words: ", unique_count)

Number of unique words:  39451


#### Top 20 most frequent words in corpus

In [6]:
# Tally how often each cleaned token occurs and show the 20 highest counts.
freq_words = nltk.FreqDist(new_list)
top_words = freq_words.most_common(20)
print("Top 20 most frequent words: ", top_words)

Top 20 most frequent words:  [('the', 76565), ('a', 38107), ('and', 35580), ('of', 34123), ('to', 31938), ('is', 25203), ('in', 21825), ('s', 18514), ('it', 16109), ('that', 15927), ('as', 11378), ('with', 10792), ('for', 9961), ('his', 9588), ('this', 9579), ('film', 9519), ('i', 8889), ('he', 8864), ('but', 8635), ('on', 7385)]


#### Remove stop words from corpus

In [7]:
# Drop English stop words from the punctuation-free token list.
from nltk.corpus import stopwords
nltk.download('stopwords')
# Held as a set so each membership test in the filter below is a hash lookup.
stop_words = set(stopwords.words('english'))

no_stop = [token for token in new_list if token not in stop_words]
no_stop

[nltk_data] Downloading package stopwords to C:\Users\Do-
[nltk_data]     While\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mind',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 'presents',
 'bad',
 'package',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'head',
 'lost',
 'highway',
 'memento',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 'folks',
 'snag',
 'one',
 'correctly',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 'executed',
 'terribly',
 'problems',
 'movie',
 'well',
 'main',
 'problem',
 'simply',
 'jumbled',
 'starts',
 'normal',
 'downshifts',
 'fantasy',
 'world',
 'audience',
 'member',
 'idea',
 'going',
 'dreams',
 'characters',
 'coming',
 'back',
 'dead',
 'others',
 'look',
 'like',
 'dead

#### Unique word count after stop word removal

In [8]:
# Count the distinct words that remain once stop words have been removed.
unique_items = set(no_stop)
unique_count = len(unique_items)
print("Number of unique words without stop words: ", unique_count)

Number of unique words without stop words:  39300


#### Top 20 most frequent words after stop word removal

In [9]:
# Rebuild the frequency table from the stop-word-free tokens and show the 20
# highest counts. Note that this rebinds freq_words.
freq_words = nltk.FreqDist(no_stop)
top_words = freq_words.most_common(20)
print("Top 20 most frequent words with stop word removal: ", top_words)

Top 20 most frequent words with stop word removal:  [('film', 9519), ('one', 5853), ('movie', 5774), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2170), ('would', 2110), ('much', 2050), ('character', 2020), ('also', 1967), ('get', 1949), ('two', 1912), ('well', 1906), ('characters', 1859), ('first', 1836), ('see', 1749), ('way', 1693), ('make', 1642)]


#### First 20 words used only once in the corpus

In [10]:
# Words that occur exactly once in the current frequency table (hapaxes);
# only the first 20 are printed.
once_words = freq_words.hapaxes()
first_once_words = once_words[:20]
print("20 words used only once: ", first_once_words)

20 words used only once:  ['looooot', 'schnazzy', 'timex', 'indiglo', 'jessalyn', 'gilsig', 'ruber', 'jaleel', 'balki', 'wavers', 'statistics', 'snapshot', 'guesswork', 'maryam', 'daylights', 'terraformed', 'stagnated', 'napolean', 'millimeter', 'enmeshed']


### Part 3. Perform the following operations

#### Unique words after stemming

In [11]:
# Operates on the altered corpus (punctuation and stop words already removed):
# stem every token with the Porter stemmer, then count the distinct stems.
from nltk.stem import PorterStemmer
porter = PorterStemmer()
stemmed_list = [porter.stem(token) for token in no_stop]
unique_items = set(stemmed_list)
print("Number of unique words after stemming: ", len(unique_items))

Number of unique words after stemming:  25815


#### Unique words after lemmatization

In [12]:
# Lemmatize every stop-word-free token with WordNet, then count the distinct
# lemmas. lemmatize() is called with the word only (no part-of-speech tag).
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()
lem_list = [wordnet_lemmatizer.lemmatize(token) for token in no_stop]
unique_items = set(lem_list)
print("Number of unique words after lemmatization: ", len(unique_items))

[nltk_data] Downloading package wordnet to C:\Users\Do-
[nltk_data]     While\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Number of unique words after lemmatization:  34886


In [13]:
# Display the lemmatized tokens for inspection.
lem_list

['plot',
 'two',
 'teen',
 'couple',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guy',
 'dy',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmare',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mind',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touch',
 'cool',
 'idea',
 'present',
 'bad',
 'package',
 'make',
 'review',
 'even',
 'harder',
 'one',
 'write',
 'since',
 'generally',
 'applaud',
 'film',
 'attempt',
 'break',
 'mold',
 'mess',
 'head',
 'lost',
 'highway',
 'memento',
 'good',
 'bad',
 'way',
 'making',
 'type',
 'film',
 'folk',
 'snag',
 'one',
 'correctly',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 'executed',
 'terribly',
 'problem',
 'movie',
 'well',
 'main',
 'problem',
 'simply',
 'jumbled',
 'start',
 'normal',
 'downshift',
 'fantasy',
 'world',
 'audience',
 'member',
 'idea',
 'going',
 'dream',
 'character',
 'coming',
 'back',
 'dead',
 'others',
 'look',
 'like',
 'dead',
 'strange',
 'ap