In [33]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import pos_tag
from nltk.corpus import wordnet

# Ensure NLTK components are available
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/elloyd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/elloyd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/elloyd/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/elloyd/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/elloyd/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [34]:
with open("NLP.txt", "r") as file:
    raw_text = file.read()

In [35]:
# Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
punct_table = str.maketrans('', '', string.punctuation)

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default to noun

def preprocess(text):
    tokens = word_tokenize(text)
    print(f"Tokens: {tokens}")  # Debugging line

    tokens = [t.lower().translate(punct_table) for t in tokens if t.isalnum()]
    print(f"Tokens after punctuation removal: {tokens}")  #
    
    tokens = [t for t in tokens if t not in stop_words]
    print(f"Tokens after stopword removal: {tokens}")  # Debugging line

    pos_tags = pos_tag(tokens)
    print(f"POS tags: {pos_tags}")

    tokens = [lemmatizer.lemmatize(t, get_wordnet_pos(tag)) for t, tag in pos_tags]
    print(f"Tokens after lemmatization: {tokens}")  # Debugging line
    print("")

    return ' '.join(tokens)

In [36]:
sentences = raw_text.strip().split('\n')
df = pd.DataFrame(sentences, columns=['original'])
df['cleaned'] = df['original'].apply(preprocess)


Tokens: ['Until', 'recently', ',', 'the', 'conventional', 'wisdom', 'was', 'that', 'while', 'AI', 'was', 'better', 'than', 'humans', 'at', 'data-driven', 'decision', 'making', 'tasks', ',', 'it', 'was', 'still', 'inferior', 'to', 'humans', 'for', 'cognitive', 'and', 'creative', 'ones', '.', 'But', 'in', 'the', 'past', 'two', 'years', 'language-based', 'AI', 'has', 'advanced', 'by', 'leaps', 'and', 'bounds', ',', 'changing', 'common', 'notions', 'of', 'what', 'this', 'technology', 'can', 'do', '.']
Tokens after punctuation removal: ['until', 'recently', 'the', 'conventional', 'wisdom', 'was', 'that', 'while', 'ai', 'was', 'better', 'than', 'humans', 'at', 'decision', 'making', 'tasks', 'it', 'was', 'still', 'inferior', 'to', 'humans', 'for', 'cognitive', 'and', 'creative', 'ones', 'but', 'in', 'the', 'past', 'two', 'years', 'ai', 'has', 'advanced', 'by', 'leaps', 'and', 'bounds', 'changing', 'common', 'notions', 'of', 'what', 'this', 'technology', 'can', 'do']
Tokens after stopword remo

In [37]:
df

Unnamed: 0,original,cleaned
0,"Until recently, the conventional wisdom was th...",recently conventional wisdom ai good human dec...
1,The most visible advances have been in what’s ...,visible advance call natural language processi...
2,Yet while these stunts may be attention grabbi...,yet stunt may attention grab really indicative...
3,What NLP Can Do,nlp
4,The best known natural language processing too...,best know natural language processing tool ope...
5,"For businesses, the three areas where GPT-3 ha...",business three area appear promise write cod r...
6,Models like GPT-3 are considered to be foundat...,model like consider foundation model emerge ai...
7,A Language-Based AI Research Assistant,ai research assistant
8,"In my own work, I’ve been looking at how GPT-3...",work look tool assist researcher research proc...
9,I’ve found — not surprisingly — that Elicit wo...,find surprisingly elicit work well task others...
