#### Exercise 1

#### Import libraries

In [10]:
import os
import string
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
# Fetch the NLTK resources used below: the Punkt tokenizer data needed by
# word_tokenize and the English stop-word list.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer data from 'punkt_tab' rather than
# 'punkt'; downloading both keeps word_tokenize working on old and new versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\seakl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\seakl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Define the sentences

In [2]:
# The two example sentences that are tokenized and vectorized in this exercise.
sentence1 = "Welcome to NLP Learning, Now start learning"
sentence2 = "Learning is a good practice"

#### Convert to lowercase and tokenize

In [3]:
# Lowercase each sentence first so that "Learning" and "learning" become the
# same token, then split it into word/punctuation tokens.
tokens1, tokens2 = (
    word_tokenize(text.lower()) for text in (sentence1, sentence2)
)

In [4]:
# Display the tokens of the first sentence.
tokens1

['welcome', 'to', 'nlp', 'learning', ',', 'now', 'start', 'learning']

#### Build the vocabulary from both token lists

In [5]:
# Vocabulary in order of first appearance.
# dict.fromkeys keeps only the first occurrence of each key and preserves
# insertion order, so this deduplicates in one pass instead of scanning the
# growing list for every token.
vocab = list(dict.fromkeys(tokens1 + tokens2))

In [6]:
# Display the combined vocabulary.
vocab

['welcome',
 'to',
 'nlp',
 'learning',
 ',',
 'now',
 'start',
 'is',
 'a',
 'good',
 'practice']

#### Bag of Words

In [7]:

def bow_vector(tokens, vocab):
    """Return the bag-of-words count vector of ``tokens`` over ``vocab``.

    Args:
        tokens: Iterable of hashable tokens (e.g. a list of words) for one
            document.
        vocab: Ordered sequence of vocabulary words; its order fixes the
            layout of the returned vector.

    Returns:
        A list with one integer per entry of ``vocab``: the number of times
        that word occurs in ``tokens`` (0 when it does not occur).
    """
    # Count every token once up front instead of rescanning the whole token
    # list for each vocabulary word.
    counts = Counter(tokens)
    return [counts[word] for word in vocab]

# One count vector per sentence, both laid out over the shared vocabulary.
vec1, vec2 = (bow_vector(sentence_tokens, vocab) for sentence_tokens in (tokens1, tokens2))

In [8]:
# Display the bag-of-words vector of the first sentence.
vec1

[1, 1, 1, 2, 1, 1, 1, 0, 0, 0, 0]

#### Remove stop words

In [None]:

# Drop English stop words from each token list. Punctuation tokens are not
# stop words, so they are kept here.
stop_words = set(stopwords.words('english'))
filtered_tokens1 = [word for word in tokens1 if word not in stop_words]
filtered_tokens2 = [word for word in tokens2 if word not in stop_words]

# Vocabulary of the remaining tokens in order of first appearance.
# dict.fromkeys deduplicates while preserving insertion order, replacing the
# per-token scan of a growing list.
filtered_vocab = list(dict.fromkeys(filtered_tokens1 + filtered_tokens2))

filtered_vec1 = bow_vector(filtered_tokens1, filtered_vocab)
filtered_vec2 = bow_vector(filtered_tokens2, filtered_vocab)

# Show the unfiltered and filtered representations side by side.
print("Vocabulary:", vocab)
print("BoW Vector for Sentence 1:", vec1)
print("BoW Vector for Sentence 2:", vec2)
print("Filtered Vocabulary:", filtered_vocab)
print("Filtered BoW Vector for Sentence 1:", filtered_vec1)
print("Filtered BoW Vector for Sentence 2:", filtered_vec2)


Vocabulary: ['welcome', 'to', 'nlp', 'learning', ',', 'now', 'start', 'is', 'a', 'good', 'practice']
BoW Vector for Sentence 1: [1, 1, 1, 2, 1, 1, 1, 0, 0, 0, 0]
BoW Vector for Sentence 2: [0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1]
Filtered Vocabulary: ['welcome', 'nlp', 'learning', ',', 'start', 'good', 'practice']
Filtered BoW Vector for Sentence 1: [1, 1, 2, 1, 1, 0, 0]
Filtered BoW Vector for Sentence 2: [0, 0, 1, 0, 0, 1, 1]


#### Exercise 2

In [None]:


# Two short example sentences to vectorize with scikit-learn.
sentence1 = "This is a good job. I will not miss it for anything"
sentence2 = "This is not good at all"

# Fixed vocabulary: only these three words get a column, in this order.
vocab = ["good", "job", "miss"]

vectorizer = CountVectorizer(vocabulary=vocab)

# Document-term count matrix: one row per sentence, one column per vocabulary word.
X = vectorizer.fit_transform([sentence1, sentence2])

# Print the column labels, then the counts as a dense array.
print(vectorizer.get_feature_names_out())
print(X.toarray())


['good' 'job' 'miss']
[[1 1 1]
 [1 0 0]]


#### Exercise 3

In [11]:

# 1. Define sample documents (the same two sentences as in Exercise 2)
documents = [
    "This is a good job. I will not miss it for anything",
    "This is not good at all"
]

In [12]:
# 2. Lowercase every document, then split it into tokens
tokenized_docs = [word_tokenize(text) for text in map(str.lower, documents)]

In [13]:
# 3. Build the lookup sets used to remove stopwords and punctuation
# English stop-word list from NLTK, as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Individual ASCII punctuation characters from the standard library.
punct = set(string.punctuation)

In [14]:
# Keep only tokens that are neither stop words nor punctuation.
# A token counts as punctuation when every one of its characters is a
# punctuation mark, so multi-character tokens such as "..." are removed too,
# not just single punctuation characters.
filtered_docs = [
    [
        word
        for word in tokens
        if word not in stop_words and not all(ch in punct for ch in word)
    ]
    for tokens in tokenized_docs
]

print("Filtered documents:", filtered_docs)

Filtered documents: [['good', 'job', 'miss', 'anything'], ['good']]


In [15]:
# 4. Vocabulary: every distinct word across the filtered documents, sorted alphabetically
vocab = sorted(set().union(*filtered_docs))
print("Vocabulary:", vocab)

Vocabulary: ['anything', 'good', 'job', 'miss']


In [16]:

# 5. Build one word-count dictionary per document
bow_list = []

for doc_tokens in filtered_docs:
    # Every vocabulary word gets an entry, so absent words show up with count 0.
    counts = {term: 0 for term in vocab}
    for term in doc_tokens:
        counts[term] = counts[term] + 1
    bow_list.append(counts)

In [None]:
# 6. Show each document's bag-of-words dictionary under a numbered heading
for i, bow in enumerate(bow_list):
    print(f"\nBoW for Document {i+1}:", bow, sep="\n")


BoW for Document 1:
{'anything': 1, 'good': 1, 'job': 1, 'miss': 1}

BoW for Document 2:
{'anything': 0, 'good': 1, 'job': 0, 'miss': 0}


#### Exercise 4

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from tabulate import tabulate

In [2]:
# Location of the IMDB reviews CSV. Set the IMDB_CSV_PATH environment variable
# to point at your own copy; the fallback is the original author's local path.
DATA_PATH = os.environ.get(
    "IMDB_CSV_PATH",
    r"C:\Users\seakl\Documents\I5-AMS\WR\TP\TP-03\Data\IMDB Dataset.csv",
)
df = pd.read_csv(DATA_PATH)

# Preview the first rows with the review text cut to 80 characters: printing
# the full reviews makes the table thousands of characters wide.
preview = df.head().assign(review=lambda frame: frame['review'].str.slice(0, 80))
print(tabulate(preview, headers='keys', tablefmt='fancy_grid'))

╒════╤══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════

In [3]:
# Bag-of-words features: learn the vocabulary from the review text (dropping
# scikit-learn's built-in English stop words) and count terms per review.
# NOTE(review): the vocabulary is fitted on all reviews before the train/test
# split below — confirm whether fitting on the training portion only is wanted.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['review'])
# Raw sentiment labels taken from the 'sentiment' column.
y = df['sentiment']


In [4]:
## Encode labels
# Map from the original column rather than from `y` itself so that re-running
# this cell is idempotent: mapping already-encoded 0/1 values through this
# dictionary would turn every label into NaN.
y = df['sentiment'].map({'positive': 1, 'negative': 0})


In [29]:
# Display the (rows, columns) dimensions of the feature matrix.
X.shape

(50000, 101583)

In [5]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [31]:
# Train a decision tree on the training split and evaluate it on the held-out
# test split.
# random_state is fixed because tree construction involves randomness (features
# are permuted at each split), so unseeded runs can give different scores.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))


Decision Tree Accuracy: 0.7317
              precision    recall  f1-score   support

    negative       0.73      0.73      0.73      4961
    positive       0.74      0.73      0.73      5039

    accuracy                           0.73     10000
   macro avg       0.73      0.73      0.73     10000
weighted avg       0.73      0.73      0.73     10000



In [6]:
# Train a 100-tree random forest on the training split and evaluate it on the
# held-out test split.
# random_state is fixed because the forest draws random bootstrap samples and
# feature subsets; without a seed the reported scores change between runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.8634
              precision    recall  f1-score   support

           0       0.86      0.87      0.86      4961
           1       0.87      0.86      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [7]:
# Classify an unseen review: reuse the already-fitted vectorizer so the review
# is encoded over the same columns the forest was trained on.
new_review = ["This movie is very interesting and exciting"]
new_review_vector = vectorizer.transform(new_review)

# predict returns one label per input row; there is a single row here.
predicted_label = rf.predict(new_review_vector)[0]
print("Predicted Sentiment (RF):", predicted_label)


Predicted Sentiment (RF): 1
