In [1]:
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint

def size_mb(docs):
    return sum(len(s.encode("utf-8")) for s in docs) / 1e6

data_train = fetch_20newsgroups(subset='train')
data_test = fetch_20newsgroups(subset='test')

pprint(data_train.target_names)

print(f'Total of {len(data_train.data)} posts in the dataset and the total size is {size_mb(data_train.data):.2f}MB')

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']
Total of 11314 posts in the dataset and the total size is 22.05MB


In [2]:
print("First few documents in the dataset:")
for i in range(5):  # Print the first 5 documents
    print(f"Document {i + 1}:")
    print(data_train.data[i])  # Print the content of the document
    print("=" * 50)  # Print a separator line

First few documents in the dataset:
Document 1:
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----





Document 2:
From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final call for SI clock reports
Keywords: SI,acceleration,clock,upgrade
Article-I.D.: shelley.1qvfo9INNc3s
Organization: Unive

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorize Training data
vectorizer = CountVectorizer(stop_words="english")
X_train = vectorizer.fit_transform(data_train.data)
print(f'Size of the vocabulary is {len(vectorizer.get_feature_names_out())}')

# Vectorize Testing data
X_test = vectorizer.transform(data_test.data)
y_train, y_test = data_train.target, data_test.target


print(X_train.shape)
print(y_train.shape)

Size of the vocabulary is 129796
(11314, 129796)
(11314,)


In [6]:
# Part b: Baseline classifier using Sklearn's DummyClassifier
baseline_clf = DummyClassifier(strategy="most_frequent")
baseline_clf = baseline_clf.fit(X_train, y_train)

NameError: name 'DummyClassifier' is not defined

In [41]:
# Part c: Measure computation time for baseline classifier
baseline_time = timeit.timeit(lambda: baseline_clf.predict(X_test), number=1)


In [44]:
# Part d: Computation time and classification accuracy 

import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

pred = baseline_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'Classification accuracy {acc:.2f}')
print("Time for sample classification:", baseline_time,"secs")

Classification accuracy 0.05
Time for sample classification: 0.00014507600280921906 secs


In [62]:
# Part e: Implement own nearest neighbor classifier

import numpy as np

def nearest_neighbor_classifier(X_train, y_train, X_test):
    predictions = []
    for test_sample in X_test:
        test_sample_dense = test_sample.toarray()  
        distances = np.linalg.norm(X_train - test_sample_dense, axis=1) 
        nearest_index = np.argmin(distances) 
        prediction = y_train[nearest_index]  
        predictions.append(prediction)
    return np.array(predictions)


predictions = nearest_neighbor_classifier(X_train, y_train, X_test)

# Print the predictions
print("Predictions:", predictions)

KeyboardInterrupt: 

In [65]:
X_test.shape

(7532, 129796)

In [66]:
# Part f: Computation time and classification accuracy and portion sample size

# Select only 5 test samples
X_test_subset = X_test[:5]
y_test_subset = y_test[:5]
predictions = nearest_neighbor_classifier(X_train, y_train, X_test_subset)
print(predictions)


[19  4  0 19  0]


In [67]:
computation_time = timeit.timeit(lambda: nearest_neighbor_classifier(X_train, y_train, X_test_subset), number=1)
acc = accuracy_score(y_test_subset, predictions)

In [68]:
print("Sample Size: 5")
print(f'Classification accuracy {acc:.2f}')
print("Time for sample classification:", computation_time,"secs")

Sample Size: 5
Classification accuracy 0.20
Time for sample classification: 252.55178767200414 secs


In [72]:
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)

predictions = knn_classifier.predict(X_test_subset)
computation_time = timeit.timeit(lambda: knn_classifier.predict(X_test_subset), number=1)
acc = accuracy_score(y_test_subset, predictions)

print("Predictions:", predictions)
print(f'Classification accuracy {acc:.2f}')
print("Time for sample classification:", computation_time,"secs")

Predictions: [19  4  0  3  0]
Classification accuracy 0.20
Time for sample classification: 0.03415647000656463 secs
