In [None]:
# About Codecademy Project

# In this project, we will use scikit-learn’s Naive Bayes implementation on
# several different datasets. By reporting the accuracy of the classifier, we
# can find which datasets are harder to distinguish. For example, how difficult
# do you think it is to distinguish the difference between emails about hockey
# and emails about soccer? How hard is it to tell the difference between emails
# about hockey and emails about tech? In this project, we’ll find out exactly
# how difficult those two tasks are.

In [None]:
# Examining the Data

# Modules
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# Load the dataset with default arguments. The result is a scikit-learn
# Bunch (not a pandas DataFrame) exposing .data, .target and .target_names.
emails = fetch_20newsgroups()

# Show every newsgroup label available in the dataset
print(f"Target Names:\n {emails.target_names}\n")

# Re-fetch, this time restricted to the two sports newsgroups of interest.
# Note: this rebinds `emails`, so the full dataset loaded above is dropped.
emails = fetch_20newsgroups(
    categories=['rec.sport.baseball', 'rec.sport.hockey'],
)

# Confirm that only the two requested labels remain
print(f" New Target Names:\n {emails.target_names}\n")

# Display the raw text of the email stored at index 5
# (zero-based indexing, so this is the sixth email in the list)
print(f"5th Email:\n {emails.data[5]} End of Email\n")

# Display the label of that same email; the value is an integer position
# into emails.target_names (0 -> baseball, 1 -> hockey for this selection)
print(f"Class of 5th Email: {emails.target[5]}\n")

Target Names:
 ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

 New Target Names:
 ['rec.sport.baseball', 'rec.sport.hockey']

5th Email:
 From: mmb@lamar.ColoState.EDU (Michael Burger)
Subject: More TV Info
Distribution: na
Nntp-Posting-Host: lamar.acns.colostate.edu
Organization: Colorado State University, Fort Collins, CO  80523
Lines: 36

United States Coverage:
Sunday April 18
  N.J./N.Y.I. at Pittsburgh - 1:00 EDT to Eastern Time Zone
  ABC - Gary Thorne and Bill Clement

  St. Louis at Chicago - 12:00 CDT and 11:00 MDT - to Central/Mountain Zones
  ABC - Mike Emerick and Jim Schoenfeld

  Los Angeles at Calgary - 12:00 PDT and 11:00 ADT -

In [None]:
# Making Training and Test Sets

# Build the training and test sets in one pass. Both fetches share the same
# categories, shuffling and seed; only the `subset` argument differs.
# The generator yields the 'train' split first and the 'test' split second,
# matching the unpacking order on the left-hand side.
emails_train, emails_test = (
    fetch_20newsgroups(
        categories=['rec.sport.baseball', 'rec.sport.hockey'],
        subset=split_name,
        shuffle=True,
        random_state=42,  # fixed seed so the shuffled order is reproducible
    )
    for split_name in ('train', 'test')
)

In [None]:
# Counting Words

# Instantiate count vectorizer
counter = CountVectorizer()

# Learn the vocabulary from the TRAINING emails only and count them in the
# same pass. Fitting on the test emails as well would leak test-set
# vocabulary into feature extraction (and, through the number of feature
# columns, into the Naive Bayes smoothing), making the evaluation optimistic.
# The reported accuracy may therefore differ slightly from a run where the
# vectorizer was fitted on train + test data.
train_counts = counter.fit_transform(emails_train.data)

# Encode the test emails with the vocabulary learned from the training set.
# Words that never appeared in training have no column and are ignored,
# which mirrors what happens with genuinely unseen emails.
test_counts = counter.transform(emails_test.data)

In [None]:
# Naive Bayes Classifier

# Create the multinomial Naive Bayes model and train it on the training
# word counts and their labels. fit() returns the estimator itself, so
# nb_classifier is bound to the fitted model.
nb_classifier = MultinomialNB().fit(train_counts, emails_train.target)

# Evaluate on the held-out test set; score() reports mean accuracy
test_accuracy = nb_classifier.score(test_counts, emails_test.target)
print(f"Score: {test_accuracy}")

# To experiment, change the `categories` argument passed to
# fetch_20newsgroups in the cells above and re-run the notebook to see how
# well other pairs of newsgroups can be told apart.

Score: 0.9723618090452262
