<a href="https://colab.research.google.com/github/Jondoloh/Data-Science-in-practice_STA2546/blob/main/STA2546_Data_Analytics_in_Practice_p04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1) Libraries and Dependencies

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn import metrics

import numpy as np
import warnings
warnings.simplefilter('ignore')

# 2) Preliminary Data Analysis

#### a) Data extraction and train/test split

In [None]:
# Fetch the predefined 'train' and 'test' subsets of the 20 newsgroups
# dataset (the loader itself is imported in the first cell).
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)

# Report the number of documents in each subset.
for split_bunch in (newsgroups_train, newsgroups_test):
    print(len(split_bunch.data))

11314
7532


#### b) Mapping to new categories

In [None]:
# Collapse the 20 original newsgroup labels into 6 broader categories.
# The position of a name in target_names_new is its new integer label.
target_names_new = ["Religion", "Technology", "Other", "Recreation", "Science", "Politics"]

# Original label id -> merged label id, listed by merged category.
# NOTE(review): if the dataset's label ids follow alphabetical newsgroup order,
# id 19 would be talk.religion.misc (mapped to Politics here) and id 12 would be
# sci.electronics (mapped to Technology) — confirm both groupings are intended.
old_to_new_target = {
    0: 0, 15: 0,                            # Religion
    1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 12: 1,    # Technology
    6: 2,                                   # Other
    7: 3, 8: 3, 9: 3, 10: 3,                # Recreation
    11: 4, 13: 4, 14: 4,                    # Science
    16: 5, 17: 5, 18: 5, 19: 5,             # Politics
}

# Translate every document's original label into its merged label.
new_target_list_train = list(map(old_to_new_target.__getitem__, newsgroups_train.target))
new_target_list_test = list(map(old_to_new_target.__getitem__, newsgroups_test.target))

# Attach the merged labels and their display names to both subsets.
for bunch, merged_labels in (
    (newsgroups_train, new_target_list_train),
    (newsgroups_test, new_target_list_test),
):
    bunch["new_target"] = np.array(merged_labels)
    bunch["new_target_names"] = np.array(target_names_new)

#### c) Vectorizing the data

In [None]:
# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the training documents and encode them in one step.
train_docs = newsgroups_train.data
vectors = vectorizer.fit_transform(train_docs)

# (number of training documents, vocabulary size)
vectors.shape


(11314, 130107)

# 3) First Iteration of the model
#### (train based on data including headers/footers/quotes)

### Multinomial NB Model

In [None]:
# Encode the test documents with the vocabulary learned from the training set.
vectors_test = vectorizer.transform(newsgroups_test.data)

# Multinomial Naive Bayes (alpha=0.01) trained on the merged category labels.
clf = MultinomialNB(alpha=.01).fit(vectors, newsgroups_train.new_target)

# Predicted merged category for every test document.
pred = clf.predict(vectors_test)

# F1 on the test set under two averaging schemes: 'weighted' and 'macro'.
score1, score2 = (
    metrics.f1_score(newsgroups_test.new_target, pred, average=averaging)
    for averaging in ('weighted', 'macro')
)

print("Weighted Average F1-Score:", score1)
print("Macro Average F1-Score:", score2)


Weighted Average F1-Score: 0.9057288478213792
Macro Average F1-Score: 0.8778762480896006


#### a) Look at top ten feature/words after model training

In [None]:
def show_top10(classifier, vectorizer, categories):
    """Print the ten highest-weighted vocabulary terms for each category.

    Args:
        classifier: Fitted model exposing per-class feature weights, either as
            ``feature_log_prob_`` (naive Bayes) or ``coef_`` (linear models),
            with one row per entry in ``categories``.
        vectorizer: Fitted vectorizer providing ``get_feature_names_out()``
            (or the older ``get_feature_names()``).
        categories: Category names, ordered like the rows of the weight matrix.

    Raises:
        ValueError: If the number of weight rows differs from the number of
            categories (for example a one-vs-one ``SVC``, whose ``coef_`` holds
            one row per pair of classes rather than one per class).
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new name
    # and fall back so older installations keep working.
    names_getter = getattr(vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    feature_names = np.asarray(names_getter())

    # MultinomialNB lost its coef_ alias in scikit-learn 1.1; the per-class
    # log-probabilities are the same per-class weight table.
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_
    # Models fitted on sparse input may expose a sparse weight matrix, which
    # np.argsort cannot rank; densify it first.
    if hasattr(weights, "toarray"):
        weights = weights.toarray()
    weights = np.asarray(weights)

    categories = list(categories)
    if weights.shape[0] != len(categories):
        raise ValueError(
            "classifier exposes %d weight rows but %d categories were given; "
            "one-vs-one models such as sklearn.svm.SVC have one row per class "
            "pair, so per-category top terms need a one-vs-rest linear model "
            "(e.g. LinearSVC)" % (weights.shape[0], len(categories))
        )

    for i, category in enumerate(categories):
        # argsort is ascending, so the last ten indices are the largest weights.
        top10 = np.argsort(weights[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))

# Print the ten highest-weighted terms for each of the six merged categories.
show_top10(clf, vectorizer, newsgroups_train.new_target_names)

Religion: it you god in and is that to of the
Technology: that for in edu it is and of to the
Other: shipping offer of 00 to and edu the for sale
Recreation: you is that it edu of and in to the
Science: be edu it that in is and of to the
Politics: edu it is you that in and to of the


#### b) Test out with 4 made-up text strings

In [None]:
# Four hand-written sentences that are not part of the dataset.
docs_new = ['God is love', 'ChatGPT is useful for assignments', 'Vote Jimmy for president', 'I need a trip']

# Encode them with the fitted vectorizer and classify them.
X_new_counts = vectorizer.transform(docs_new)
predicted = clf.predict(X_new_counts)

# Look up every predicted label's name at once, then print sentence => category.
predicted_names = newsgroups_train.new_target_names[predicted]
for doc, label_name in zip(docs_new, predicted_names):
    print('%r => %s' % (doc, label_name))

'God is love' => Religion
'ChatGPT is useful for assignments' => Technology
'Vote Jimmy for president' => Politics
'I need a trip' => Recreation


### SVM Model

In [None]:
# Support vector classifier with a linear kernel (C=1, random_state=42),
# fitted on the training vectors and the merged category labels.
clf = SVC(kernel='linear', C=1, random_state=42).fit(vectors, newsgroups_train.new_target)

# Predicted merged category for every test document.
pred = clf.predict(vectors_test)

# F1 on the test set under two averaging schemes: 'weighted' and 'macro'.
score1, score2 = (
    metrics.f1_score(newsgroups_test.new_target, pred, average=averaging)
    for averaging in ('weighted', 'macro')
)

print("Weighted Average F1-Score:", score1)
print("Macro Average F1-Score:", score2)


Weighted Average F1-Score: 0.910525601732771
Macro Average F1-Score: 0.8989236822491088


In [None]:
# Exploring the ten highest-weighted features for each category
def show_top10(classifier, vectorizer, categories):
    """Print the ten highest-weighted vocabulary terms for each category.

    Args:
        classifier: Fitted model exposing per-class feature weights, either as
            ``feature_log_prob_`` (naive Bayes) or ``coef_`` (linear models),
            with one row per entry in ``categories``.
        vectorizer: Fitted vectorizer providing ``get_feature_names_out()``
            (or the older ``get_feature_names()``).
        categories: Category names, ordered like the rows of the weight matrix.

    Raises:
        ValueError: If the number of weight rows differs from the number of
            categories (for example a one-vs-one ``SVC``, whose ``coef_`` holds
            one row per pair of classes rather than one per class).
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new name
    # and fall back so older installations keep working.
    names_getter = getattr(vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    feature_names = np.asarray(names_getter())

    # MultinomialNB lost its coef_ alias in scikit-learn 1.1; the per-class
    # log-probabilities are the same per-class weight table.
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_
    # Models fitted on sparse input may expose a sparse weight matrix, which
    # np.argsort cannot rank; densify it first.
    if hasattr(weights, "toarray"):
        weights = weights.toarray()
    weights = np.asarray(weights)

    categories = list(categories)
    if weights.shape[0] != len(categories):
        raise ValueError(
            "classifier exposes %d weight rows but %d categories were given; "
            "one-vs-one models such as sklearn.svm.SVC have one row per class "
            "pair, so per-category top terms need a one-vs-rest linear model "
            "(e.g. LinearSVC)" % (weights.shape[0], len(categories))
        )

    for i, category in enumerate(categories):
        # argsort is ascending, so the last ten indices are the largest weights.
        top10 = np.argsort(weights[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))

# NOTE(review): clf here is SVC(kernel='linear'); its multiclass coef_ is
# presumably laid out one row per pair of classes rather than one per category,
# so confirm the rows this call reads really correspond to the category names.
show_top10(clf, vectorizer, newsgroups_train.new_target_names)

In [None]:
# Try the SVM on four hand-written sentences that are not part of the dataset.
docs_new = ['God is love', 'ChatGPT is useful for assignments', 'Vote Jimmy for president', 'I need a trip']

# Encode them with the fitted vectorizer and classify them.
X_new_counts = vectorizer.transform(docs_new)
predicted = clf.predict(X_new_counts)

# Look up every predicted label's name at once, then print sentence => category.
predicted_names = newsgroups_train.new_target_names[predicted]
for doc, label_name in zip(docs_new, predicted_names):
    print('%r => %s' % (doc, label_name))

'God is love' => Religion
'ChatGPT is useful for assignments' => Technology
'Vote Jimmy for president' => Technology
'I need a trip' => Technology


# 4) Second Iteration of the model
#### (train based on data including headers/footers/quotes, dropping common words)

In [None]:
# Second pass: rebuild the TF-IDF representation of the training documents,
# this time with stop_words='english'.
vectorizer = TfidfVectorizer(stop_words='english')
train_docs = newsgroups_train.data
vectors = vectorizer.fit_transform(train_docs)

# (number of training documents, vocabulary size)
vectors.shape

(11314, 129796)

### Multinomial NB Model

In [None]:
# Training, testing and evaluation of the Multinomial NB model
# on the stop-word-filtered vectors.
vectors_test = vectorizer.transform(newsgroups_test.data)
clf = MultinomialNB(alpha=.01).fit(vectors, newsgroups_train.new_target)
pred = clf.predict(vectors_test)

# F1 on the test set under two averaging schemes: 'weighted' and 'macro'.
score1, score2 = (
    metrics.f1_score(newsgroups_test.new_target, pred, average=averaging)
    for averaging in ('weighted', 'macro')
)

print("Weighted Average F1-Score:", score1)
print("Macro Average F1-Score:", score2)

Weighted Average F1-Score: 0.9061555127259279
Macro Average F1-Score: 0.8770841773714819


In [None]:
# Exploring the ten highest-weighted features for each category
def show_top10(classifier, vectorizer, categories):
    """Print the ten highest-weighted vocabulary terms for each category.

    Args:
        classifier: Fitted model exposing per-class feature weights, either as
            ``feature_log_prob_`` (naive Bayes) or ``coef_`` (linear models),
            with one row per entry in ``categories``.
        vectorizer: Fitted vectorizer providing ``get_feature_names_out()``
            (or the older ``get_feature_names()``).
        categories: Category names, ordered like the rows of the weight matrix.

    Raises:
        ValueError: If the number of weight rows differs from the number of
            categories (for example a one-vs-one ``SVC``, whose ``coef_`` holds
            one row per pair of classes rather than one per class).
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new name
    # and fall back so older installations keep working.
    names_getter = getattr(vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    feature_names = np.asarray(names_getter())

    # MultinomialNB lost its coef_ alias in scikit-learn 1.1; the per-class
    # log-probabilities are the same per-class weight table.
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_
    # Models fitted on sparse input may expose a sparse weight matrix, which
    # np.argsort cannot rank; densify it first.
    if hasattr(weights, "toarray"):
        weights = weights.toarray()
    weights = np.asarray(weights)

    categories = list(categories)
    if weights.shape[0] != len(categories):
        raise ValueError(
            "classifier exposes %d weight rows but %d categories were given; "
            "one-vs-one models such as sklearn.svm.SVC have one row per class "
            "pair, so per-category top terms need a one-vs-rest linear model "
            "(e.g. LinearSVC)" % (weights.shape[0], len(categories))
        )

    for i, category in enumerate(categories):
        # argsort is ascending, so the last ten indices are the largest weights.
        top10 = np.argsort(weights[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))

# Print the ten highest-weighted terms for each of the six merged categories.
show_top10(clf, vectorizer, newsgroups_train.new_target_names)

Religion: church com christians christian bible keith people jesus edu god
Technology: host use thanks university organization subject lines com windows edu
Other: lines condition distribution university new shipping offer 00 edu sale
Recreation: subject organization game writes team article car ca com edu
Science: article writes nasa chip encryption clipper space key com edu
Politics: israeli government don article gun writes israel people com edu


In [None]:
# Try the model on four hand-written sentences that are not part of the dataset.
docs_new = ['God is love', 'ChatGPT is useful for assignments', 'Vote Jimmy for president', 'I need a trip']

# Encode them with the fitted vectorizer and classify them.
X_new_counts = vectorizer.transform(docs_new)
predicted = clf.predict(X_new_counts)

# Look up every predicted label's name at once, then print sentence => category.
predicted_names = newsgroups_train.new_target_names[predicted]
for doc, label_name in zip(docs_new, predicted_names):
    print('%r => %s' % (doc, label_name))

'God is love' => Religion
'ChatGPT is useful for assignments' => Technology
'Vote Jimmy for president' => Politics
'I need a trip' => Recreation


### SVM Model

In [None]:
# Train, test and evaluate the linear-kernel SVM on the stop-word-filtered vectors.
clf = SVC(kernel='linear', C=1, random_state=42).fit(vectors, newsgroups_train.new_target)
pred = clf.predict(vectors_test)

# F1 on the test set under two averaging schemes: 'weighted' and 'macro'.
score1, score2 = (
    metrics.f1_score(newsgroups_test.new_target, pred, average=averaging)
    for averaging in ('weighted', 'macro')
)

print("Weighted Average F1-Score:", score1)
print("Macro Average F1-Score:", score2)

Weighted Average F1-Score: 0.912611928711664
Macro Average F1-Score: 0.8992232049992123


In [None]:
# Try the SVM on four hand-written sentences that are not part of the dataset.
docs_new = ['God is love', 'ChatGPT is useful for assignments', 'Vote Jimmy for president', 'I need a trip']

# Encode them with the fitted vectorizer and classify them.
X_new_counts = vectorizer.transform(docs_new)
predicted = clf.predict(X_new_counts)

# Look up every predicted label's name at once, then print sentence => category.
predicted_names = newsgroups_train.new_target_names[predicted]
for doc, label_name in zip(docs_new, predicted_names):
    print('%r => %s' % (doc, label_name))

'God is love' => Religion
'ChatGPT is useful for assignments' => Technology
'Vote Jimmy for president' => Politics
'I need a trip' => Technology


# 5) Third Iteration of the model
#### (train based on data excluding headers/footers/quotes, dropping common words)

In [None]:
# Fetch the 'train' and 'test' subsets again, this time asking the loader to
# remove headers, footers and quotes from every document.
newsgroups_train_2, newsgroups_test_2 = (
    fetch_20newsgroups(subset=split, remove=('headers', 'footers', 'quotes'))
    for split in ('train', 'test')
)


In [None]:
# Names of the six merged categories; a name's position is its integer label.
target_names_new = ["Religion", "Technology", "Other", "Recreation", "Science", "Politics"]

# Original label id -> merged label id, listed by merged category.
# NOTE(review): if the dataset's label ids follow alphabetical newsgroup order,
# id 19 would be talk.religion.misc (mapped to Politics here) and id 12 would be
# sci.electronics (mapped to Technology) — confirm both groupings are intended.
old_to_new_target = {
    0: 0, 15: 0,                            # Religion
    1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 12: 1,    # Technology
    6: 2,                                   # Other
    7: 3, 8: 3, 9: 3, 10: 3,                # Recreation
    11: 4, 13: 4, 14: 4,                    # Science
    16: 5, 17: 5, 18: 5, 19: 5,             # Politics
}

# Attach the merged labels and their display names to the stripped
# training subset first, then to the stripped test subset.
for bunch in (newsgroups_train_2, newsgroups_test_2):
    bunch["new_target"] = np.array([old_to_new_target[label] for label in bunch.target])
    bunch["new_target_names"] = np.array(target_names_new)


In [None]:
# Vectorize the stripped training documents.
# The vectorizer is constructed explicitly (English stop words removed, as this
# section's heading states) so the cell no longer depends on whichever
# `vectorizer` object an earlier section happened to leave in the kernel.
# The name `vectorizer` is kept because the following cells read it.
vectorizer = TfidfVectorizer(stop_words='english')
vectors_2 = vectorizer.fit_transform(newsgroups_train_2.data)

# (number of training documents, vocabulary size)
vectors_2.shape

(11314, 101322)

### Multinomial NB Model

In [None]:
# Training, testing and evaluating the Multinomial NB model on the stripped data.
vectors_test_2 = vectorizer.transform(newsgroups_test_2.data)
clf = MultinomialNB(alpha=.01).fit(vectors_2, newsgroups_train_2.new_target)
pred = clf.predict(vectors_test_2)

# F1 on the stripped test set under two averaging schemes: 'weighted' and 'macro'.
score1, score2 = (
    metrics.f1_score(newsgroups_test_2.new_target, pred, average=averaging)
    for averaging in ('weighted', 'macro')
)

print("Weighted Average F1-Score:", score1)
print("Macro Average F1-Score:", score2)

Weighted Average F1-Score: 0.8163039273316107
Macro Average F1-Score: 0.7825500811273862


In [None]:
# Exploring the ten highest-weighted features for each category
def show_top10(classifier, vectorizer, categories):
    """Print the ten highest-weighted vocabulary terms for each category.

    Args:
        classifier: Fitted model exposing per-class feature weights, either as
            ``feature_log_prob_`` (naive Bayes) or ``coef_`` (linear models),
            with one row per entry in ``categories``.
        vectorizer: Fitted vectorizer providing ``get_feature_names_out()``
            (or the older ``get_feature_names()``).
        categories: Category names, ordered like the rows of the weight matrix.

    Raises:
        ValueError: If the number of weight rows differs from the number of
            categories (for example a one-vs-one ``SVC``, whose ``coef_`` holds
            one row per pair of classes rather than one per class).
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new name
    # and fall back so older installations keep working.
    names_getter = getattr(vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    feature_names = np.asarray(names_getter())

    # MultinomialNB lost its coef_ alias in scikit-learn 1.1; the per-class
    # log-probabilities are the same per-class weight table.
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_
    # Models fitted on sparse input may expose a sparse weight matrix, which
    # np.argsort cannot rank; densify it first.
    if hasattr(weights, "toarray"):
        weights = weights.toarray()
    weights = np.asarray(weights)

    categories = list(categories)
    if weights.shape[0] != len(categories):
        raise ValueError(
            "classifier exposes %d weight rows but %d categories were given; "
            "one-vs-one models such as sklearn.svm.SVC have one row per class "
            "pair, so per-category top terms need a one-vs-rest linear model "
            "(e.g. LinearSVC)" % (weights.shape[0], len(categories))
        )

    for i, category in enumerate(categories):
        # argsort is ascending, so the last ten indices are the largest weights.
        top10 = np.argsort(weights[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))

# Print the ten highest-weighted terms for each of the six merged categories.
show_top10(clf, vectorizer, newsgroups_train_2.new_target_names)

Religion: just say believe church don think bible people jesus god
Technology: file drive problem card like does know use thanks windows
Other: asking email sell price condition new shipping offer 00 sale
Recreation: good think don bike like just year team game car
Science: chip know like just don clipper people encryption space key
Politics: like know did think gun government just israel don people


In [None]:
# Testing our model with new external data: four hand-written sentences
# that are not part of the dataset.
docs_new = ['God is love', 'ChatGPT is useful for assignments', 'Vote Jimmy for president', 'I need a trip']
X_new_counts = vectorizer.transform(docs_new)

predicted = clf.predict(X_new_counts)

# Category names are read from newsgroups_train_2, the dataset this section's
# model was trained on, so the cell does not rely on the section-2 dataset
# object still being present in the kernel.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, newsgroups_train_2.new_target_names[category]))

'God is love' => Religion
'ChatGPT is useful for assignments' => Science
'Vote Jimmy for president' => Recreation
'I need a trip' => Recreation


### SVM Model

In [None]:
# Train, test and evaluate the linear-kernel SVM (C=1, random_state=42)
# on the vectors built from the stripped documents.
clf = SVC(kernel='linear', C=1, random_state=42).fit(vectors_2, newsgroups_train_2.new_target)
pred = clf.predict(vectors_test_2)

# F1 on the stripped test set under two averaging schemes: 'weighted' and 'macro'.
score1, score2 = (
    metrics.f1_score(newsgroups_test_2.new_target, pred, average=averaging)
    for averaging in ('weighted', 'macro')
)

print("Weighted Average F1-Score:", score1)
print("Macro Average F1-Score:", score2)

Weighted Average F1-Score: 0.795195286101736
Macro Average F1-Score: 0.7745307883443774


In [None]:
# Exploring the ten highest-weighted features for each category
def show_top10(classifier, vectorizer, categories):
    """Print the ten highest-weighted vocabulary terms for each category.

    Args:
        classifier: Fitted model exposing per-class feature weights, either as
            ``feature_log_prob_`` (naive Bayes) or ``coef_`` (linear models),
            with one row per entry in ``categories``.
        vectorizer: Fitted vectorizer providing ``get_feature_names_out()``
            (or the older ``get_feature_names()``).
        categories: Category names, ordered like the rows of the weight matrix.

    Raises:
        ValueError: If the number of weight rows differs from the number of
            categories (for example a one-vs-one ``SVC``, whose ``coef_`` holds
            one row per pair of classes rather than one per class).
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new name
    # and fall back so older installations keep working.
    names_getter = getattr(vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    feature_names = np.asarray(names_getter())

    # MultinomialNB lost its coef_ alias in scikit-learn 1.1; the per-class
    # log-probabilities are the same per-class weight table.
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_
    # Models fitted on sparse input may expose a sparse weight matrix, which
    # np.argsort cannot rank; densify it first.
    if hasattr(weights, "toarray"):
        weights = weights.toarray()
    weights = np.asarray(weights)

    categories = list(categories)
    if weights.shape[0] != len(categories):
        raise ValueError(
            "classifier exposes %d weight rows but %d categories were given; "
            "one-vs-one models such as sklearn.svm.SVC have one row per class "
            "pair, so per-category top terms need a one-vs-rest linear model "
            "(e.g. LinearSVC)" % (weights.shape[0], len(categories))
        )

    for i, category in enumerate(categories):
        # argsort is ascending, so the last ten indices are the largest weights.
        top10 = np.argsort(weights[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))

# NOTE(review): clf here is SVC(kernel='linear'); its multiclass coef_ is
# presumably laid out one row per pair of classes rather than one per category,
# so confirm the rows this call reads really correspond to the category names.
show_top10(clf, vectorizer, newsgroups_train_2.new_target_names)

In [None]:
# Testing our model using new external data: four hand-written sentences
# that are not part of the dataset.
docs_new = ['God is love', 'ChatGPT is useful for assignments', 'Vote Jimmy for president', 'I need a trip']
X_new_counts = vectorizer.transform(docs_new)

predicted = clf.predict(X_new_counts)

# Category names are read from newsgroups_train_2, the dataset this section's
# model was trained on, so the cell does not rely on the section-2 dataset
# object still being present in the kernel.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, newsgroups_train_2.new_target_names[category]))

'God is love' => Religion
'ChatGPT is useful for assignments' => Science
'Vote Jimmy for president' => Recreation
'I need a trip' => Politics
