<a href="https://colab.research.google.com/github/Jondoloh/Data-Science-in-practice_STA2546/blob/main/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn import metrics
import warnings
warnings.simplefilter('ignore')

import numpy as np

# 1) Preliminary Analysis of data

#### a) Train and test split

In [None]:
# Load the 'train' and 'test' subsets of the 20 newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# Report how many documents each subset contains.
print(len(newsgroups_train.data))
print(len(newsgroups_test.data))

11314
7532


In [None]:
# Original label ids of each split, exactly as delivered by the dataset loader.
curr_target_list_train, curr_target_list_test = newsgroups_train.target, newsgroups_test.target

# Placeholders; both names are rebound to target_transform() results further
# down in this cell.
new_target_list_train, new_target_list_test = [], []

# Display names of the coarse categories: position i names coarse label id i.
target_names_new = [
    "Religion",
    "Technology",
    "Other",
    "Recreation",
    "Science",
    "Politics",
]

def target_transform(data):
    """Map original newsgroup label ids (0..19) onto six coarse category ids.

    Grouping (coarse id <- original ids):
        0 <- 0, 15
        1 <- 1, 2, 3, 4, 5, 12
        2 <- 6
        3 <- 7, 8, 9, 10
        4 <- 11, 13, 14
        5 <- 16, 17, 18, 19

    NOTE(review): the ids are hard-coded and presumably follow the dataset's
    alphabetical ``target_names`` ordering -- confirm against
    ``newsgroups_train.target_names``. Under that ordering id 19 would be
    ``talk.religion.misc``, which lands in coarse id 5 here; verify that this
    grouping is intended.

    Args:
        data: Iterable of original label ids (list, numpy array, ...).

    Returns:
        list[int]: One coarse id per input label, in input order.

    Raises:
        ValueError: If a label is not one of the 20 known ids. Such labels
            used to be skipped silently, which shortened the result and left
            the labels misaligned with their documents.
    """
    coarse_groups = (
        (0, 15),
        (1, 2, 3, 4, 5, 12),
        (6,),
        (7, 8, 9, 10),
        (11, 13, 14),
        (16, 17, 18, 19),
    )
    # Invert the grouping once so each label is resolved with a single lookup.
    coarse_of = {
        fine: coarse
        for coarse, members in enumerate(coarse_groups)
        for fine in members
    }

    output_list = []
    for position, label in enumerate(data):
        try:
            output_list.append(coarse_of[label])
        except KeyError:
            raise ValueError(
                "unknown newsgroup label %r at position %d" % (label, position)
            ) from None
    return output_list

# Coarse label ids for every document of each split.
new_target_list_test = target_transform(curr_target_list_test)
new_target_list_train = target_transform(curr_target_list_train)

# Store the coarse labels on the dataset objects, next to the original targets.
newsgroups_train["new_target"] = np.array(new_target_list_train)
newsgroups_test["new_target"] = np.array(new_target_list_test)

# Each split gets its own array of coarse category names.
newsgroups_train["new_target_names"] = np.array(target_names_new)
newsgroups_test["new_target_names"] = np.array(target_names_new)

In [None]:
# Vectorize the training documents: fit a TF-IDF vectorizer (with the built-in
# 'english' stop-word setting) on the training text and transform it in one step.
vectorizer = TfidfVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(newsgroups_train.data)
# Displayed as the cell result: the shape of the resulting matrix.
vectors.shape

(11314, 129796)

# 2) First Iteration of the model
### (train based on all data)

## Multinomial NB model

In [None]:
# Encode the test documents with the vectorizer fitted on the training split.
vectors_test = vectorizer.transform(newsgroups_test.data)

# Fit multinomial naive Bayes (alpha=0.01) on the coarse labels and predict
# the test split.
clf = MultinomialNB(alpha=0.01).fit(vectors, newsgroups_train.new_target)
pred = clf.predict(vectors_test)

# F1 on the test split, averaged two ways over the coarse classes.
score1 = metrics.f1_score(
    newsgroups_test.new_target, pred, average='weighted'
)
score2 = metrics.f1_score(
    newsgroups_test.new_target, pred, average='macro'
)

print("Weighted Average F1-Score:", score1)
print("Macro Average F1-Score:", score2)

Weighted Average F1-Score: 0.9061555127259279
Macro Average F1-Score: 0.8770841773714819


In [None]:
def show_top10(classifier, vectorizer, categories):
    """Print the ten highest-weighted vocabulary terms for each category.

    Args:
        classifier: Fitted model exposing per-feature weights as
            ``feature_log_prob_`` (preferred when present) or ``coef_``.
            The weights may be a dense array or a sparse matrix.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``
            or, on older library versions, ``get_feature_names()``.
        categories: Category names; name ``i`` labels weight row ``i``.

    Output:
        One ``"<category>: <terms>"`` line per category, terms ordered from
        lower to higher weight. If the weight matrix does not have exactly one
        row per category, a single notice is printed instead, because the rows
        cannot then be labelled by category.
    """
    # Newer scikit-learn only offers get_feature_names_out(); fall back to the
    # older accessor when that is all the vectorizer provides.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = np.asarray(vectorizer.get_feature_names_out())
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())

    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_
    # np.argsort cannot rank a sparse row (the recorded "00" outputs come from
    # that), so densify before ranking.
    if hasattr(weights, "toarray"):
        weights = weights.toarray()
    weights = np.atleast_2d(np.asarray(weights))

    categories = list(categories)
    if weights.shape[0] != len(categories):
        # E.g. pairwise (one-vs-one) coefficients: row i is not category i.
        print(
            "show_top10: weight matrix has %d rows for %d categories; "
            "rows cannot be labelled by category"
            % (weights.shape[0], len(categories))
        )
        return

    for i, category in enumerate(categories):
        top10 = np.argsort(weights[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))

# Top-weighted terms of the naive Bayes model fitted above, one line per coarse category.
show_top10(clf, vectorizer, newsgroups_train.new_target_names)

Religion: church com christians christian bible keith people jesus edu god
Technology: host use thanks university organization subject lines com windows edu
Other: lines condition distribution university new shipping offer 00 edu sale
Recreation: subject organization game writes team article car ca com edu
Science: article writes nasa chip encryption clipper space key com edu
Politics: israeli government don article gun writes israel people com edu


In [None]:
# Hand-written probe sentences used as a quick sanity check of the classifier.
docs_new = [
    'God is love',
    'ChatGPT is useful for assignments',
    'Vote Jimmy for president',
    'I need a trip',
]

# Encode the probes with the fitted vectorizer, then classify them.
X_new_counts = vectorizer.transform(docs_new)
predicted = clf.predict(X_new_counts)

# Show each sentence next to the name of its predicted coarse category.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, newsgroups_train.new_target_names[category]))

'God is love' => Religion
'ChatGPT is useful for assignments' => Technology
'Vote Jimmy for president' => Politics
'I need a trip' => Recreation


## SVM model

In [None]:
# Alternative estimator kept for reference (disabled):
# clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)

# Fit a linear-kernel SVC on the coarse labels and predict the test split.
clf = SVC(kernel='linear', C=1, random_state=42).fit(
    vectors, newsgroups_train.new_target
)
pred = clf.predict(vectors_test)

# F1 on the test split, averaged two ways over the coarse classes.
score1 = metrics.f1_score(
    newsgroups_test.new_target, pred, average='weighted'
)
score2 = metrics.f1_score(
    newsgroups_test.new_target, pred, average='macro'
)

print("Weighted Average F1-Score:", score1)
print("Macro Average F1-Score:", score2)

Weighted Average F1-Score: 0.912611928711664
Macro Average F1-Score: 0.8992232049992123


In [None]:
def show_top10(classifier, vectorizer, categories):
    """Print the ten highest-weighted vocabulary terms for each category.

    Args:
        classifier: Fitted model exposing per-feature weights as
            ``feature_log_prob_`` (preferred when present) or ``coef_``.
            The weights may be a dense array or a sparse matrix.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``
            or, on older library versions, ``get_feature_names()``.
        categories: Category names; name ``i`` labels weight row ``i``.

    Output:
        One ``"<category>: <terms>"`` line per category, terms ordered from
        lower to higher weight. If the weight matrix does not have exactly one
        row per category, a single notice is printed instead, because the rows
        cannot then be labelled by category.
    """
    # Newer scikit-learn only offers get_feature_names_out(); fall back to the
    # older accessor when that is all the vectorizer provides.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = np.asarray(vectorizer.get_feature_names_out())
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())

    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_
    # np.argsort cannot rank a sparse row (the recorded "00" outputs come from
    # that), so densify before ranking.
    if hasattr(weights, "toarray"):
        weights = weights.toarray()
    weights = np.atleast_2d(np.asarray(weights))

    categories = list(categories)
    if weights.shape[0] != len(categories):
        # E.g. pairwise (one-vs-one) coefficients: row i is not category i.
        print(
            "show_top10: weight matrix has %d rows for %d categories; "
            "rows cannot be labelled by category"
            % (weights.shape[0], len(categories))
        )
        return

    for i, category in enumerate(categories):
        top10 = np.argsort(weights[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))

# Same term report for the linear SVC fitted above.
# NOTE(review): the recorded output shows only "00" for every category, so this
# report does not look meaningful for the SVC -- verify how coef_ is laid out
# for this model before relying on it.
show_top10(clf, vectorizer, newsgroups_train.new_target_names)

Religion: 00
Technology: 00
Other: 00
Recreation: 00
Science: 00
Politics: 00


In [None]:
# Hand-written probe sentences used as a quick sanity check of the classifier.
docs_new = [
    'God is love',
    'ChatGPT is useful for assignments',
    'Vote Jimmy for president',
    'I need a trip',
]

# Encode the probes with the fitted vectorizer, then classify them.
X_new_counts = vectorizer.transform(docs_new)
predicted = clf.predict(X_new_counts)

# Show each sentence next to the name of its predicted coarse category.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, newsgroups_train.new_target_names[category]))

'God is love' => Religion
'ChatGPT is useful for assignments' => Technology
'Vote Jimmy for president' => Politics
'I need a trip' => Technology


# 3) Second Iteration of the model
### (train on data with headers, footers, and quoted replies removed)

In [None]:
# Second copy of both splits, loaded with headers, footers and quotes removed.
newsgroups_train_2 = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)
newsgroups_test_2 = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
)

In [None]:
# Original label ids of each stripped split, as delivered by the dataset loader.
curr_target_list_train_2, curr_target_list_test_2 = newsgroups_train_2.target, newsgroups_test_2.target

# Placeholders; both names are rebound to target_transform() results further
# down in this cell.
new_target_list_train_2, new_target_list_test_2 = [], []

# Display names of the coarse categories: position i names coarse label id i.
target_names_new = [
    "Religion",
    "Technology",
    "Other",
    "Recreation",
    "Science",
    "Politics",
]

def target_transform(data):
    """Map original newsgroup label ids (0..19) onto six coarse category ids.

    Grouping (coarse id <- original ids):
        0 <- 0, 15
        1 <- 1, 2, 3, 4, 5, 12
        2 <- 6
        3 <- 7, 8, 9, 10
        4 <- 11, 13, 14
        5 <- 16, 17, 18, 19

    NOTE(review): the ids are hard-coded and presumably follow the dataset's
    alphabetical ``target_names`` ordering -- confirm against
    ``newsgroups_train_2.target_names``. Under that ordering id 19 would be
    ``talk.religion.misc``, which lands in coarse id 5 here; verify that this
    grouping is intended.

    Args:
        data: Iterable of original label ids (list, numpy array, ...).

    Returns:
        list[int]: One coarse id per input label, in input order.

    Raises:
        ValueError: If a label is not one of the 20 known ids. Such labels
            used to be skipped silently, which shortened the result and left
            the labels misaligned with their documents.
    """
    coarse_groups = (
        (0, 15),
        (1, 2, 3, 4, 5, 12),
        (6,),
        (7, 8, 9, 10),
        (11, 13, 14),
        (16, 17, 18, 19),
    )
    # Invert the grouping once so each label is resolved with a single lookup.
    coarse_of = {
        fine: coarse
        for coarse, members in enumerate(coarse_groups)
        for fine in members
    }

    output_list = []
    for position, label in enumerate(data):
        try:
            output_list.append(coarse_of[label])
        except KeyError:
            raise ValueError(
                "unknown newsgroup label %r at position %d" % (label, position)
            ) from None
    return output_list

# Coarse label ids for every document of each stripped split.
new_target_list_test_2 = target_transform(curr_target_list_test_2)
new_target_list_train_2 = target_transform(curr_target_list_train_2)

# Store the coarse labels on the dataset objects, next to the original targets.
newsgroups_train_2["new_target"] = np.array(new_target_list_train_2)
newsgroups_test_2["new_target"] = np.array(new_target_list_test_2)

# Each split gets its own array of coarse category names.
newsgroups_train_2["new_target_names"] = np.array(target_names_new)
newsgroups_test_2["new_target_names"] = np.array(target_names_new)

In [None]:
# Vectorize the stripped training documents.
# NOTE: this re-fits the same `vectorizer` object used in the first iteration,
# so from here on `vectorizer` carries the vocabulary of the stripped text and
# no longer matches the first-iteration matrices `vectors` / `vectors_test`.
vectors_2 = vectorizer.fit_transform(newsgroups_train_2.data)
# Displayed as the cell result: the shape of the resulting matrix.
vectors_2.shape

(11314, 101322)

## Multinomial NB model

In [None]:
# Encode the stripped test documents with the re-fitted vectorizer.
vectors_test_2 = vectorizer.transform(newsgroups_test_2.data)

# Fit multinomial naive Bayes (alpha=0.01) on the coarse labels and predict
# the stripped test split.
clf = MultinomialNB(alpha=0.01).fit(vectors_2, newsgroups_train_2.new_target)
pred = clf.predict(vectors_test_2)

# F1 on the test split, averaged two ways over the coarse classes.
score1 = metrics.f1_score(
    newsgroups_test_2.new_target, pred, average='weighted'
)
score2 = metrics.f1_score(
    newsgroups_test_2.new_target, pred, average='macro'
)

print("Weighted Average F1-Score:", score1)
print("Macro Average F1-Score:", score2)

Weighted Average F1-Score: 0.8163039273316107
Macro Average F1-Score: 0.7825500811273862


In [None]:
def show_top10(classifier, vectorizer, categories):
    """Print the ten highest-weighted vocabulary terms for each category.

    Args:
        classifier: Fitted model exposing per-feature weights as
            ``feature_log_prob_`` (preferred when present) or ``coef_``.
            The weights may be a dense array or a sparse matrix.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``
            or, on older library versions, ``get_feature_names()``.
        categories: Category names; name ``i`` labels weight row ``i``.

    Output:
        One ``"<category>: <terms>"`` line per category, terms ordered from
        lower to higher weight. If the weight matrix does not have exactly one
        row per category, a single notice is printed instead, because the rows
        cannot then be labelled by category.
    """
    # Newer scikit-learn only offers get_feature_names_out(); fall back to the
    # older accessor when that is all the vectorizer provides.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = np.asarray(vectorizer.get_feature_names_out())
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())

    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_
    # np.argsort cannot rank a sparse row (the recorded "00" outputs come from
    # that), so densify before ranking.
    if hasattr(weights, "toarray"):
        weights = weights.toarray()
    weights = np.atleast_2d(np.asarray(weights))

    categories = list(categories)
    if weights.shape[0] != len(categories):
        # E.g. pairwise (one-vs-one) coefficients: row i is not category i.
        print(
            "show_top10: weight matrix has %d rows for %d categories; "
            "rows cannot be labelled by category"
            % (weights.shape[0], len(categories))
        )
        return

    for i, category in enumerate(categories):
        top10 = np.argsort(weights[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))

# Top-weighted terms of the second-iteration naive Bayes model, one line per coarse category.
show_top10(clf, vectorizer, newsgroups_train_2.new_target_names)

Religion: just say believe church don think bible people jesus god
Technology: file drive problem card like does know use thanks windows
Other: asking email sell price condition new shipping offer 00 sale
Recreation: good think don bike like just year team game car
Science: chip know like just don clipper people encryption space key
Politics: like know did think gun government just israel don people


In [None]:
# Hand-written probe sentences used as a quick sanity check of the classifier.
docs_new = ['God is love', 'ChatGPT is useful for assignments', 'Vote Jimmy for president', 'I need a trip']

# Encode the probes with the vectorizer (re-fitted on the stripped text) and
# classify them with the second-iteration model.
X_new_counts = vectorizer.transform(docs_new)

predicted = clf.predict(X_new_counts)

# Resolve category names through the second-iteration dataset object, like the
# rest of this section, so the cell does not depend on first-iteration state.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, newsgroups_train_2.new_target_names[category]))

'God is love' => Religion
'ChatGPT is useful for assignments' => Science
'Vote Jimmy for president' => Recreation
'I need a trip' => Recreation


## SVM model

In [None]:
# Alternative estimator kept for reference (disabled):
# clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)

# Fit a linear-kernel SVC on the coarse labels and predict the stripped test split.
clf = SVC(kernel='linear', C=1, random_state=42).fit(
    vectors_2, newsgroups_train_2.new_target
)
pred = clf.predict(vectors_test_2)

# F1 on the test split, averaged two ways over the coarse classes.
score1 = metrics.f1_score(
    newsgroups_test_2.new_target, pred, average='weighted'
)
score2 = metrics.f1_score(
    newsgroups_test_2.new_target, pred, average='macro'
)

print("Weighted Average F1-Score:", score1)
print("Macro Average F1-Score:", score2)

Weighted Average F1-Score: 0.795195286101736
Macro Average F1-Score: 0.7745307883443774


In [None]:
def show_top10(classifier, vectorizer, categories):
    """Print the ten highest-weighted vocabulary terms for each category.

    Args:
        classifier: Fitted model exposing per-feature weights as
            ``feature_log_prob_`` (preferred when present) or ``coef_``.
            The weights may be a dense array or a sparse matrix.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``
            or, on older library versions, ``get_feature_names()``.
        categories: Category names; name ``i`` labels weight row ``i``.

    Output:
        One ``"<category>: <terms>"`` line per category, terms ordered from
        lower to higher weight. If the weight matrix does not have exactly one
        row per category, a single notice is printed instead, because the rows
        cannot then be labelled by category.
    """
    # Newer scikit-learn only offers get_feature_names_out(); fall back to the
    # older accessor when that is all the vectorizer provides.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = np.asarray(vectorizer.get_feature_names_out())
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())

    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_
    # np.argsort cannot rank a sparse row (the recorded "00" outputs come from
    # that), so densify before ranking.
    if hasattr(weights, "toarray"):
        weights = weights.toarray()
    weights = np.atleast_2d(np.asarray(weights))

    categories = list(categories)
    if weights.shape[0] != len(categories):
        # E.g. pairwise (one-vs-one) coefficients: row i is not category i.
        print(
            "show_top10: weight matrix has %d rows for %d categories; "
            "rows cannot be labelled by category"
            % (weights.shape[0], len(categories))
        )
        return

    for i, category in enumerate(categories):
        top10 = np.argsort(weights[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))

# Same term report for the second-iteration linear SVC.
# NOTE(review): the recorded output shows only "00" for every category, so this
# report does not look meaningful for the SVC -- verify how coef_ is laid out
# for this model before relying on it.
show_top10(clf, vectorizer, newsgroups_train_2.new_target_names)



Religion: 00
Technology: 00
Other: 00
Recreation: 00
Science: 00
Politics: 00


In [None]:
# Hand-written probe sentences used as a quick sanity check of the classifier.
docs_new = ['God is love', 'ChatGPT is useful for assignments', 'Vote Jimmy for president', 'I need a trip']

# Encode the probes with the vectorizer (re-fitted on the stripped text) and
# classify them with the second-iteration model.
X_new_counts = vectorizer.transform(docs_new)

predicted = clf.predict(X_new_counts)

# Resolve category names through the second-iteration dataset object, like the
# rest of this section, so the cell does not depend on first-iteration state.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, newsgroups_train_2.new_target_names[category]))

'God is love' => Religion
'ChatGPT is useful for assignments' => Technology
'Vote Jimmy for president' => Recreation
'I need a trip' => Recreation
