<a href="https://colab.research.google.com/github/Jondoloh/Data-Science-in-practice_STA2546/blob/main/Data_science_in_practice_group_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Loading the Libraries and dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns

## 2. Loading the 20 newsgroups dataset

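The 20 Newsgroups dataset is a collection of approximately 20,000 newsgroup posts partitioned nearly evenly across 20 different categories. It was originally collected by Ken Lang, probably for his paper "Newsweeder: Learning to filter netnews." The version loaded below is the "bydate" variant (see the file paths in the output), which contains 18,846 documents.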

In [None]:
# Load the 20 Newsgroups dataset.
# subset='all' requests the train and test portions together in one object;
# the split used for modelling is made later in this notebook.
newsgroups = fetch_20newsgroups(subset='all')

In [None]:
# List the fields available on the loaded dataset object (it exposes a
# dict-like .keys()).
newsgroups.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [None]:
# Number of documents in the loaded corpus.
len(newsgroups.data)

18846

In [None]:
# Peek at the first two file paths; each path ends with the newsgroup
# directory followed by the document's file name.
newsgroups.filenames[:2]

array(['/root/scikit_learn_data/20news_home/20news-bydate-test/rec.sport.hockey/54367',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.ibm.pc.hardware/60215'],
      dtype='<U86')

In [None]:
# Number of file paths; expected to match len(newsgroups.data).
len(newsgroups.filenames)

18846

In [None]:
# Explore the 20 categories (the newsgroup names used as class labels).
newsgroups.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [None]:
# Confirm the number of categories.
len(newsgroups.target_names)

20

In [None]:
# target: for each document, the index of its category name in the
# target_names list.
newsgroups.target

array([10,  3, 17, ...,  3,  1,  7])

In [None]:
# Show the newsgroup name behind each of the first ten target indices.
for category_idx in newsgroups.target[:10]:
    print(newsgroups.target_names[category_idx])

rec.sport.hockey
comp.sys.ibm.pc.hardware
talk.politics.mideast
comp.sys.ibm.pc.hardware
comp.sys.mac.hardware
sci.electronics
comp.sys.mac.hardware
rec.sport.hockey
rec.sport.hockey
talk.religion.misc


## 3. Text Processing

In [None]:
# Preprocess the text data: turn each raw document into a TF-IDF feature
# vector using the vectorizer's default settings.
# NOTE(review): fit_transform here sees the whole corpus, so the vocabulary
# and IDF weights are learned from documents that later land in the test split.
vectorizer = TfidfVectorizer()
text_data = vectorizer.fit_transform(newsgroups.data)

## 4. Splitting the dataset into training and test sets

In [None]:
# Split the raw documents (not the already-vectorized matrix) into training
# and testing sets, so that the TF-IDF vocabulary and IDF weights can be
# learned from the training documents alone. Fitting the vectorizer on the
# full corpus lets test-set statistics leak into the features and makes the
# reported test score optimistic.
train_docs, test_docs, train_target, test_target = train_test_split(
    newsgroups.data, newsgroups.target, test_size=0.2, random_state=42
)

# Re-fit the vectorizer on the training documents only, then reuse that
# fitted vocabulary/IDF to transform the test documents.
train_data = vectorizer.fit_transform(train_docs)
test_data = vectorizer.transform(test_docs)

## 5. Training a classifier

In [None]:
# Configure a multi-class SVM classifier (linear kernel, C=1, fixed
# random_state). This only constructs the estimator; it is fitted in the
# next cell.
clf = SVC(kernel='linear', C=1, random_state=42)

In [None]:
# Fit the SVM on the training split.
clf.fit(train_data, train_target)

SVC(C=1, kernel='linear', random_state=42)

## 6. Evaluating model performance

In [None]:
# Evaluate the model on the testing data: predict a class index for every
# document in the test split.
predictions = clf.predict(test_data)

In [None]:
# Predicted class indices (positions in newsgroups.target_names).
predictions

array([ 9, 12, 14, ...,  0, 19, 14])

In [None]:
# Accuracy: fraction of test documents whose predicted class matches the
# true class.
test_accuracy = accuracy_score(test_target, predictions)
print("Accuracy: ", test_accuracy)

Accuracy:  0.9124668435013262


The linear SVM achieves 91.2% accuracy on the held-out test set (20% of the documents).

In [None]:
# #confusion matrix
# print("Confusion Matrix: \n", confusion_matrix(test_target, predictions))

In [None]:
# Confusion matrix of true vs. predicted classes on the test split
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=test_target, y_pred=predictions)

In [None]:
# Wrap the confusion matrix in a DataFrame whose rows (true class) and
# columns (predicted class) are labelled with the newsgroup names.
df_cm = pd.DataFrame(
    cm,
    index=newsgroups.target_names,
    columns=newsgroups.target_names,
)

In [None]:
# Display the labelled confusion matrix (rows: true class, columns:
# predicted class).
df_cm

Unnamed: 0,alt.atheism,comp.graphics,comp.os.ms-windows.misc,comp.sys.ibm.pc.hardware,comp.sys.mac.hardware,comp.windows.x,misc.forsale,rec.autos,rec.motorcycles,rec.sport.baseball,rec.sport.hockey,sci.crypt,sci.electronics,sci.med,sci.space,soc.religion.christian,talk.politics.guns,talk.politics.mideast,talk.politics.misc,talk.religion.misc
alt.atheism,137,0,0,0,0,0,0,1,0,0,0,0,0,1,2,0,0,0,1,9
comp.graphics,0,176,6,7,1,8,2,0,0,1,0,0,0,1,0,0,0,0,0,0
comp.os.ms-windows.misc,0,8,163,13,1,9,1,0,0,0,0,0,0,0,0,0,0,0,0,0
comp.sys.ibm.pc.hardware,0,9,7,149,6,2,6,0,0,0,0,0,4,0,0,0,0,0,0,0
comp.sys.mac.hardware,0,5,3,9,176,1,7,0,0,0,0,0,3,0,0,0,1,0,0,0
comp.windows.x,0,16,1,4,1,189,1,1,0,0,1,0,0,0,1,0,0,0,0,0
misc.forsale,0,0,2,13,2,0,164,4,0,0,1,0,6,0,1,0,0,0,0,0
rec.autos,0,1,0,0,0,0,2,188,1,0,0,0,3,1,0,0,0,0,0,0
rec.motorcycles,0,0,0,0,1,0,4,2,159,0,0,0,1,0,0,0,1,0,0,0
rec.sport.baseball,0,0,0,0,0,0,1,0,0,206,3,0,0,0,0,0,0,0,0,1


In [None]:
# print("Confusion Matrix: \n",sns.heatmap(confusion_matrix(test_target, predictions), annot=True,fmt=".0f"))

In [None]:
# Per-class precision, recall and F1 on the test split.
# target_names labels each row with its newsgroup name instead of a bare
# class index, consistent with the labelled confusion matrix. labels pins the
# row order to the indices of target_names, so names and classes stay aligned
# even if some class were absent from the test split.
print(
    "Classification Report: \n",
    classification_report(
        test_target,
        predictions,
        labels=list(range(len(newsgroups.target_names))),
        target_names=newsgroups.target_names,
    ),
)

Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.91      0.92       151
           1       0.77      0.87      0.82       202
           2       0.87      0.84      0.85       195
           3       0.73      0.81      0.77       183
           4       0.93      0.86      0.89       205
           5       0.88      0.88      0.88       215
           6       0.81      0.85      0.83       193
           7       0.92      0.96      0.94       196
           8       0.98      0.95      0.96       168
           9       0.97      0.98      0.97       211
          10       0.97      0.95      0.96       198
          11       0.99      0.95      0.97       201
          12       0.89      0.87      0.88       202
          13       0.94      0.94      0.94       194
          14       0.97      0.97      0.97       189
          15       0.97      0.98      0.97       202
          16       0.96      0.95      0.95       188
  