In [1]:
import os

import pandas as pd
from sklearn.datasets import fetch_20newsgroups

In [18]:
# Cache directory for the downloaded corpus. The absolute path below only exists
# on the original author's machine, so it is used only when it is present;
# otherwise data_home=None lets scikit-learn fall back to its default cache
# location (the SCIKIT_LEARN_DATA environment variable, or ~/scikit_learn_data).
AUTHOR_DATA_HOME = '/home/luffy/Documents/Data_Science/NLP/notebooks/'
data_home = AUTHOR_DATA_HOME if os.path.isdir(AUTHOR_DATA_HOME) else None

# Load the train and test portions of the two science newsgroups, asking
# scikit-learn to strip headers, footers and quoted replies from each post.
post = fetch_20newsgroups(
    subset='all',
    categories=['sci.electronics', 'sci.space'],
    data_home=data_home,
    remove=('headers', 'footers', 'quotes'),
)

In [19]:
# Inspect the raw text of a single post (index 1).
post.data[1]

"AL>>        Question:   Is there a certain device out there that I can\nAL>>                    use to find out the number to the line?\nAL>>        Thanks for any response.\nAL>>                                                    Al\n\nAL>There is a number you can call which will return a synthesized\nAL>voice telling you the number of the line.  Unfortunately, for the\nAL>life of me I can't remember what it is. The telephone technicians\nAL>use it all the time.  We used to play around with this in our\nAL>dorm rooms since there were multiple phone lines running between\nAL>rooms.\n\nIt probably wouldn't help for you to post the number, since it appears\nto be different in each area.  For what it's worth, in the New Orleans\narea the number is 998-877-6655 (easy to remember, what?)\n\n\n * SLMR 2.1 * Ask me anything: if I don't know, I'll make up something.\n                                          "

In [20]:
# List the fields exposed by the fetched dataset object.
post.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [21]:
# Two-column frame: the post body and its newsgroup name. Each integer target
# id is translated to a name by indexing into post.target_names.
label_names = [post.target_names[class_id] for class_id in post.target]
df = pd.DataFrame({'text': post.data, 'label': label_names})
df

Unnamed: 0,text,label
0,\n >\tIf the new Kuiper belt object *is* ...,sci.space
1,AL>> Question: Is there a certain dev...,sci.electronics
2,"\nIt's not quite what you were asking, but a f...",sci.space
3,"\n\n\nNo, the sky does not, at this time, belo...",sci.space
4,"\nDigi-Key also sells Quad Line Receivers, pa...",sci.electronics
...,...,...
1966,\n\n\nThanks again. One final question. The ...,sci.space
1967,\nCheck the station's master sync generator. ...,sci.electronics
1968,Brian Yamauchi asks: [Regarding orbital billbo...,sci.space
1969,\n\nWhy not design the solar arrays to be deta...,sci.space


In [22]:
# (rows, columns) of the assembled frame.
df.shape

(1971, 2)

In [32]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# word_tokenize (used by clean_text) needs the Punkt tokenizer data, which is
# not bundled with NLTK. Newer NLTK releases look for 'punkt_tab', older ones
# for 'punkt'; request both so the notebook runs on a fresh environment.
# nltk.download reports an unknown package id and returns False instead of raising.
nltk.download('punkt')
nltk.download('punkt_tab')

# Resources for stop-word filtering and WordNet lemmatization.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/luffy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/luffy/nltk_data...


True

In [33]:
# clean and pre-process text

# The stop-word set and the lemmatizer are the same for every document, so they
# are created once on first use and then reused, rather than rebuilt per row.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the (stop_words, lemmatizer) pair, building it on the first call."""
    if not _CLEAN_TEXT_RESOURCES:
        # Build both before storing either, so a failed lookup leaves the cache empty.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _CLEAN_TEXT_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalize one raw post into a space-separated string of lemmatized tokens.

    Steps, in order: tokenize with ``word_tokenize``, keep only purely
    alphabetic tokens, lowercase them, drop English stop words, and lemmatize
    what remains with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (an empty string when no
        token survives the filters).
    """
    stop_words, lemmatizer = _clean_text_resources()

    # The alphabetic filter is applied to the original token; lowercasing happens
    # before the stop-word check so that check sees lowercase forms.
    lowered = (token.lower() for token in word_tokenize(text) if token.isalpha())
    lemmas = [lemmatizer.lemmatize(word) for word in lowered if word not in stop_words]

    return " ".join(lemmas)

In [34]:
# Run clean_text on every raw post and store the result in a new 'clean_text' column.
# NOTE: this adds the column to df in place.
df['clean_text'] = df['text'].apply(clean_text)

In [35]:
# Display the frame, which now carries the clean_text column next to the raw text.
df

Unnamed: 0,text,label,clean_text
0,\n >\tIf the new Kuiper belt object *is* ...,sci.space,new kuiper belt object called next one called ...
1,AL>> Question: Is there a certain dev...,sci.electronics,al question certain device al use find number ...
2,"\nIt's not quite what you were asking, but a f...",sci.space,quite asking year ago helped ee remote sensing...
3,"\n\n\nNo, the sky does not, at this time, belo...",sci.space,sky time belong anyone ownership necessary def...
4,"\nDigi-Key also sells Quad Line Receivers, pa...",sci.electronics,also sell quad line receiver part quad line dr...
...,...,...,...
1966,\n\n\nThanks again. One final question. The ...,sci.space,thanks one final question name gehrels known t...
1967,\nCheck the station's master sync generator. ...,sci.electronics,check station master sync generator probably f...
1968,Brian Yamauchi asks: [Regarding orbital billbo...,sci.space,brian yamauchi asks regarding orbital billboar...
1969,\n\nWhy not design the solar arrays to be deta...,sci.space,design solar array detachable shuttle going re...


In [36]:
# Narrow the frame to the cleaned text and its label.
model_columns = ['clean_text', 'label']
clean_data = df[model_columns]
clean_data

Unnamed: 0,clean_text,label
0,new kuiper belt object called next one called ...,sci.space
1,al question certain device al use find number ...,sci.electronics
2,quite asking year ago helped ee remote sensing...,sci.space
3,sky time belong anyone ownership necessary def...,sci.space
4,also sell quad line receiver part quad line dr...,sci.electronics
...,...,...
1966,thanks one final question name gehrels known t...,sci.space
1967,check station master sync generator probably f...,sci.electronics
1968,brian yamauchi asks regarding orbital billboar...,sci.space
1969,design solar array detachable shuttle going re...,sci.space


In [37]:
# Train Test Split

from sklearn.model_selection import train_test_split

# Features are the cleaned text; targets are the newsgroup names.
x, y = df['clean_text'], df['label']

# Default split proportions are used; random_state is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

# Report the size of each split (features and targets side by side).
for features, targets in ((x_train, y_train), (x_test, y_test)):
    print(features.shape, targets.shape)

(1478,) (1478,)
(493,) (493,)


In [42]:
from sklearn.feature_extraction.text import CountVectorizer

# Bigram-only vocabulary, limited to bigrams that occur in at least 10 training documents.
count_vect = CountVectorizer(ngram_range=(2, 2), min_df=10)

# Learn the vocabulary from the training split only, then reuse it for the test split.
x_train_count = count_vect.fit_transform(x_train)
x_test_counts = count_vect.transform(x_test)

In [43]:
# Densify the sparse count matrix and label its columns with the learned bigrams.
bigram_names = count_vect.get_feature_names_out()
counts_df = pd.DataFrame(x_train_count.toarray(), columns=bigram_names)

In [44]:
# Display the bigram count matrix built from the training split.
counts_df

Unnamed: 0,air force,almost certainly,also available,ames dryden,anonymous ftp,answer question,anyone know,anything else,appreciated thanks,available via,...,would appreciated,would go,would greatly,would help,would like,would make,would much,would need,would probably,year ago
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1473,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1474,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1475,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1476,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Drop terms that appear in more than 70% or in fewer than 1% of training documents.
tfidf_vector = TfidfVectorizer(min_df=0.01, max_df=0.7)

# Fit on the training split only; the test split is projected onto the same vocabulary.
tfidf_train = tfidf_vector.fit_transform(x_train)
tfidf_test = tfidf_vector.transform(x_test)

# Dense, column-labelled view of the training matrix for inspection.
vocabulary = tfidf_vector.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_train.toarray(), columns=vocabulary)
tfidf_df

Unnamed: 0,able,ac,acceleration,access,according,across,act,action,active,activity,...,would,write,writing,written,wrong,wrote,year,yes,yet,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.070089,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.163896,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
1474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
1475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.112464,0.0,0.078465,0.104041,0.0,0.0
1476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0


In [49]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training are chained.
nb = MultinomialNB().fit(tfidf_train, y_train)
y_pred = nb.predict(tfidf_test)

# Fraction of test posts whose predicted newsgroup matches the true one.
metrics.accuracy_score(y_test, y_pred)

0.9249492900608519

In [50]:
# Explicit class order, used for both axes of the matrix.
labels = ['sci.electronics', 'sci.space']
# Per scikit-learn's convention, rows are true labels and columns are predicted labels.
cm = metrics.confusion_matrix(y_test, y_pred, labels=labels)

# Wrap the raw array in a labelled frame for display.
cm_df = pd.DataFrame(cm, index=labels, columns=labels)

cm_df

Unnamed: 0,sci.electronics,sci.space
sci.electronics,227,7
sci.space,30,229
