<a href="https://colab.research.google.com/github/HiveCase/MachineLearningPractice/blob/main/Week8/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# IPython help shortcut: shows the CountVectorizer docstring. Interactive
# exploration aid only; nothing later in the notebook depends on this cell.
?CountVectorizer

In [None]:
# Toy corpus of four short documents used by the vectorizer demos.
# Documents 1 and 4 contain the same words in a different order.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [None]:
# Count vectorizer with all-default settings (the feature names printed
# further down are all lowercase, so the defaults lowercase the text).
vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary from the corpus and encode every document as a row of
# term counts. The result is a sparse matrix; the bare `X` displays its repr.
X = vectorizer.fit_transform(corpus)
X

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 21 stored elements and shape (4, 9)>

In [None]:
# Vocabulary learned during fitting; used below as the column labels of X.
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [None]:
# Densify the sparse matrix so the individual counts can be read directly.
X.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [None]:
# Wrap the dense count matrix in a DataFrame: one row per corpus document,
# one column per vocabulary term.
df = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
df

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0,1,1,1,0,0,1,0,1
1,0,2,0,1,0,1,1,0,1
2,1,0,0,1,1,0,1,1,1
3,0,1,1,1,0,0,1,0,1


Each document in the corpus is represented by the counts of the words it contains, ignoring word order. This is called the `Bag of Words` *(BoW)* representation.

In [None]:
# Same corpus with lowercasing switched off: differently-cased spellings such
# as 'This' and 'this' are now counted as separate features.
v2 = CountVectorizer(lowercase=False)
v2.fit_transform(corpus).toarray()

array([[0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0],
       [0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 0],
       [1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1]])

In [None]:
# Case-sensitive vocabulary: capitalised and lowercase forms are distinct terms.
v2.get_feature_names_out()

array(['And', 'Is', 'This', 'document', 'first', 'is', 'one', 'second',
       'the', 'third', 'this'], dtype=object)

In [None]:
# Tabular view of the case-sensitive counts, labelled with v2's vocabulary.
d2 = pd.DataFrame(
    data=v2.fit_transform(corpus).toarray(),
    columns=v2.get_feature_names_out(),
)
d2

Unnamed: 0,And,Is,This,document,first,is,one,second,the,third,this
0,0,0,1,1,1,1,0,0,1,0,0
1,0,0,1,2,0,1,0,1,1,0,0
2,1,0,0,0,0,1,1,0,1,1,1
3,0,1,0,1,1,0,0,0,1,0,1


### Remove Stopwords

In [None]:
# Count single words again, this time dropping scikit-learn's built-in English
# stop words. Tokens are runs of ASCII letters only.
# In the recorded output only `document` and `second` survive: besides
# `and`/`is`/`the`/`this`, the list also filters `first`, `one` and `third`.
v3 = CountVectorizer(
    stop_words='english',
    token_pattern=r"[a-zA-Z]+",
    lowercase=True,
)
X = v3.fit_transform(corpus)
print(f'Number of features: {len(v3.get_feature_names_out())}')
df = pd.DataFrame(data=X.toarray(), columns=v3.get_feature_names_out())
df


Number of features: 2


Unnamed: 0,document,second
0,1,0
1,2,1
2,0,0
3,1,0


In [None]:
# Switch from single words to bigrams: ngram_range=(2, 2) keeps only two-word
# sequences. Stop-word removal is intentionally not applied in this cell, so
# bigrams such as 'this is' remain in the vocabulary.
v3 = CountVectorizer(
    ngram_range=(2, 2),
    token_pattern=r"[a-zA-Z]+",
    lowercase=True,
)
X = v3.fit_transform(corpus)
print(f'Number of features: {len(v3.get_feature_names_out())}')
df = pd.DataFrame(data=X.toarray(), columns=v3.get_feature_names_out())
df

Number of features: 13


Unnamed: 0,and this,document is,first document,is the,is this,second document,the first,the second,the third,third one,this document,this is,this the
0,0,0,1,1,0,0,1,0,0,0,0,1,0
1,0,1,0,1,0,1,0,1,0,0,1,0,0
2,1,0,0,1,0,0,0,0,1,1,0,1,0
3,0,0,1,0,1,0,1,0,0,0,0,0,1


In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Load two of the 20 Newsgroups categories (train and test subsets combined)
# as a binary text-classification problem.
# NOTE(review): `X` is rebound here from the count matrix used in the toy
# examples to the list of raw post strings.
# NOTE(review): posts are loaded with headers and quoted text intact (visible
# in the sample post displayed below); consider fetch_20newsgroups' `remove`
# option if the classifier should not learn from that metadata.
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'soc.religion.christian'],
)
X = newsgroups.data
y = newsgroups.target
target_names = newsgroups.target_names

In [None]:
# Peek at the first raw post: a single string that still contains the
# message headers and quoted reply text.
X[0]

'From: pages!bwebster@uunet.uu.net (Bruce F. Webster)\nSubject: Re: Mormon beliefs about bastards\nReply-To: pages!bwebster@uunet.uu.net\nOrganization: Pages Software Inc.\nLines: 63\n\nIn article <May.9.05.41.46.1993.27571@athos.rutgers.edu> erh0362@tesla.njit.edu  \nwrites:\n> \n>     Could anyone enlighten me on how the Mormon church views \n> children born out of wedlock?  In particular I\'m interested to know if any \n> stigma is attached to the children as opposed to the parents.  I\'m especially \n> keen to learn if there is or is not any prohibition in the Mormon faith on \n> bastards entering heaven or having their names entered in the big  \ngenealogical \n> book the Mormons keep in Salt Lake City.  If this is an issue on which the \n> "official" position has changed over time, I\'m interested in learning both  \nold \n> and new beliefs.  E-mail or posting is fine.  All information or pointers are \n> appreciated.\n> \n\nWell, since my wife is (in your gentle term) a "bastard

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the posts for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Number of posts in the training split.
len(X_train)

1257

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [None]:
# Two-step text classifier operating directly on raw strings:
#   'tfidf' - TF-IDF features, English stop words removed, vocabulary capped
#             at 5000 terms
#   'log'   - logistic regression with a raised iteration limit and fixed seed
log_pipe = Pipeline(
    steps=[
        (
            'tfidf',
            TfidfVectorizer(lowercase=True, stop_words='english', max_features=5000),
        ),
        (
            'log',
            LogisticRegression(max_iter=1000, random_state=42),
        ),
    ]
)

In [None]:
# Fit the whole pipeline (TF-IDF step, then logistic regression) on the
# training posts only.
log_pipe.fit(X_train,y_train)

In [None]:
# Predict labels for the held-out posts and preview the first ten.
y_pred = log_pipe.predict(X_test)
y_pred[:10]

array([1, 1, 1, 0, 1, 1, 0, 1, 0, 1])

In [None]:
# True labels for the same ten posts, for a side-by-side comparison.
y_test[:10]

array([1, 1, 1, 0, 1, 1, 0, 1, 0, 1])

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision / recall / F1 on the test split, labelled with the
# newsgroup names; print() renders the report text as an aligned table.
print(classification_report(y_test,y_pred,target_names=target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.99      0.94      0.96       235
soc.religion.christian       0.96      0.99      0.97       304

              accuracy                           0.97       539
             macro avg       0.97      0.97      0.97       539
          weighted avg       0.97      0.97      0.97       539

