# Text Analytics
#  Feature Extraction: Bag of Words

In [1]:
import pandas as pd
import numpy as np

import sklearn.feature_extraction.text as sk_txt
from sklearn.datasets import fetch_20newsgroups

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report

In [2]:
data = fetch_20newsgroups()
type(data)

sklearn.utils._bunch.Bunch

In [3]:
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

# Data Pre Processing

# Clean Your Text Data

In [4]:
# Instantiate an object cv by calling a method named as CountVectorzer()
cv = sk_txt.CountVectorizer(stop_words = 'english')

# Train the dataset by calling a fit_transform() method
X_fin = cv.fit_transform(data.data).toarray()

# Display the rows and colums
X_fin.shape

(11314, 129790)

In [5]:
# Instantiate an object model by calling a method MultinomialNB()
model = MultinomialNB()

In [6]:
# Split the dataset into training and testing parts
X_train, X_test, y_train, y_test = train_test_split(X_fin, data.target, test_size = 0.3)

In [7]:
# Train the model by calling a method fit()
model.fit(X_train,y_train)

In [8]:
# Call predict() method
y_pred = model.predict(X_test)

In [9]:
# Instantiate a the classification report
report_cv = classification_report(y_test, y_pred)

# Display the values of the report
print(report_cv)

              precision    recall  f1-score   support

           0       0.90      0.91      0.91       133
           1       0.69      0.86      0.77       185
           2       0.94      0.28      0.43       167
           3       0.64      0.82      0.72       177
           4       0.84      0.81      0.83       161
           5       0.76      0.88      0.82       180
           6       0.89      0.73      0.80       195
           7       0.89      0.91      0.90       186
           8       0.94      0.93      0.94       175
           9       0.94      0.93      0.94       180
          10       0.94      0.96      0.95       181
          11       0.95      0.94      0.94       187
          12       0.87      0.84      0.85       171
          13       0.93      0.96      0.94       175
          14       0.86      0.98      0.92       176
          15       0.85      0.93      0.89       184
          16       0.88      0.97      0.92       157
          17       0.92    

## TF-IDF

In [4]:
# Create an object 'tfidf' by calling a method TfidfVectorizer()
tfidf = sk_txt.TfidfVectorizer(stop_words = 'english', dtype=np.float32)

# Train the dataset by calling a method fit_tranform() 
X_tfidf = tfidf.fit_transform(data.data).toarray()

In [5]:
# Split the dataset into training and testing parts
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, data.target, test_size = 0.3)

In [7]:
# Instantiate an object model by calling a method MultinomialNB()
model_tfidf = MultinomialNB()

In [9]:
# Train the model by calling a method fit()
model_tfidf.fit(X_train, y_train)

In [10]:
# Call predict() method
y_pred = model_tfidf.predict(X_test)

In [11]:
# Instantiate a the classification report
report_tfidf = classification_report(y_test, y_pred)

# Display the values of the report
print(report_tfidf)

              precision    recall  f1-score   support

           0       0.90      0.84      0.87       141
           1       0.89      0.83      0.86       181
           2       0.85      0.84      0.84       179
           3       0.75      0.84      0.79       179
           4       0.96      0.80      0.87       188
           5       0.90      0.91      0.91       164
           6       0.86      0.75      0.80       171
           7       0.89      0.88      0.88       177
           8       0.88      0.98      0.93       168
           9       0.90      0.96      0.93       170
          10       0.89      0.97      0.93       185
          11       0.86      0.96      0.91       183
          12       0.90      0.77      0.83       177
          13       0.98      0.92      0.95       194
          14       0.88      0.97      0.92       183
          15       0.64      0.98      0.78       179
          16       0.91      0.97      0.94       176
          17       0.91    

## Reference: 
<p>https://medium.com/swlh/tweet-sentiment-analysis-using-python-for-complete-beginners-4aeb4456040</p>