# News prediction

This project uses the 20 Newsgroups dataset from scikit-learn, a dataset commonly used for classification problems. The dataset can be found here: [20 News Groups](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html)

This project demonstrates my ability to use a Naive Bayes model to classify text. The inputs for this model are news articles. The goal is to create a model that accurately classifies news articles into the six categories listed below.




In [1]:
#Import Libraries
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
import string
import re
import nltk
nltk.download('wordnet') # 
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/liamhettinger/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liamhettinger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/liamhettinger/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/liamhettinger/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/liamhettinger/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# Only six of the twenty newsgroup categories are used, to reduce training time.
categories = [
    'alt.atheism', 'comp.windows.x',
    'rec.autos', 'rec.sport.baseball',
    'sci.electronics', 'sci.space',
]

# The same parts of each post are stripped from both splits.
stripped_parts = ('headers', 'footers', 'quotes')

# Load the training and testing splits with identical category and stripping options.
train = fetch_20newsgroups(subset='train', categories=categories, remove=stripped_parts)
test = fetch_20newsgroups(subset='test', categories=categories, remove=stripped_parts)

### Data Exploration

In [3]:
#Exploring Training dataset
train_data = pd.DataFrame({'text' : train['data'], 
                           'category' : train['target']})
train_data.head()

Unnamed: 0,text,category
0,"Benedikt Rosenau writes, with great authority:...",0
1,\nI don't understand this last statement about...,2
2,I'd like to compile X11r5 on a Sony NWS-1750 r...,1
3,"\n\n\nHow do you know it's based on ignorance,...",0
4,\nmuch crap deleted\n\n\nDEAD WRONG! Last tim...,3


In [4]:
#Exploring Testing dataset
test_data = pd.DataFrame({'text' : test['data'], 
                           'category' : test['target']})
test_data.head()

Unnamed: 0,text,category
0,I can see it now emblazened across the evening...,5
1,The color of the board shows the composition o...,4
2,Regarding the feasability of retrieving the HS...,5
3,\nI believe Acker got a ring from his wife whe...,3
4,\n\nThe new Cruisers DO NOT have independent s...,2


In [5]:
# Lookup table pairing each category name with the integer label stored in the
# `category` column (the target variable). The label range is derived from the
# number of names so the table stays valid if the category subset changes.
pd.DataFrame({'Category names': train['target_names'],
              'Category Numbers': range(len(train['target_names']))})

Unnamed: 0,Category names,Category Numbers
0,alt.atheism,0
1,comp.windows.x,1
2,rec.autos,2
3,rec.sport.baseball,3
4,sci.electronics,4
5,sci.space,5


## Stemming, Lemmatizing, removing stop words, lowercase

In [6]:
def process_pos(pos):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The tag is matched on its leading letter: ``J`` maps to an adjective,
    ``V`` to a verb, ``N`` to a noun and ``R`` to an adverb. Any other tag
    falls back to ``wordnet.NOUN``.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN

def slrl(df, column):
    """Normalise the text in ``df[column]`` in place and return ``df``.

    Each entry is tokenised, lower-cased and POS-tagged; English stop words
    are dropped; the remaining tokens are lemmatised using the WordNet tag
    derived from their POS tag, stripped of every character outside
    A-Z/a-z, and kept only if they appear in the NLTK ``words`` corpus.
    The surviving tokens are re-joined with single spaces.

    Args:
        df: DataFrame holding the text column; it is modified in place.
        column: Label of the column whose string entries are cleaned.

    Returns:
        The same ``df`` object, with ``column`` replaced by the cleaned text.
    """
    # Sets give constant-time membership tests; both collections are probed
    # once per token.
    stop_words = set(stopwords.words('english'))
    english_words = set(nltk.corpus.words.words())
    lemmatizer = WordNetLemmatizer()
    non_letters = re.compile(r'[^A-Za-z]')

    def clean(text):
        tokens = [token.lower() for token in word_tokenize(text)]
        # Tag the full token sequence and only then drop stop words, so the
        # tagger still receives them as part of its input.
        lemmas = [lemmatizer.lemmatize(token, pos=process_pos(tag))
                  for token, tag in nltk.pos_tag(tokens)
                  if token not in stop_words]
        stripped = (non_letters.sub('', lemma) for lemma in lemmas)
        return ' '.join(word for word in stripped if word in english_words)

    # Transforming the column as a whole works for any index; label-based
    # access with 0..len(df)-1 is only correct for a default RangeIndex.
    df[column] = df[column].apply(clean)
    return df

In [7]:
train_data = slrl(train_data, 'text')

In [8]:
test_data = slrl(test_data, 'text')

## Training a Naive Bayes model

In [9]:
# Creating training variables
X_train = train_data.text
y_train = train_data.category

In [10]:
# TF-IDF turns each document into a vector of term weights, down-weighting terms
# that occur in many documents so that more distinctive terms receive a higher weight.
tfidf_step = ('vect', TfidfVectorizer())
# Multinomial Naive Bayes classifier trained on the TF-IDF features.
naive_bayes_step = ('clf', MultinomialNB())
pipe = Pipeline(steps=[tfidf_step, naive_bayes_step])

In [11]:
# Hyper-parameter grid for the search (4 * 2 * 4 * 3 * 2 * 5 = 960 combinations).
# Each key is routed to a pipeline step by its "<step>__" prefix: `vect__` keys
# adjust TfidfVectorizer() parameters and `clf__` keys adjust MultinomialNB().
params_dic =  {'vect__max_features' : [1000,2000,5000,10000],  # cap on vocabulary size
               'vect__stop_words' : ['english', None],  # built-in English stop list, or none
               'vect__min_df' : [5,10,20,50],  # minimum document frequency for a term to be kept
               'vect__ngram_range' : [(1,1), (1,2),(1,3)],  # unigrams only, up to bigrams, up to trigrams
               'vect__use_idf' : [True,False],  # toggle inverse-document-frequency reweighting
               'clf__alpha' : [0.0001, 0.001, 0.01,0.1, 1]}  # additive smoothing strength

In [12]:
#Searching for the optimal parameters using cross validation
grid = GridSearchCV(pipe,params_dic,scoring='accuracy',cv=5, n_jobs=-1, verbose=True)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 960 candidates, totalling 4800 fits


In [13]:
#The best mean cross-validated accuracy achieved on the training set during the grid search.
grid.best_score_

0.8442611640478743

In [14]:
#The TfidfVectorizer and Naive Bayes parameters that achieve the best results on the held-out cross-validation folds.
grid.best_params_

{'clf__alpha': 0.1,
 'vect__max_features': 5000,
 'vect__min_df': 5,
 'vect__ngram_range': (1, 3),
 'vect__stop_words': 'english',
 'vect__use_idf': True}

In [15]:
#Creating a Naive Bayes model using the optimal parameters.
best_clf = grid.best_estimator_

In [16]:
#Creating test set to determine the model's accuracy on new data.
X_test = test_data.text
y_test = test_data.category
y_test_pred = best_clf.predict(X_test)

In [17]:
#This model accurately predicts the new article's category 80.03% of the time.
accuracy_score(y_test, y_test_pred)

0.8003487358326068

In [18]:
#Confusion matrix comparing predicted results to expected results.
confusion_matrix(y_test,y_test_pred)

array([[240,   8,  20,  14,   9,  28],
       [  2, 342,  11,  13,  14,  13],
       [ 10,   9, 309,  40,  14,  14],
       [ 13,  10,  12, 348,   4,  10],
       [  9,  34,  29,  17, 286,  18],
       [ 23,  11,  10,  23,  16, 311]])

In [19]:
X_test[(y_test==3) & (y_test_pred==4)]

1014    anyone idea get league regularly u wall wall w...
1077       anyone know site could please send also thanks
1231    s franco hardly use hear completely recover m ...
1252    want ticket anyone know get public sale sell w...
Name: text, dtype: object

In [20]:
#Reminder for target category names.
train['target_names']

['alt.atheism',
 'comp.windows.x',
 'rec.autos',
 'rec.sport.baseball',
 'sci.electronics',
 'sci.space']

In [21]:
#Testing out the model with my own predictions
best_clf.predict(['I hit a home run','Formula 1 cars are really fast'])

array([3, 2])

In [22]:
#Testing out the model with my own predictions
best_clf.predict(['They made it to the moon'])

array([5])

In [23]:
grid.best_estimator_.named_steps.clf.feature_count_[0,:]

array([1.19983751, 0.        , 0.76852621, ..., 0.        , 0.        ,
       0.        ])

In [24]:
# Table of the fitted classifier's per-class feature_count_ values: one row per
# feature name from the fitted vectorizer, one column per newsgroup.
fitted_steps = grid.best_estimator_.named_steps
class_feature_totals = fitted_steps.clf.feature_count_
newsgroup_columns = ['alt.atheism', 'comp.windows.x', 'rec.autos',
                     'rec.sport.baseball', 'sci.electronics', 'sci.space']

table = {'feature': fitted_steps.vect.get_feature_names_out()}
for class_row, newsgroup in enumerate(newsgroup_columns):
    # Row `class_row` of feature_count_ holds the totals for that class label.
    table[newsgroup] = class_feature_totals[class_row, :]

articles = pd.DataFrame(table).set_index('feature')
articles.head()

Unnamed: 0_level_0,alt.atheism,comp.windows.x,rec.autos,rec.sport.baseball,sci.electronics,sci.space
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
aa,1.199838,0.314732,0.0,1.53831,0.437299,0.238578
abbreviation,0.0,0.0,0.054796,0.374678,0.51366,0.0
ability,0.768526,0.685768,0.346969,2.47808,0.532519,0.274483
able,1.682047,3.508683,1.31005,1.760524,3.696567,3.039196
able help,0.0,0.762574,0.0,0.219214,0.732737,0.0


In [25]:
articles['count'] = articles.sum(axis=1)

In [26]:
#Examining high impact words
articles.sort_values(by='count',ascending=False).head(10)

Unnamed: 0_level_0,alt.atheism,comp.windows.x,rec.autos,rec.sport.baseball,sci.electronics,sci.space,count
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
use,5.231209,29.417603,8.524109,4.359885,22.640516,10.17303,80.346352
like,6.732587,11.584822,14.108445,9.732445,12.371945,11.929445,66.45969
know,9.645785,12.821619,10.144534,9.845058,12.628434,10.494716,65.580146
think,14.183065,5.077558,10.54396,16.142158,5.662762,12.635028,64.244531
car,1.407123,0.114664,51.345508,0.0,3.635416,0.312205,56.814914
good,6.967036,4.237756,11.653796,14.314933,10.950726,7.154473,55.278721
say,18.940015,3.111751,8.622091,11.091401,4.861163,7.845505,54.471926
make,11.80725,5.700499,9.343056,9.923603,8.827061,8.615119,54.216588
year,2.872986,1.760699,8.018966,24.833165,3.169736,9.927557,50.583109
look,4.028859,10.330826,10.934333,7.428525,9.916133,5.816313,48.454989
