<a href="https://colab.research.google.com/github/Jimbrown234/GBC_MLP1/blob/github_task/news_articles_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [4]:
import numpy as np
import pandas as pd


from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import time
import re
import pickle
from string import punctuation
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier

# Loading Dataset

In [17]:
df = pd.read_csv('News.csv')
df.head()

Unnamed: 0,headline,category,authors,short_description
0,There Were 2 Mass Shootings In Texas Last Wee...,CRIME,Melissa Jeltsen,She left her husband. He killed their childre...
1,Will Smith Joins Diplo And Nicky Jam For The ...,ENTERTAINMENT,Andy McDonald,Of course it has a song.
2,Hugh Grant Marries For The First Time At Age ...,ENTERTAINMENT,Ron Dicker,The actor and his longtime girlfriend Anna Eb...
3,Jim Carrey Blasts 'Castrato' Adam Schiff And ...,ENTERTAINMENT,Ron Dicker,The actor gives Dems an ass-kicking for not f...
4,Julianna Margulies Uses Donald Trump Poop Bag...,ENTERTAINMENT,Ron Dicker,The Dietland actress said using the bags is a...


Let's check is their any Nan values which we should take care of.

In [19]:
df['headline']

0          There Were 2 Mass Shootings In Texas Last Wee...
1          Will Smith Joins Diplo And Nicky Jam For The ...
2          Hugh Grant Marries For The First Time At Age ...
3          Jim Carrey Blasts 'Castrato' Adam Schiff And ...
4          Julianna Margulies Uses Donald Trump Poop Bag...
                                ...                        
200848     RIM CEO Thorsten Heins' 'Significant' Plans F...
200849     Maria Sharapova Stunned By Victoria Azarenka ...
200850     Giants Over Patriots Jets Over Colts Among  M...
200851                                 Aldon Smith Arrested
200852     Dwight Howard Rips Teammates After Magic Loss...
Name: headline, Length: 200853, dtype: object

In [7]:
df.isna().sum()

headline             0
category             0
authors              2
short_description    0
dtype: int64

Our dataset have no empty values.

In [8]:
# top 5 Categories of news in our datset
df.category.value_counts()[:5]

 POLITICS            32739
 WELLNESS            17827
 ENTERTAINMENT       16058
 TRAVEL               9887
 STYLE & BEAUTY       9649
Name: category, dtype: int64

In [20]:
df['headline']

0          There Were 2 Mass Shootings In Texas Last Wee...
1          Will Smith Joins Diplo And Nicky Jam For The ...
2          Hugh Grant Marries For The First Time At Age ...
3          Jim Carrey Blasts 'Castrato' Adam Schiff And ...
4          Julianna Margulies Uses Donald Trump Poop Bag...
                                ...                        
200848     RIM CEO Thorsten Heins' 'Significant' Plans F...
200849     Maria Sharapova Stunned By Victoria Azarenka ...
200850     Giants Over Patriots Jets Over Colts Among  M...
200851                                 Aldon Smith Arrested
200852     Dwight Howard Rips Teammates After Magic Loss...
Name: headline, Length: 200853, dtype: object

In [21]:
df['text'] = df['headline'] + df['short_description'] + df['authors']
df['label'] = df['category']
del df['category']

In [None]:
df.head(10)

Unnamed: 0,text,label
0,There Were 2 Mass Shootings In Texas Last Week...,CRIME
1,Will Smith Joins Diplo And Nicky Jam For The 2...,ENTERTAINMENT
2,Hugh Grant Marries For The First Time At Age 5...,ENTERTAINMENT
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,ENTERTAINMENT
4,Julianna Margulies Uses Donald Trump Poop Bags...,ENTERTAINMENT
5,Morgan Freeman 'Devastated' That Sexual Harass...,ENTERTAINMENT
6,Donald Trump Is Lovin' New McDonald's Jingle I...,ENTERTAINMENT
7,What To Watch On Amazon Prime That’s New This ...,ENTERTAINMENT
8,Mike Myers Reveals He'd 'Like To' Do A Fourth ...,ENTERTAINMENT
9,What To Watch On Hulu That’s New This WeekYou'...,ENTERTAINMENT


Let's check total number of words in our dataset.

pandas.core.series.Series

In [27]:
# df['text'].apply(lambda x: len(x.split(' '))).sum()

In [28]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Preprocessing the Data

 Next will go through Stopwords and Lemmatization steps to narrow down our dataset by removing words that our not relevant.

In [30]:
import nltk

In [32]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [33]:
REMOVE_SPECIAL_CHARACTER = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS = re.compile('[^0-9a-z #+_]')

STOPWORDS = set(stopwords.words('english'))
punctuation = list(punctuation)
STOPWORDS.update(punctuation)

lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # part 1
    text = text.lower() # lowering text
    text = REMOVE_SPECIAL_CHARACTER.sub('', text) # replace REPLACE_BY_SPACE symbols by space in text
    text = BAD_SYMBOLS.sub('', text) # delete symbols which are in BAD_SYMBOLS from text
    
    # part 2
    clean_text = []
    for w in word_tokenize(text):
        if w.lower() not in STOPWORDS:
            pos = pos_tag([w])
            new_w = lemmatizer.lemmatize(w, pos=get_simple_pos(pos[0][1]))
            clean_text.append(new_w)
    text = " ".join(clean_text)
    
    return text

In [35]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [37]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [39]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [41]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   headline           200853 non-null  object
 1   authors            200851 non-null  object
 2   short_description  200853 non-null  object
 3   text               200851 non-null  object
 4   label              200853 non-null  object
dtypes: object(5)
memory usage: 7.7+ MB


In [46]:
df['text'] = df['text'].astype('str')

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   headline           200853 non-null  object
 1   authors            200851 non-null  object
 2   short_description  200853 non-null  object
 3   text               200853 non-null  object
 4   label              200853 non-null  object
dtypes: object(5)
memory usage: 7.7+ MB


In [48]:
df['text'] = df['text'].apply(clean_text)

In [57]:
df.to_csv('news_text_cleaned.csv', columns=['text', 'label'])

In [49]:
df.head(10)

Unnamed: 0,headline,authors,short_description,text,label
0,There Were 2 Mass Shootings In Texas Last Wee...,Melissa Jeltsen,She left her husband. He killed their childre...,2 mass shooting texas last week 1 tv author le...,CRIME
1,Will Smith Joins Diplo And Nicky Jam For The ...,Andy McDonald,Of course it has a song.,smith join diplo nicky jam 2018 world cup offi...,ENTERTAINMENT
2,Hugh Grant Marries For The First Time At Age ...,Ron Dicker,The actor and his longtime girlfriend Anna Eb...,hugh grant marries first time age 57 author ac...,ENTERTAINMENT
3,Jim Carrey Blasts 'Castrato' Adam Schiff And ...,Ron Dicker,The actor gives Dems an ass-kicking for not f...,jim carrey blast castrato adam schiff democrat...,ENTERTAINMENT
4,Julianna Margulies Uses Donald Trump Poop Bag...,Ron Dicker,The Dietland actress said using the bags is a...,julianna margulies us donald trump poop bag pi...,ENTERTAINMENT
5,Morgan Freeman 'Devastated' That Sexual Haras...,Ron Dicker,It is not right to equate horrific incidents ...,morgan freeman devastate sexual harassment cla...,ENTERTAINMENT
6,Donald Trump Is Lovin' New McDonald's Jingle ...,Ron Dicker,It's catchy all right.,donald trump lovin new mcdonalds jingle tonigh...,ENTERTAINMENT
7,What To Watch On Amazon Prime That\u2019s New...,Todd Van Luling,There's a great mini-series joining this week.,watch amazon prime thatu2019s new week author ...,ENTERTAINMENT
8,Mike Myers Reveals He'd 'Like To' Do A Fourth...,Andy McDonald,Myer's kids may be pushing for a new Powers f...,mike myers reveals hed like fourth austin powe...,ENTERTAINMENT
9,What To Watch On Hulu That\u2019s New This We...,Todd Van Luling,You're getting a recent Academy Award-winning...,watch hulu thatu2019s new week author youre ge...,ENTERTAINMENT


Let's check how many words are remaining after cleaning the data.

In [None]:
df['text'].apply(lambda x: len(x.split(' '))).sum()

4035112

# Splitting Data

In [50]:
X=df.text
y=df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((150639,), (50214,), (150639,), (50214,))

# Comparing Models

In [52]:
# Creating Models
models = [('Logistic Regression', LogisticRegression(max_iter=500)),('Random Forest', RandomForestClassifier()),
          ('Linear SVC', LinearSVC()), ('Multinomial NaiveBayes', MultinomialNB()), ('SGD Classifier', SGDClassifier())]

best_accuracy = 0
best_model = None
names = []
results = []
model = []

for name, clf in models:
    pipe = Pipeline([('vect', CountVectorizer(max_features=30000, ngram_range=(1, 2))),
                    ('tfidf', TfidfTransformer()),
                    (name, clf),
                    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    if accuracy > best_accuracy:
      best_accuracy = accuracy
    
    names.append(name)
    results.append(accuracy)
    model.append(pipe)
    
    msg = "%s: %f" % (name, accuracy)
    print(msg)

Logistic Regression: 0.700283
Random Forest: 0.636396
Linear SVC: 0.712829
Multinomial NaiveBayes: 0.576293
SGD Classifier: 0.665432


# Saving Model

In [53]:
# Logistic Regression
filename = 'model_lr.sav'
pickle.dump(model[0], open(filename, 'wb'))

# Linear SVC
filename = 'model_lin_svc.sav'
pickle.dump(model[2], open(filename, 'wb'))

# Loading and Testing

In [54]:
lr_model = pickle.load(open('model_lin_svc.sav', 'rb'))

In [56]:
text1 = 'Could have been best all-rounder India ever produced in ODIs: Irfan Pathan'
text2 = "Ashwin Kkumar dances to Kamal Haasan's Annathe song on a treadmill. Actor is proud"
print(model[0].predict([text1,text2]))

[' TRAVEL  ' ' ENTERTAINMENT  ']


In [55]:
text1 = 'Could have been best all-rounder India ever produced in ODIs: Irfan Pathan'
text2 = "Ashwin Kkumar dances to Kamal Haasan's Annathe song on a treadmill. Actor is proud"
print(lr_model.predict([text1, text2]))

[' TRAVEL  ' ' ENTERTAINMENT  ']
