## News Classifier using NLP

### Importing packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from gensim import utils
import gensim.parsing.preprocessing as gsp
%matplotlib inline

: 

### Loading the Dataset

In [None]:
# Read the BBC text CSV into a DataFrame and preview its first rows.
dataset = pd.read_csv('Datasets/bbc-text.csv')
dataset.head()

: 

In [None]:
# Bar chart showing how many rows of the dataset belong to each category.

plt.figure(figsize=(12, 5))
sns.countplot(x=dataset['category'])
plt.ylabel('Class Counts', fontsize=16)
plt.title('BBC text class distribution', fontsize=16)
# Kept last so the cell still displays the x-label object as its output.
plt.xlabel('Class Label', fontsize=16)

: 

So, the business and sport categories have the highest counts.

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
dataset.info()

: 

In [None]:
# Show the 'text' value of the row whose index label is 1.
dataset['text'][1]

: 

In [None]:
# Preprocessing callables from gensim; clean_text applies them in this order.
filters = [
    gsp.strip_tags,
    gsp.strip_punctuation,
    gsp.strip_multiple_whitespaces,
    gsp.strip_numeric,
    gsp.remove_stopwords,
    gsp.strip_short,
    gsp.stem_text,
]


def clean_text(s):
    """Lower-case ``s``, convert it to unicode and run it through ``filters``.

    Each entry of ``filters`` receives the output of the previous one; the
    result of the last filter is returned.
    """
    cleaned = utils.to_unicode(s.lower())
    for step in filters:
        cleaned = step(cleaned)
    return cleaned

: 

In [None]:
# Value at row position 2, column position 1 (presumably the 'text' column — verify against the CSV layout).
dataset.iloc[2,1]

: 

In [None]:
# Run the same entry (row position 2, column position 1) through clean_text.

clean_text(dataset.iloc[2,1])

: 

In [None]:
#!pip install wordcloud
from wordcloud import WordCloud

: 

In [None]:
# Display the most frequently occurring words of a document as a word cloud.
def plot_word_cloud(text):
    """Generate a word cloud from ``text`` and show it on an 8x8 matplotlib figure."""
    cloud_options = dict(
        width=800,
        height=800,
        background_color='black',
        stopwords=None,
        min_font_size=10,
    )
    cloud = WordCloud(**cloud_options).generate(text)

    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud)
    plt.axis("off")  # hide the axes around the image
    plt.tight_layout(pad=0)
    plt.show()

: 

In [None]:
# Word cloud of the cleaned entry at row position 2, column position 1.
plot_word_cloud(clean_text(dataset.iloc[2,1]))

: 

In [None]:
# Display the most frequently occurring words of the entire corpus.
# Every article is cleaned once and the pieces are joined in a single pass;
# growing a string with `texts = texts + ...` inside a loop re-copies the
# accumulated text on every iteration.
texts = ' '.join(clean_text(article) for article in dataset['text'])

plot_word_cloud(texts)

: 

In [None]:
# Display the most frequently occurring words of one category as a word cloud.
def plot_word_cloud_for_category(bbc_text_df, category):
    """Plot a word cloud built from every article of ``category``.

    Parameters
    ----------
    bbc_text_df : DataFrame with a 'category' column and a 'text' column.
    category : label to select; it is converted with ``str`` before being
        compared with the 'category' column and before being printed.
    """
    # Use the string form everywhere so a non-string label does not break
    # the `.title()` call in the banner below.
    label = str(category)
    category_texts = bbc_text_df.loc[bbc_text_df['category'] == label, 'text']
    # Single join instead of repeated string concatenation in a loop.
    texts = ' '.join(clean_text(article) for article in category_texts)
    print(f'************{label.title()} Category********************')
    plot_word_cloud(texts)

: 

In [None]:
# One word cloud per distinct category. `unique()` returns the labels in
# order of first appearance, so the plots come out in the same order on every
# run; iterating over a set of strings gives no such guarantee.
for cat in dataset["category"].unique():
    plot_word_cloud_for_category(dataset, cat)

: 

In [None]:
# Features are the 'text' column as stored in the dataset (no cleaning applied here);
# targets are the 'category' column.
X = dataset['text']
y = dataset['category']

: 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator

class Text2TFIDFTransformer(BaseEstimator):
    """Clean documents with ``clean_text`` and convert them to TF-IDF vectors.

    The same cleaning is applied in ``fit`` and in ``transform`` so that the
    tokens looked up at transform time match the vocabulary learned at fit
    time.
    """

    def __init__(self):
        self._model = TfidfVectorizer()

    @staticmethod
    def _clean(docs):
        """Return a list holding ``clean_text(doc)`` for every item of ``docs``."""
        return [clean_text(doc) for doc in docs]

    def fit(self, df_x, y=None):
        """Fit the wrapped ``TfidfVectorizer`` on the cleaned documents.

        Parameters
        ----------
        df_x : iterable of str (a pandas Series, list or array of documents).
        y : ignored; accepted so the signature matches the estimator API.

        Returns
        -------
        self
        """
        self._model.fit(self._clean(df_x))
        return self

    def transform(self, df_x):
        """Clean ``df_x`` exactly as in ``fit`` and return its TF-IDF matrix."""
        return self._model.transform(self._clean(df_x))

: 

In [None]:
# Fit the transformer on all article texts, then vectorize those same texts.
tfidf_transformers = Text2TFIDFTransformer()
tfidf_transformers.fit(X)
tfidf_vectors = tfidf_transformers.transform(X)


: 

In [None]:
# Dimensions of the TF-IDF matrix produced above.
tfidf_vectors.shape

: 

In [None]:
# Print the TF-IDF matrix for inspection.
print(tfidf_vectors)

: 

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the distinct category values.
distinct_categories = sorted(set(dataset['category']))
le = LabelEncoder()
le.fit(distinct_categories)

: 

In [None]:
# Classes held by the fitted encoder, shown as a plain list.
list(le.classes_)

: 

In [None]:
# Integer codes the encoder assigns to the distinct category values
# (the values are taken from a set, so their order here is not fixed).
le.transform(list(set(list(dataset['category']))))

: 

In [None]:
# Overwrite the 'category' column with the encoder's integer codes.
dataset['category'] = le.transform(dataset['category'])

: 

In [None]:
# Inspect the Python type of one entry of the 'text' column.
type(dataset['text'].values[1])

: 

### Splitting the data into train and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
# Texts are cast to unicode strings and the encoded categories to int32.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['text'].values.astype('U'),
    dataset['category'].values.astype('int32'),
    test_size=0.10,
    random_state=0,
)
# Distinct values currently stored in the 'category' column.
classes = dataset['category'].unique()

: 

In [None]:
# Show the distinct category values collected above.
classes

: 

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

: 

In [None]:
# Look at one training document.
X_train[1]

: 

In [None]:
# Word-level TF-IDF over unigrams and bigrams, fitted on the training texts only.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    max_features=50000,
    max_df=0.5,
    use_idf=True,
    norm='l2',
)
counts = vectorizer.fit_transform(X_train)
# Vocabulary learned by the fitted vectorizer.
vocab = vectorizer.vocabulary_

: 

In [None]:
# Inspect the vocabulary learned by the vectorizer.
vocab

: 

In [None]:
# Train an SGD classifier with an elastic-net penalty on the TF-IDF training features.
targets = y_train
classifier = SGDClassifier(penalty='elasticnet', alpha=1e-05, max_iter=50)
# `classfier` (sic) receives whatever `fit` returns; the misspelt name is kept
# so that existing references to it keep working.
classfier = classifier.fit(counts, targets)

# Vectorize the held-out texts with the already-fitted vectorizer and predict their labels.
examples_counts = vectorizer.transform(X_test)
predictors = classifier.predict(examples_counts)

: 

In [None]:
import pickle

# Persist the trained classifier and the vectorizer vocabulary.
# `with` closes each file as soon as its dump finishes, so the data is flushed
# to disk and no file handle is left open.
# NOTE: `vocab` is only `vectorizer.vocabulary_`; the rest of the fitted
# vectorizer state is not saved by this cell.
with open("news_classifier.pkl", "wb") as model_file:
    pickle.dump(classifier, model_file)
with open("vocab_news_classifier.pkl", "wb") as vocab_file:
    pickle.dump(vocab, vocab_file)

: 

In [None]:
ls

: 

In [None]:
# 5-fold cross-validation on the training split. Cross-validating on
# `examples_counts`/`y_test` would refit clones of the estimator on fragments
# of the small held-out set, which measures neither the trained model nor a
# model trained on a realistic amount of data.
# NOTE: `counts` was vectorized with a vectorizer fitted on all of X_train, so
# the folds share the same TF-IDF statistics.
scores = cross_val_score(classifier, counts, y_train, cv=5)

: 

In [None]:
# Average the per-fold scores and print the mean rounded to three decimals.
score = scores.mean()
print(round(score,3))

: 

In [None]:
# Reload the pickled classifier and vocabulary. `with` closes the files once
# loading is done instead of leaving the handles open.
# NOTE: unpickling executes code stored in the file — only load pickles that
# were produced by a trusted source (here: this notebook).
with open("news_classifier.pkl", 'rb') as model_file:
    loaded_model = pickle.load(model_file)
with open("vocab_news_classifier.pkl", 'rb') as vocab_file:
    loaded_vocab = pickle.load(vocab_file)

: 

In [None]:
# Feed the text in the same form the vectorizer was trained on: it was fitted
# on the raw `dataset['text']` values, so passing the entry through
# `clean_text` first (stop-word removal, stemming, ...) would yield tokens that
# are missing from the learned vocabulary.
test = dataset.iloc[2, 1]
examples = [test]

: 

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

: 

In [None]:
# Rebuild a vectorizer around the reloaded vocabulary and classify `examples`
# with the reloaded model.
# NOTE(review): `fit_transform` fits this new vectorizer on `examples` alone,
# whereas the classifier was trained on features from a vectorizer fitted on
# X_train — the IDF weighting is therefore presumably not the same as at
# training time. Persisting the fitted vectorizer and calling `transform`
# would avoid this — TODO confirm and fix.
count_vect = TfidfVectorizer(analyzer='word',ngram_range=(1,2), max_features=50000,max_df=0.5,use_idf=True, norm='l2',vocabulary=loaded_vocab)
# NOTE(review): `tfidf_transformer` is not used again in the visible cells.
tfidf_transformer = TfidfTransformer()
x_count = count_vect.fit_transform(examples)
predicted = loaded_model.predict(x_count)
# Encoded label predicted for the single example; displayed as the cell output.
result_category = predicted[0]
result_category

: 

In [None]:
# Map the predicted code back to its category name. Uses the actual
# prediction rather than a hard-coded code, so the output stays correct when
# the predicted class changes.
le.inverse_transform([result_category])

: 

In [None]:
newTest = """Chinese smartphone major Xiaomi on Thursday said its arm MI India will distribute 2,500 handsets worth Rs 2 crore to support online education of children who are worst affected by the COVID-19 pandemic in the country. "MI India distribution and retails came up with a fantastic idea. Together they will be contributing 2,500 smartphones to enable children most impacted by the pandemic to access quality education. In this endeavour we found the right partner Teach for India who is committed to the cause," its chief operating officer Muralikrishnan B said in an online video.

The company's managing director Manu Jain said MI India is committed to digital India initiative and has always supported education for all.

The company shared that the smartphones pledged are worth Rs 2 crore. Jain appealed to people to bring in notice of the company about anyone who needs a smartphone for online education and the company will try to help them.
"""

: 

In [None]:
# Classify the new article. The text is passed in raw form because the
# training vectorizer was fitted on raw article text; running it through
# `clean_text` first would produce tokens absent from the vocabulary.
test = newTest
examples = [test]
# NOTE(review): this vectorizer is re-fitted on `examples` alone, so its IDF
# weighting presumably differs from training time; persisting the fitted
# vectorizer and calling `transform` would avoid that — TODO confirm.
count_vect = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    max_features=50000,
    max_df=0.5,
    use_idf=True,
    norm='l2',
    vocabulary=loaded_vocab,
)
x_count = count_vect.fit_transform(examples)
predicted = loaded_model.predict(x_count)
# Encoded label predicted for the single example; displayed as the cell output.
result_category = predicted[0]
result_category

: 

In [None]:
# Convert the predicted code back to its category name and print it.
final_pred = le.inverse_transform([result_category])
print(final_pred)

: 

: 