## Loading And Installing The Required Libraries 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
! pip install wordcloud
from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!




## Loading the dataset

In [2]:
df=pd.read_csv('final_news_list.csv')
df.head()
              

Unnamed: 0.1,Unnamed: 0,title,content,category
0,0,\n ‘Back to their worst’: Manchester Unit...,['\n It’s one step forward and two steps ...,sports
1,4,\n Rafael Nadal makes comeback from injur...,['\nRafael Nadal made his long-awaited injury ...,sports
2,5,\n This is what happened in sport during ...,['\n 2023 has been a year unlike any othe...,sports
3,6,\n Rafael Nadal plays down title expectat...,['\nRafa Nadal said it is still “impossible” f...,sports
4,7,\n Arsenal’s Premier League title hopes d...,['\nArsenal missed out on the chance to regain...,sports


In [3]:
a=len(df)
print('Original dataframe length',a)
df.drop_duplicates(inplace=True)
b=len(df)
print('Dataframe length after dropping duplicate rows',b)
print('No of duplicate rows removed',a-b)

Original dataframe length 1256
Dataframe length after dropping duplicate rows 1256
No of duplicate rows removed 0


In [4]:
df.isnull().sum()

Unnamed: 0    0
title         0
content       0
category      0
dtype: int64

### Checking whether the category column' classes are balanced or not

In [5]:
df['category'].value_counts()

category
sports       314
politics     314
economics    314
climate      314
Name: count, dtype: int64

In [6]:
df=df[['title','content','category']]

In [7]:
df.iloc[[0]]

Unnamed: 0,title,content,category
0,\n ‘Back to their worst’: Manchester Unit...,['\n It’s one step forward and two steps ...,sports


In [8]:
df['title'][0]

'\n      ‘Back to their worst’: Manchester United’s misery continues following dismal defeat to Nottingham Forest\n    '

In [9]:
df['content'][0]

"['\\n      It’s one step forward and two steps back for Manchester United this season as Erik ten Hag’s side followed up its thrilling comeback win over Aston Villa with another dismal defeat, this time at the hands of relegation-threatened Nottingham Forest.\\n  ', '\\n      It was yet more hapless defending – a defining feature of United’s season so far – that gifted Argentine international Nicolás Domínguez acres of room in the penalty to steer Forest into the lead after 64 minutes.\\n  ', '\\n      Marcus Rashford drew United level 12 minutes from time after Forest goalkeeper Matt Turner’s error, but Ten Hag’s frail defense was breached once again just four minutes later, as Morgan Gibbs-White finished off a slick counterattack to secure a 2-1 win and three crucial points in the battle against relegation.\\n  ', '\\n      It was United’s second match since British billionaire Jim Ratcliffe completed a deal to buy a 25% stake in the club\\xa0and if United’s Boxing Day comeback agai

In [10]:
df.describe(include='object')

Unnamed: 0,title,content,category
count,1256,1256,1256
unique,1252,1252,4
top,\n Saudi Arabia is trying to disrupt socc...,['\n When Saudi Arabian club Al-Hilal rep...,sports
freq,2,2,314


### Removing the newline tags from the content column

In [11]:
#First, we remove all tags which are present in our given dataset.

s=[]
for i in df['content']:
        x=(i[1:-1].replace('\\n      ','').replace('\\n  ','').replace('\\n',''))
        s.append(x)
    
print(len(s))
df['content']=s

1256


### Removing special_characters from the content column

In [12]:
def special_char(text):
  reviews = ''
  for x in text:
    if x.isalnum():
      reviews = reviews + x
    else:
      reviews = reviews + ' '
  return reviews
df['content'] = df['content'].apply(special_char)


### Converting the alphabets to lowercase in the content column

In [13]:
def convert_lower(text):
   return text.lower()
df['content'] = df['content'].apply(convert_lower)

In [14]:
sample=df['content'][0]

### Removing the stopwords from the content column

In [15]:
def remove_stopwords(text):
  stop_words = set(stopwords.words('english'))
  words = word_tokenize(text)
  return [x for x in words if x not in stop_words]
df['content'] = df['content'].apply(remove_stopwords)

In [16]:
df['content'][0]

['one',
 'step',
 'forward',
 'two',
 'steps',
 'back',
 'manchester',
 'united',
 'season',
 'erik',
 'ten',
 'hag',
 'side',
 'followed',
 'thrilling',
 'comeback',
 'win',
 'aston',
 'villa',
 'another',
 'dismal',
 'defeat',
 'time',
 'hands',
 'relegation',
 'threatened',
 'nottingham',
 'forest',
 'yet',
 'hapless',
 'defending',
 'defining',
 'feature',
 'united',
 'season',
 'far',
 'gifted',
 'argentine',
 'international',
 'nicolás',
 'domínguez',
 'acres',
 'room',
 'penalty',
 'steer',
 'forest',
 'lead',
 '64',
 'minutes',
 'marcus',
 'rashford',
 'drew',
 'united',
 'level',
 '12',
 'minutes',
 'time',
 'forest',
 'goalkeeper',
 'matt',
 'turner',
 'error',
 'ten',
 'hag',
 'frail',
 'defense',
 'breached',
 'four',
 'minutes',
 'later',
 'morgan',
 'gibbs',
 'white',
 'finished',
 'slick',
 'counterattack',
 'secure',
 '2',
 '1',
 'win',
 'three',
 'crucial',
 'points',
 'battle',
 'relegation',
 'united',
 'second',
 'match',
 'since',
 'british',
 'billionaire',
 'jim'

### Lemmatizing the content column

In [17]:

def lemmatize_word(text):
  wordnet = WordNetLemmatizer()
  return " ".join([wordnet.lemmatize(word) for word in text])
df['content'] = df['content'].apply(lemmatize_word)

In [18]:
X = df['content']
Y = df['category']

In [19]:
X[0]

'one step forward two step back manchester united season erik ten hag side followed thrilling comeback win aston villa another dismal defeat time hand relegation threatened nottingham forest yet hapless defending defining feature united season far gifted argentine international nicolás domínguez acre room penalty steer forest lead 64 minute marcus rashford drew united level 12 minute time forest goalkeeper matt turner error ten hag frail defense breached four minute later morgan gibbs white finished slick counterattack secure 2 1 win three crucial point battle relegation united second match since british billionaire jim ratcliffe completed deal buy 25 stake club xa0and united boxing day comeback villa provided optimism going forward saturday defeat quickly laid bare enormity task ahead david brailsford british cycling coach director sport ratcliffe ineos company xa0was attendance city ground alongside legendary united manager alex ferguson men surely left sorely disappointed united los

In [20]:
x = np.array(df.content.values)

#mapping categories to numerical values
y = np.array(df.category.map({'sports':1,'politics':2,'economics':3,'climate':4}).values)

cv = CountVectorizer(max_features = 5000)
x = cv.fit_transform(df.content).toarray()

print("X.shape = ",x.shape)
print("y.shape = ",y.shape)

X.shape =  (1256, 5000)
y.shape =  (1256,)


In [21]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0, shuffle = True)
print(len(x_train))
print(len(x_test))

1004
252


In [22]:
#Create Empty List
#create list of model and accuracy dicts
perform_list = [ ]
#Create, Fit and Predict all ML Model
def run_model(model_name, est_c, est_pnlty):

    mdl=''
    
    if model_name == 'Logistic Regression':
    
        mdl = LogisticRegression()
    
    elif model_name == 'Random Forest':
    
        mdl = RandomForestClassifier(n_estimators=100 ,criterion='entropy' , random_state=0)
    
    elif model_name == 'Multinomial Naive Bayes':
    
        mdl = MultinomialNB(alpha=1.0,fit_prior=True)
    
    elif model_name == 'Support Vector Classifer':
    
        mdl = SVC()
    
    elif model_name == 'Decision Tree Classifier':
    
        mdl = DecisionTreeClassifier()
    
    elif model_name == 'K Nearest Neighbour':
    
        mdl = KNeighborsClassifier(n_neighbors=10 , metric= 'minkowski' , p = 4)
    
    elif model_name == 'Gaussian Naive Bayes':
    
        mdl = GaussianNB()
    
    oneVsRest = OneVsRestClassifier(mdl)
    
    oneVsRest.fit(x_train, y_train)
    
    y_pred = oneVsRest.predict(x_test)
    
    # Performance metrics
    
    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
    
    # Get precision, recall, f1 scores
    
    precision, recall, f1score, support = score(y_test, y_pred, average='micro')
    
    print(f'Test Accuracy Score of Basic {model_name}: % {accuracy}')
    
    print(f'Precision : {precision}')
    
    print(f'Recall : {recall}')
    
    print(f'F1-score : {f1score}')

    print(confusion_matrix(y_test,y_pred))
    
    # Add performance parameters to list
    
    perform_list.append(dict([
    
    ('Model', model_name),
    
    ('Test Accuracy', round(accuracy, 2)),
    
    ('Precision', round(precision, 2)),
    
    ('Recall', round(recall, 2)),
    
    ('F1', round(f1score, 2))
    
    ]))
#Logistic Regression
run_model('Logistic Regression', est_c=None, est_pnlty=None)
#Logistic Regression | Text Classification of News Articles
print()
#Random Forest
run_model('Random Forest', est_c=None, est_pnlty=None)
#Random Forest 
print()
#Multinomial Naive Bayes
run_model('Multinomial Naive Bayes', est_c=None, est_pnlty=None)
#Multinomial Naive Bayes
print()
#Support Vector Machine
run_model('Support Vector Classifer', est_c=None, est_pnlty=None)
#Support Vector Machine | Text Classification of News Articles
print()
#Decision Tree
run_model('Decision Tree Classifier', est_c=None, est_pnlty=None)
#Decision Tree
 
# print()
# #KNN
# run_model('K Nearest Neighbour', est_c=None, est_pnlty=None)
#KNN | Text Classification of News Articles
 
print()
# Gaussian Naive Bayes
run_model('Gaussian Naive Bayes', est_c=None, est_pnlty=None)
#Gaussian Naive Bayes | Text Classification for Articles 
 

Test Accuracy Score of Basic Logistic Regression: % 94.44
Precision : 0.9444444444444444
Recall : 0.9444444444444444
F1-score : 0.9444444444444444
[[50  0  2  1]
 [ 0 65  3  1]
 [ 1  2 53  2]
 [ 1  1  0 70]]

Test Accuracy Score of Basic Random Forest: % 95.24
Precision : 0.9523809523809523
Recall : 0.9523809523809523
F1-score : 0.9523809523809523
[[51  0  1  1]
 [ 1 65  2  1]
 [ 1  1 55  1]
 [ 1  2  0 69]]

Test Accuracy Score of Basic Multinomial Naive Bayes: % 92.86
Precision : 0.9285714285714286
Recall : 0.9285714285714286
F1-score : 0.9285714285714286
[[49  1  2  1]
 [ 0 64  4  1]
 [ 1  4 52  1]
 [ 1  2  0 69]]

Test Accuracy Score of Basic Support Vector Classifer: % 93.25
Precision : 0.9325396825396826
Recall : 0.9325396825396826
F1-score : 0.9325396825396827
[[50  1  1  1]
 [ 0 65  3  1]
 [ 1  6 50  1]
 [ 1  1  0 70]]

Test Accuracy Score of Basic Decision Tree Classifier: % 87.3
Precision : 0.873015873015873
Recall : 0.873015873015873
F1-score : 0.8730158730158731
[[46  1  1  

In [23]:
X.iloc[600]



In [24]:
#Create Dataframe of Model, Accuracy, Precision, Recall, and F1
model_performance = pd.DataFrame(data=perform_list)
model_performance = model_performance[['Model', 'Test Accuracy', 'Precision', 'Recall', 'F1']]
#modael_performance
#Dataframe of Model | Text Classification for News Articles 
#Best Model to Perform Accuracy Score
#Here, after training and testing the model we find that Random Forest Classifier model has given the best accuracy from all machine learning models.

model = model_performance["Model"]
max_value = model_performance["Test Accuracy"].max()
print("The best accuracy of model is", max_value,"from Random")
#Random Forests
#Fit & predict best ML Model
#Here we fit and predict our best model i.e. Random Forest.

classifier = RandomForestClassifier(n_estimators=100 ,criterion='entropy' , random_state=0).fit(x_train, y_train)
#classifier
y_pred = classifier.predict(x_test)
#Predict News Article
#Now, here, after the completion of model analysis, we can also predict any news articles.

y_pred1 = cv.transform([X.iloc[1000]])
yy = classifier.predict(y_pred1)
print(yy)
result = ""
if yy == [1]:
    result = "Sports News"
elif yy == [2]:
    result = "Politics News"
elif yy == [3]:
    result = "Economics News"
elif yy == [4]:
    result = "Climate News"

print(result)

The best accuracy of model is 95.24 from Random
[4]
Climate News


In [25]:
Y.iloc[1000]

'climate'

### Saving the random forest model using the pickle file

In [26]:
import pickle
data={'count_vectorizer':cv,
      'model':classifier}
with open('nlp_news_saved_steps_classifier.pkl','wb') as file:
    pickle.dump(data,file)

### Reloading the saved model and checking for predictions

In [27]:
with open('nlp_news_saved_steps_classifier.pkl','rb') as file:
    data=pickle.load(file)

In [28]:
count_vectorizer=data['count_vectorizer']
classifier_loaded=data['model']

In [36]:
z=input()

 The Securities and Exchange Commission will decide by next spring on a rule to make public companies disclose how much they generate in greenhouse gases and how climate change could hurt their businesses.  The rule, which comes amid the Biden administration’s efforts to tackle climate change, has been met with backlash from business leaders and lawmakers who argue that it oversteps the SEC’s mission to safeguard investors and regulate markets.  “Congress created the SEC to carry out the mission of protecting investors, maintaining fair, orderly, and efficient markets, and facilitating capital formation—not to advance progressive climate policies,” a group of Republican lawmakers wrote in a letter to the agency earlier this year.


In [37]:
remove_newline=(z[1:-1].replace('\\n      ','').replace('\\n  ','').replace('\\n',''))


In [38]:
def special_char(text):
  reviews = ''
  for x in text:
    if x.isalnum():
      reviews = reviews + x
    else:
      reviews = reviews + ' '
  return reviews

remove_special_chars=special_char(remove_newline)

In [39]:
def convert_lower(text):
   return text.lower()

remove_lowercase=convert_lower(remove_special_chars)

In [40]:
def remove_stopwords(text):
  stop_words = set(stopwords.words('english'))
  words = word_tokenize(text)
  return [x for x in words if x not in stop_words]
    
remove_stpwds=remove_stopwords(remove_lowercase)

In [41]:

def lemmatize_word(text):
  wordnet = WordNetLemmatizer()
  return " ".join([wordnet.lemmatize(word) for word in text])
lemmatize=lemmatize_word(remove_stpwds)

In [42]:

y_pred1 = count_vectorizer.transform([lemmatize])
yy = classifier.predict(y_pred1)
print(yy)
result = ""
if yy == [1]:
    result = "Sports News"
elif yy == [2]:
    result = "Politics News"
elif yy == [3]:
    result = "Economics News"
elif yy == [4]:
    result = "Climate News"
print(result)

[4]
Climate News
