# **Mounting Google Drive**




In [None]:
# Mount Google Drive at /content/drive; the dataset and the word2vec file used
# later in this notebook are read from paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).



## **Importing the required Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



### **Importing the dataset**


In [None]:
# Load the labelled news excerpts from the mounted Drive folder.
data = pd.read_csv("/content/drive/MyDrive/NN_news_classification/train.csv")
# Change the working directory to MyDrive: the relative filename used when the
# trained model is saved at the end of the notebook resolves against it.
%cd /content/drive/MyDrive/

/content/drive/MyDrive


In [None]:
# Display the loaded frame (pandas elides the middle rows in the rendered output).
data

Unnamed: 0,Title,Excerpt,Category
0,"Uefa Opens Proceedings against Barcelona, Juve...",Uefa has opened disciplinary proceedings again...,sports
1,Amazon Blames Inflation as It Increases Cost o...,The increases are steeper than the 17 percent ...,business
2,Nigeria’s Parliament Passes Amended Electoral ...,Nigeria's Senate on Tuesday passed the harmoni...,politics
3,Nigeria: Lagos Governor Tests Positive for Cov...,"The Lagos State Governor, Mr. Babajide Sanwo-O...",health
4,South Africa Calls For Calm as Electoral Refor...,South Africa has raised concerns about the det...,politics
...,...,...,...
5509,Nigeria’s Central Bank Introduces ‘Naira 4 Dol...,The Central Bank of Nigeria (CBN) has introduc...,business
5510,Super Eagles to Host Leone Stars in AFCON 2023...,Following Tuesday night’s draw for the qualifi...,sports
5511,Frustration in Nigeria Breeding Calls for Sece...,"Ekiti State Governor, Dr. Kayode Fayemi on Tue...",politics
5512,Coca-Cola European Buys Australian Bottler for...,Coca-Cola European Partners Plc has agreed to ...,business


In [None]:
# Per-column summary (count / unique / top / freq) of the text columns.
data.describe()

Unnamed: 0,Title,Excerpt,Category
count,5513,5514,5514
unique,5313,5313,6
top,Merck Covid Pill’s Success Slams Moderna Share...,Apple is letting some iPhone users fix their o...,business
freq,4,3,1492


In [None]:
# Distinct category labels, in order of first appearance in the dataset
data['Category'].unique()

array(['sports', 'business', 'politics', 'health', 'tech',
       'entertainment'], dtype=object)

Create a numeric `CategoryId` column by encoding each `Category` label as an integer.

In [None]:
# Integer-encode the labels; ids are assigned in order of first appearance
data['CategoryId'] = pd.factorize(data['Category'])[0]
data

Unnamed: 0,Title,Excerpt,Category,CategoryId
0,"Uefa Opens Proceedings against Barcelona, Juve...",Uefa has opened disciplinary proceedings again...,sports,0
1,Amazon Blames Inflation as It Increases Cost o...,The increases are steeper than the 17 percent ...,business,1
2,Nigeria’s Parliament Passes Amended Electoral ...,Nigeria's Senate on Tuesday passed the harmoni...,politics,2
3,Nigeria: Lagos Governor Tests Positive for Cov...,"The Lagos State Governor, Mr. Babajide Sanwo-O...",health,3
4,South Africa Calls For Calm as Electoral Refor...,South Africa has raised concerns about the det...,politics,2
...,...,...,...,...
5509,Nigeria’s Central Bank Introduces ‘Naira 4 Dol...,The Central Bank of Nigeria (CBN) has introduc...,business,1
5510,Super Eagles to Host Leone Stars in AFCON 2023...,Following Tuesday night’s draw for the qualifi...,sports,0
5511,Frustration in Nigeria Breeding Calls for Sece...,"Ekiti State Governor, Dr. Kayode Fayemi on Tue...",politics,2
5512,Coca-Cola European Buys Australian Bottler for...,Coca-Cola European Partners Plc has agreed to ...,business,1


In [None]:
# Lookup table holding one row per (Category, CategoryId) pair, ordered by id
category = (
    data[['Category', 'CategoryId']]
    .drop_duplicates()
    .sort_values('CategoryId')
)
category

Unnamed: 0,Category,CategoryId
0,sports,0
1,business,1
2,politics,2
3,health,3
14,tech,4
27,entertainment,5


### **Preprocessing**

In [None]:

def convert_lower(text: str) -> str:
    """Return ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered
# Lower-case every excerpt in place, then show the resulting column
data['Excerpt'] = data.Excerpt.apply(convert_lower)
data.Excerpt

0       uefa has opened disciplinary proceedings again...
1       the increases are steeper than the 17 percent ...
2       nigeria's senate on tuesday passed the harmoni...
3       the lagos state governor, mr. babajide sanwo-o...
4       south africa has raised concerns about the det...
                              ...                        
5509    the central bank of nigeria (cbn) has introduc...
5510    following tuesday night’s draw for the qualifi...
5511    ekiti state governor, dr. kayode fayemi on tue...
5512    coca-cola european partners plc has agreed to ...
5513    central bank of nigeria governor godwin emefie...
Name: Excerpt, Length: 5514, dtype: object

In [None]:
#removal of stopwords
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def remove_stopwords(text):
    """Tokenise ``text`` and drop the English stop words.

    Parameters
    ----------
    text : str
        Raw text to tokenise with ``nltk.tokenize.word_tokenize``.

    Returns
    -------
    list of str
        The tokens of ``text``, in order, excluding any token that appears
        in NLTK's English stop-word list. Matching is an exact string
        comparison, so punctuation tokens are kept.
    """
    # The stop-word set is loop-invariant: build it once instead of re-reading
    # the corpus for every row this function is applied to.
    stop_words = _english_stop_words()
    return [token for token in nltk.tokenize.word_tokenize(text) if token not in stop_words]
# Replace each excerpt by its list of non-stop-word tokens, then show the column
data['Excerpt'] = data.Excerpt.apply(remove_stopwords)
data.Excerpt

0       [uefa, opened, disciplinary, proceedings, barc...
1       [increases, steeper, 17, percent, jump, prime,...
2       [nigeria, 's, senate, tuesday, passed, harmoni...
3       [lagos, state, governor, ,, mr., babajide, san...
4       [south, africa, raised, concerns, deterioratin...
                              ...                        
5509    [central, bank, nigeria, (, cbn, ), introduced...
5510    [following, tuesday, night, ’, draw, qualifica...
5511    [ekiti, state, governor, ,, dr., kayode, fayem...
5512    [coca-cola, european, partners, plc, agreed, b...
5513    [central, bank, nigeria, governor, godwin, eme...
Name: Excerpt, Length: 5514, dtype: object

In [None]:
#lemmatisation
def lemmatize_word(text):
    """Lemmatise every token of ``text`` and join them into one string.

    ``text`` is iterated token by token (here: the token list produced by
    ``remove_stopwords``); each item is passed through ``WordNetLemmatizer``
    and the results are joined with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)
# Lemmatise the token lists and collapse them back into space-separated strings
data['Excerpt'] = data.Excerpt.apply(lemmatize_word)
data.Excerpt

0       uefa opened disciplinary proceeding barcelona ...
1       increase steeper 17 percent jump prime members...
2       nigeria 's senate tuesday passed harmonised cl...
3       lagos state governor , mr. babajide sanwo-olu ...
4       south africa raised concern deteriorating situ...
                              ...                        
5509    central bank nigeria ( cbn ) introduced scheme...
5510    following tuesday night ’ draw qualification s...
5511    ekiti state governor , dr. kayode fayemi tuesd...
5512    coca-cola european partner plc agreed buy aust...
5513    central bank nigeria governor godwin emefiele ...
Name: Excerpt, Length: 5514, dtype: object

In [None]:
# Display the frame again; Excerpt now holds the output of the preprocessing cells above.
data

Unnamed: 0,Title,Excerpt,Category,CategoryId
0,"Uefa Opens Proceedings against Barcelona, Juve...",uefa opened disciplinary proceeding barcelona ...,sports,0
1,Amazon Blames Inflation as It Increases Cost o...,increase steeper 17 percent jump prime members...,business,1
2,Nigeria’s Parliament Passes Amended Electoral ...,nigeria 's senate tuesday passed harmonised cl...,politics,2
3,Nigeria: Lagos Governor Tests Positive for Cov...,"lagos state governor , mr. babajide sanwo-olu ...",health,3
4,South Africa Calls For Calm as Electoral Refor...,south africa raised concern deteriorating situ...,politics,2
...,...,...,...,...
5509,Nigeria’s Central Bank Introduces ‘Naira 4 Dol...,central bank nigeria ( cbn ) introduced scheme...,business,1
5510,Super Eagles to Host Leone Stars in AFCON 2023...,following tuesday night ’ draw qualification s...,sports,0
5511,Frustration in Nigeria Breeding Calls for Sece...,"ekiti state governor , dr. kayode fayemi tuesd...",politics,2
5512,Coca-Cola European Buys Australian Bottler for...,coca-cola european partner plc agreed buy aust...,business,1


## **Vectorisation**


In [None]:
# importing word2vec model
!pip install gensim

import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/NN_news_classification/word2vec.bin', binary=True)



In [None]:
def vectorize(sentence, keyed_vectors=None):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    sentence : str
        Whitespace-separated tokens.
    keyed_vectors : optional
        Embedding lookup supporting ``word in kv``, ``kv[word]`` and a
        ``vector_size`` attribute. Defaults to the notebook-level ``model``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``vector_size``: the element-wise mean of the
        vectors of all tokens found in the vocabulary, or an all-zero vector
        when no token is found (including the empty sentence).
    """
    vectors = model if keyed_vectors is None else keyed_vectors
    known_vecs = [vectors[word] for word in sentence.split() if word in vectors]
    if not known_vecs:
        # Take the width from the embeddings instead of hard-coding 300 so the
        # fallback always matches the dimensionality of the loaded vectors.
        return np.zeros(vectors.vector_size)
    return np.mean(known_vecs, axis=0)

# Hold out 20% of the excerpts for evaluation (shuffled, fixed seed).
# NOTE(review): data.describe() reports fewer unique excerpts than rows, so
# identical excerpts may end up on both sides of this split - confirm whether
# de-duplicating before splitting is wanted.
x_train, x_test, y_train, y_test = train_test_split(
    data.Excerpt,
    data.CategoryId.values,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

# Replace every excerpt by its averaged word-embedding vector
x_train = np.array(list(map(vectorize, x_train)))
x_test = np.array(list(map(vectorize, x_test)))
print(len(x_train))
print(len(x_test))
x_train.shape


4411
1103


(4411, 300)

In [None]:
x_train

array([[ 0.02284071, -0.06662326, -0.02860514, ..., -0.04641385,
         0.01364475,  0.09695096],
       [ 0.02451735,  0.03609173, -0.00405297, ..., -0.05698806,
         0.02405255, -0.05134993],
       [ 0.00927179,  0.09931859,  0.04918324, ..., -0.00556252,
        -0.05638539, -0.09510387],
       ...,
       [ 0.05554754,  0.03423518,  0.03890298, ..., -0.01435991,
         0.0254364 ,  0.12384033],
       [-0.04876709,  0.08238729,  0.06835429, ..., -0.11121622,
        -0.0442454 , -0.00053914],
       [-0.09554037,  0.03334147, -0.00325776, ...,  0.03605957,
         0.04350815, -0.06340739]])

### **Model Training**


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid explored exhaustively by the search (3 * 3 * 3 * 3 = 81 combinations)
param_grid = {
    'n_estimators': [50, 100, 200],           # Number of trees in the forest
    'max_depth': [None, 10, 20],               # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],           # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]              # Minimum samples required to be at a leaf node
}

# RandomForestClassifier base estimator.
# It is not fitted here: GridSearchCV clones and fits the estimator itself, so a
# separate fit beforehand would only cost time and be discarded.
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Use GridSearchCV for hyperparameter tuning (5-fold CV, all available cores;
# n_jobs only affects wall-clock time, not the selected parameters)
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

# Get the best parameters and model from the grid search
# (best_estimator_ is refitted on the whole of x_train by the default refit=True)
best_params = grid_search.best_params_
classifier = grid_search.best_estimator_

print("Best Parameters:", best_params)


# Predict using the trained classifier
predictions = classifier.predict(x_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

# Display detailed classification report.
# unique() lists the labels in order of first appearance, the same order
# factorize() used for CategoryId, so position i is the name of class id i.
# Passing `labels` explicitly keeps names and ids aligned even if a class is
# missing from the test split.
class_names = data['Category'].unique()
report = classification_report(
    y_test,
    predictions,
    labels=np.arange(len(class_names)),
    target_names=class_names,
)
print("Classification Report:\n", report)

Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Accuracy: 0.80
Classification Report:
                precision    recall  f1-score   support

       sports       0.90      0.94      0.92       279
     business       0.69      0.85      0.76       279
     politics       0.85      0.87      0.86       266
       health       0.91      0.67      0.77       129
         tech       0.19      0.12      0.15        59
entertainment       0.87      0.64      0.73        91

     accuracy                           0.80      1103
    macro avg       0.74      0.68      0.70      1103
 weighted avg       0.80      0.80      0.79      1103



## **Testing**


In [None]:
#testing for an example

sample_text = '''The world's third largest economy saw its Gross Domestic Product (GDP) rise by an annualised 6% in the period.It is about twice the rate of growth forecast by economists and marks the biggest rise in almost three years.
The fall in the value of the yen helped exporters as Japanese-made goods became cheaper for consumers around the world.'''

#preprocessing: exactly the same steps, in the same order, as the training excerpts
sample_clean = lemmatize_word(remove_stopwords(convert_lower(sample_text)))

#vectorisation: the classifier expects a 2-D array, so wrap the single vector in a batch of one
y_pred1 = np.array([vectorize(sample_clean)])

#predicting the output
yy = classifier.predict(y_pred1)

# Map the predicted id back to its label through the `category` lookup table
# instead of a hand-written if/elif chain: the mapping cannot drift from the
# encoding, and an unexpected id raises a KeyError rather than leaving
# `result` undefined.
id_to_category = dict(zip(category['CategoryId'].tolist(), category['Category'].tolist()))
result = id_to_category[int(yy[0])]
print(result)

business


### **Saving model to Drive**


In [20]:
import joblib

# Save the trained model to a file
# The filename is relative, so the file is written to the current working
# directory (changed to /content/drive/MyDrive by the `%cd` in the
# dataset-loading cell).
model_filename = 'random_forest_classifier_mdl.pkl'
# `classifier` is the best estimator selected by the grid search.
joblib.dump(classifier, model_filename)
print(f"Trained model saved as {model_filename}")


Trained model saved as random_forest_classifier_mdl.pkl
