In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [2]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
df = pd.read_csv('english_news_dataset.csv')

In [4]:
def lastpart(x):
    x=x[1:-1]
    x=x.split(',')
    return x[-1].replace("'","")
df["News Categories"]=df["News Categories"].apply(lastpart)

In [5]:
le = LabelEncoder()
df['Encoded Categories'] = le.fit_transform(df['News Categories'])

In [6]:
# Show the mapping between the category names and their encoded values
category_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("\nCategory to Number Mapping:")
for category, encoded in category_mapping.items():
    print(f"{category}: {encoded}")


Category to Number Mapping:
 Asia_Cup_2023: 0
 Asian_Games_2022: 1
 Australian_Open_2024: 2
 Coronavirus: 3
 Health___Fitness: 4
 Hockey_World_Cup_2023: 5
 Israel-Hamas_War: 6
 Lifestyle: 7
 ODI_World_Cup_2023: 8
 Russia-Ukraine_Conflict: 9
 business: 10
 education: 11
 entertainment: 12
 facts: 13
 fashion: 14
 hatke: 15
 miscellaneous: 16
 national: 17
 policy: 18
 science: 19
 sports: 20
 startup: 21
 technology: 22
 travel: 23
 world: 24
Health___Fitness: 25
automobile: 26
business: 27
education: 28
entertainment: 29
fashion: 30
miscellaneous: 31
national: 32
politics: 33
science: 34
sports: 35
startup: 36
technology: 37
travel: 38
world: 39


In [7]:
df['Combined_column'] = df['Headline'] + ' ' + df['Content']

In [8]:
stop_words = set(stopwords.words('english'))

In [9]:
df_sampled = df.sample(10000)

# Tdifvectoriser

In [10]:
# Initialize SpaCy for lemmatization
nlp = spacy.load("en_core_web_sm")

# Function to clean and preprocess text
def preprocess_text(text):
    # 1. Text Cleaning: Remove non-alphabetic characters and unnecessary symbols
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only alphabetic characters and spaces

    # 2. Tokenization: Split text into words (tokens)
    tokens = nltk.word_tokenize(text.lower())  # Convert to lowercase and tokenize

    # 3. Remove Stop Words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # 4. Lemmatization (using NLTK's WordNetLemmatizer or SpaCy)
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(lemmatized_tokens)

df_sampled['Combined_column']=df_sampled['Combined_column'].apply(preprocess_text)

In [11]:
# 5. Vectorization: Convert the processed text into numerical form using TF-IDF
Tdvectorizer = TfidfVectorizer().fit(df_sampled['Combined_column'])
X = Tdvectorizer.transform(df_sampled['Combined_column'])

In [12]:
X_dense = X.toarray()

In [13]:
y = df_sampled['Encoded Categories']

In [14]:
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_dense, y, test_size=0.2, random_state=42)

In [15]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier()

# Train the model
rf_model.fit(X_train, y_train)

In [16]:
y_pred = rf_model.predict(X_test)

In [17]:
# Compute performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Print the performance metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.7335
Precision: 0.7360814026826044
Recall: 0.5396097022425909
F1 Score: 0.582606717178027


## Getting Text from Punch News

In [18]:
# Initialize SpaCy for lemmatization
nlp = spacy.load("en_core_web_sm")

# Input text
text = ['EPL returns: Liverpool play Southampton, Amorim debuts, City target redemption.']

# Function to clean and preprocess text
def preprocess_text(text):
    # 1. Text Cleaning: Remove non-alphabetic characters and unnecessary symbols
    text = re.sub(r'[^a-zA-Z\s]', '', text[0])  # Keep only alphabetic characters and spaces
    
    # 2. Tokenization and stop word removal using NLTK
    tokens = nltk.word_tokenize(text.lower())  # Convert to lowercase and tokenize
    stop_words = set(stopwords.words('english'))  # Set of stopwords from NLTK
    tokens = [word for word in tokens if word not in stop_words]

    # 3. Lemmatization using SpaCy
    doc = nlp(' '.join(tokens))  # Create a SpaCy document
    lemmatized_tokens = [token.lemma_ for token in doc]  # Lemmatize each token

    return ' '.join(lemmatized_tokens)

# Preprocess the text
cleaned_text = preprocess_text(text)

In [19]:
cleaned_text = preprocess_text(text)

cleaned_text_list = [cleaned_text]

tfidf_vector = Tdvectorizer.transform(pd.Series(cleaned_text_list))

tfidf_array = tfidf_vector.toarray()
print("TF-IDF Vector:\n", tfidf_array)


TF-IDF Vector:
 [[0. 0. 0. ... 0. 0. 0.]]


In [20]:
tfidf_array.shape

(1, 29307)

In [21]:
predicted_class= rf_model.predict(tfidf_array)

In [22]:
for c in category_mapping:
    if int(category_mapping[c])==predicted_class:
        print(c)

 national


In [23]:
# Initialize SpaCy for lemmatization
nlp = spacy.load("en_core_web_sm")

# Input text
text = ['PDP postpones NEC meeting as N’Central insists on chair.']

# Function to clean and preprocess text
def preprocess_text(text):
    # 1. Text Cleaning: Remove non-alphabetic characters and unnecessary symbols
    text = re.sub(r'[^a-zA-Z\s]', '', text[0])  # Keep only alphabetic characters and spaces
    
    # 2. Tokenization and stop word removal using NLTK
    tokens = nltk.word_tokenize(text.lower())  # Convert to lowercase and tokenize
    stop_words = set(stopwords.words('english'))  # Set of stopwords from NLTK
    tokens = [word for word in tokens if word not in stop_words]

    # 3. Lemmatization using SpaCy
    doc = nlp(' '.join(tokens))  # Create a SpaCy document
    lemmatized_tokens = [token.lemma_ for token in doc]  # Lemmatize each token

    return ' '.join(lemmatized_tokens)

# Preprocess the text
cleaned_text2 = preprocess_text(text)

In [24]:
cleaned_text2 = preprocess_text(text)

cleaned_text_list2 = [cleaned_text2]

tfidf_vector = Tdvectorizer.transform(pd.Series(cleaned_text_list2))

tfidf_array2 = tfidf_vector.toarray()
print("TF-IDF Vector:\n", tfidf_array2)


TF-IDF Vector:
 [[0. 0. 0. ... 0. 0. 0.]]


In [25]:
tfidf_array2.shape

(1, 29307)

In [26]:
predicted_text= rf_model.predict(tfidf_array2)

In [27]:
for c in category_mapping:
    if int(category_mapping[c])==predicted_text:
        print(c)

 national


# Web Scraping of text from Punch News on 28/11/2024

In [28]:
from bs4 import BeautifulSoup
import requests

In [29]:
url = 'https://punchng.com/topics/news/'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')

In [30]:
soup.find_all('h3', class_ = "entry-title")

[<h3 class="entry-title"><a href="https://punchng.com/uk-opens-africas-largest-visa-application-centre-in-lagos/">
                                                                                                     [ICYMI] UK opens Africa’s largest visa application centre in Lagos </a></h3>,
 <h3 class="entry-title"><a href="https://punchng.com/non-oil-exports-key-to-economic-stability-man/">
                                                                                                     Non-oil exports key to economic stability – MAN </a></h3>,
 <h3 class="entry-title"><a href="https://punchng.com/pictorial-lagos-shuts-churches-hotels-others-over-noise-pollution/">
                                                                                                     PICTORIAL: Lagos shuts churches, hotels, others over noise pollution </a></h3>,
 <h3 class="entry-title"><a href="https://punchng.com/edo-recovers-30-vehicles-from-ex-government-officials/">
                            

In [31]:
ttt = soup.find_all('h3', class_ = "entry-title")

In [32]:
texts = [title.text.strip() for title in ttt]
texts

['[ICYMI] UK opens Africa’s largest visa application centre in Lagos',
 'Non-oil exports key to economic stability – MAN',
 'PICTORIAL: Lagos shuts churches, hotels, others over noise pollution',
 'Edo recovers 30 vehicles from ex-government officials',
 'Police alert residents of security drill around Abuja train station',
 'Why I suspended health commissioner – Ebonyi gov']

In [33]:
# Initialize SpaCy for lemmatization
nlp = spacy.load("en_core_web_sm")

# Function to clean and preprocess text
def preprocess_text(text):
    # 1. Text Cleaning: Remove non-alphabetic characters and unnecessary symbols
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only alphabetic characters and spaces
    
    # 2. Tokenization and stop word removal using NLTK
    tokens = nltk.word_tokenize(text.lower())  # Convert to lowercase and tokenize
    stop_words = set(stopwords.words('english'))  # Set of stopwords from NLTK
    tokens = [word for word in tokens if word not in stop_words]

    # 3. Lemmatization using SpaCy
    doc = nlp(' '.join(tokens))  # Create a SpaCy document
    lemmatized_tokens = [token.lemma_ for token in doc]  # Lemmatize each token

    return ' '.join(lemmatized_tokens)

# Process all 6 rows of text
processed_texts = [preprocess_text(text) for text in texts]

# Print processed texts
for original, processed in zip(texts, processed_texts):
    print(f"Original: {original}")
    print(f"Processed: {processed}\n")


Original: [ICYMI] UK opens Africa’s largest visa application centre in Lagos
Processed: icymi uk open africa large visa application centre lago

Original: Non-oil exports key to economic stability – MAN
Processed: nonoil export key economic stability man

Original: PICTORIAL: Lagos shuts churches, hotels, others over noise pollution
Processed: pictorial lago shut church hotel other noise pollution

Original: Edo recovers 30 vehicles from ex-government officials
Processed: edo recovers vehicle exgovernment official

Original: Police alert residents of security drill around Abuja train station
Processed: police alert resident security drill around abuja train station

Original: Why I suspended health commissioner – Ebonyi gov
Processed: suspend health commissioner ebonyi gov



In [39]:
processed_texts_list = [item for sublist in processed_texts for item in sublist]

In [41]:
processed_tfidf_vector = Tdvectorizer.transform(pd.Series(processed_texts_list))

processed_tfidf_array = processed_tfidf_vector.toarray()
print("TF-IDF Vector:\n", processed_tfidf_array)


TF-IDF Vector:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [42]:
predicted_texts = rf_model.predict(processed_tfidf_array)

In [44]:
for c in category_mapping:
    if int(category_mapping[c]) in predicted_texts:
        print(c)

 national


# Bag-of-Words with Countvectorizer

In [45]:
X = df_sampled['Combined_column']

In [46]:
y = df_sampled['Encoded Categories']

### RandomForest Model

In [47]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize CountVectorizer for Bag-of-Words (BoW)
vectorizer = CountVectorizer()

# Fit the vectorizer on the training data and transform the text data into word count vectors
X_train_bow = vectorizer.fit_transform(X_train)

# Transform the test data using the fitted vectorizer (same vocabulary)
X_test_bow = vectorizer.transform(X_test)

# Train the model
rf_model.fit(X_train_bow, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test_bow)


In [48]:
# Compute performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Print the performance metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.73
Precision: 0.7448718735590582
Recall: 0.5361187890736718
F1 Score: 0.5797593915329272


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Logistic Regression Classifier

In [49]:
# Train a Logistic Regression classifier
classifier = LogisticRegression()
classifier.fit(X_train_bow, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test_bow)

# Evaluate the model's performance
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       1.00      0.91      0.95        11
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.74      0.84      0.79        89
           6       0.87      0.65      0.74        31
           7       1.00      0.33      0.50         3
           8       0.62      0.78      0.69        27
           9       0.00      0.00      0.00         6
          10       0.87      0.91      0.89        87
          11       0.86      0.86      0.86         7
          12       0.67      0.31      0.42        13
          13       0.00      0.00      0.00         1
          14       0.88      0.93      0.91       113
          16       1.00      0.20      0.33         5
          17       0.63      0.77      0.70       261
          18       0.00      0.00      0.00         1
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
