In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [2]:
df = pd.read_csv('english_news_dataset.csv')

In [3]:
def lastpart(x):
    x=x[1:-1]
    x=x.split(',')
    return x[-1].replace("'","")
df["News Categories"]=df["News Categories"].apply(lastpart)

In [4]:
le = LabelEncoder()
df['Encoded Categories'] = le.fit_transform(df['News Categories'])

In [5]:
df_sampled = df.sample(10000)

# Using Bag-of-words to predict only the Headlines

In [6]:
X = df_sampled['Headline']
X

87736     Broadcasters mistake Suryakumar and Shami for ...
40495     US govt waives 26 laws to allow border wall co...
142711    Devil’s Kitchen: The mystical caves in Tamil Nadu
112746    Pine Labs Owned Setu Incurs A Net Loss Of INR ...
7536      Earth's electrons may be forming water on Moon...
                                ...                        
154277    RCB Head hilariously folds his hands when aske...
190335    Beirut airport screens display anti-Hezbollah ...
91335     23-yr-old man suffers partial vision loss afte...
86917     Beauty queen Ariana Viera dies aged 26 after c...
52297     What was stand taken by Rahul's father? Minist...
Name: Headline, Length: 10000, dtype: object

In [7]:
y = df_sampled['Encoded Categories']
y

87736      8
40495     39
142711    23
112746    22
7536      34
          ..
154277    35
190335     6
91335     17
86917     14
52297     33
Name: Encoded Categories, Length: 10000, dtype: int64

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression classifier

In [9]:
# Step 1: Initialize CountVectorizer for Bag-of-Words (BoW)
vectorizer = CountVectorizer()

# Step 2: Fit the vectorizer on the training data and transform the text data into word count vectors
X_train_bow = vectorizer.fit_transform(X_train)

# Step 3: Transform the test data using the fitted vectorizer (same vocabulary)
X_test_bow = vectorizer.transform(X_test)

# Step 4: Train a Logistic Regression classifier
classifier = LogisticRegression()
classifier.fit(X_train_bow, y_train)

# Step 5: Make predictions on the test set
y_pred = classifier.predict(X_test_bow)

# Step 6: Evaluate the model's performance
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           1       1.00      0.92      0.96        12
           3       0.00      0.00      0.00         1
           4       0.83      0.90      0.87       100
           6       0.60      0.70      0.65        30
           7       0.00      0.00      0.00         5
           8       0.75      0.41      0.53        51
           9       1.00      0.50      0.67         4
          10       0.89      0.83      0.86        81
          11       1.00      1.00      1.00         5
          12       0.67      0.09      0.16        22
          13       0.00      0.00      0.00         1
          14       0.92      0.93      0.93       104
          16       1.00      0.44      0.62         9
          17       0.53      0.77      0.63       254
          19       0.89      0.70      0.78        23
          20       0.56      0.33      0.42        15
          21       0.69      0.62      0.65        50
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Using Randomforest Model

In [10]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier()

# Train the model
rf_model.fit(X_train_bow, y_train)

In [11]:
y_pred = rf_model.predict(X_test_bow)

In [12]:
# Compute performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Print the performance metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.6535
Precision: 0.7125459688959421
Recall: 0.5514210950440215
F1 Score: 0.5865113044353295


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Lets work with only the Content using Bag-off-words

In [15]:
Xx = df_sampled['Content']
Xx

87736     Broadcasters mistook Team India cricketers Sur...
40495     The US government has waived 26 federal laws i...
142711    Located on the outskirts of Kodaikanal, Devils...
112746    Bengaluru-based fintech startup Setu has poste...
7536      Data from India's Chandrayaan-1 lunar mission ...
                                ...                        
154277    RCB Head Rajesh V Menon had a hilarious reacti...
190335    The information screens at Beirut's internatio...
91335     A 23-year-old man experienced 70% vision loss ...
86917     Venezuelan beauty queen Ariana Viera, who foug...
52297     Union Minister Dharmendra Pradhan criticised C...
Name: Content, Length: 10000, dtype: object

In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(Xx, y, test_size=0.2, random_state=42)

# Let use Logistic Regression classifier

In [17]:
# Step 1: Initialize CountVectorizer for Bag-of-Words (BoW)
vectorizer = CountVectorizer()

# Step 2: Fit the vectorizer on the training data and transform the text data into word count vectors
X_train_bow = vectorizer.fit_transform(X_train)

# Step 3: Transform the test data using the fitted vectorizer (same vocabulary)
X_test_bow = vectorizer.transform(X_test)

# Step 4: Train a Logistic Regression classifier
classifier = LogisticRegression()
classifier.fit(X_train_bow, y_train)

# Step 5: Make predictions on the test set
y_pred = classifier.predict(X_test_bow)

# Step 6: Evaluate the model's performance
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           1       1.00      0.92      0.96        12
           3       0.00      0.00      0.00         1
           4       0.81      0.92      0.86       100
           6       0.68      0.77      0.72        30
           7       0.00      0.00      0.00         5
           8       0.77      0.67      0.72        51
           9       0.40      0.50      0.44         4
          10       0.85      0.90      0.87        81
          11       1.00      1.00      1.00         5
          12       1.00      0.09      0.17        22
          13       0.00      0.00      0.00         1
          14       0.95      0.94      0.95       104
          16       0.80      0.44      0.57         9
          17       0.66      0.78      0.71       254
          19       1.00      0.74      0.85        23
          20       0.36      0.27      0.31        15
          21       0.74      0.68      0.71        50
   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Using Randomforest Model

In [18]:
# Train the model
rf_model.fit(X_train_bow, y_train)

In [19]:
y_pred = rf_model.predict(X_test_bow)

In [20]:
# Compute performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Print the performance metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.71
Precision: 0.7529915045012783
Recall: 0.5935122875591554
F1 Score: 0.6239498982850742


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
