In [1]:
# !pip install scikit-learn

In [3]:
import pandas as pd
processed_data=pd.read_csv('processed_data.csv')
processed_data

Unnamed: 0,category,encoded_category,new_text,count,tokenized_text
0,tech,4,tv future hand viewer home theatre system plas...,370,"['tv', 'future', 'hand', 'viewer', 'home', 'th..."
1,business,0,worldcom boss leave book worldcom boss bernie ...,176,"['worldcom', 'boss', 'leave', 'book', 'worldco..."
2,sport,3,tiger wary farrell gamble leicester rush make ...,107,"['tiger', 'wary', 'farrell', 'gamble', 'leices..."
3,sport,3,yeade face newcastle fa cup premiership newcas...,257,"['yeade', 'face', 'newcastle', 'fa', 'cup', 'p..."
4,entertainment,1,ocean raids box office ocean crime caper seque...,175,"['ocean', 'raids', 'box', 'office', 'ocean', '..."
...,...,...,...,...,...
2113,business,0,car pull retail figure retail sale fall 03 jan...,177,"['car', 'pull', 'retail', 'figure', 'retail', ..."
2114,politics,2,kilroy unveil immigration policy exchatshow ho...,98,"['kilroy', 'unveil', 'immigration', 'policy', ..."
2115,entertainment,1,rem announce new glasgow concert band rem anno...,168,"['rem', 'announce', 'new', 'glasgow', 'concert..."
2116,politics,2,political squabble snowball commonplace argue ...,371,"['political', 'squabble', 'snowball', 'commonp..."


## Imported required libraries for a text classification pipeline

In [4]:
# This library provides tools for splitting your dataset into training and testing sets.
# The train_test_split function is commonly used for this purpose.

from sklearn.model_selection import train_test_split

In [5]:
##  This library contains tools for feature extraction from text data. In your case, you're using TfidfVectorizer, 
#   which converts a collection of raw text documents to a matrix of TF-IDF features

from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# This library provides various metrics for evaluating the performance of machine learning models.

from sklearn.metrics import accuracy_score, classification_report,precision_score,recall_score,confusion_matrix

In [8]:
#This library provides a way to streamline a lot of routine processes by putting together a sequence of data processing steps 
# and a final estimator (model). 

from sklearn.pipeline import Pipeline

In [9]:
# Assigned 'tokenized_text' column to X and 'category_encoded' column to y

X=processed_data['tokenized_text']
y=processed_data['encoded_category']

In [10]:
# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

## LogisticRegression

In [11]:
# Imported Logistic Regression from scikit-learn's linear_model

from sklearn.linear_model import LogisticRegression

In [12]:
#Combine the vectorizer and the model into a pipeline

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(max_iter=500,random_state=0,C=1,fit_intercept=True))
])


In [13]:
# Fitted the text classification pipeline to the training data

pipeline.fit(X_train, y_train)

In [14]:
# Generated predictions on the test data using the text classification pipeline

LR_predictions = pipeline.predict(X_test)
LR_predictions

array([0, 0, 3, 0, 0, 2, 0, 0, 0, 3, 3, 4, 0, 2, 3, 3, 3, 0, 3, 0, 4, 4,
       2, 1, 0, 0, 0, 0, 0, 3, 2, 4, 2, 0, 3, 3, 2, 3, 2, 2, 3, 3, 2, 1,
       3, 3, 1, 0, 2, 3, 0, 4, 0, 3, 3, 3, 4, 1, 2, 2, 1, 2, 3, 1, 1, 4,
       4, 1, 2, 4, 0, 4, 3, 1, 3, 1, 4, 0, 4, 3, 3, 4, 2, 2, 0, 1, 0, 3,
       4, 2, 4, 0, 3, 0, 3, 0, 2, 0, 3, 3, 0, 3, 4, 0, 1, 0, 4, 0, 2, 1,
       3, 3, 0, 3, 2, 0, 3, 1, 4, 3, 0, 2, 4, 4, 1, 3, 2, 3, 0, 3, 2, 1,
       2, 2, 0, 4, 0, 1, 4, 2, 3, 3, 3, 3, 1, 3, 0, 1, 0, 1, 3, 0, 2, 4,
       3, 4, 0, 0, 0, 1, 4, 0, 0, 1, 0, 2, 4, 3, 2, 4, 1, 1, 3, 2, 1, 1,
       3, 2, 3, 3, 4, 1, 3, 2, 2, 3, 1, 0, 2, 0, 1, 1, 4, 1, 2, 1, 4, 3,
       1, 3, 4, 0, 4, 1, 0, 2, 0, 4, 1, 0, 4, 1, 1, 3, 3, 2, 0, 3, 4, 0,
       1, 0, 3, 0, 4, 2, 0, 3, 1, 3, 0, 4, 3, 2, 2, 1, 4, 3, 3, 0, 4, 4,
       3, 2, 2, 0, 1, 0, 2, 4, 1, 0, 1, 2, 0, 0, 1, 0, 3, 0, 4, 4, 1, 3,
       3, 0, 4, 1, 2, 0, 3, 2, 0, 0, 0, 1, 4, 0, 2, 4, 1, 3, 0, 1, 0, 4,
       4, 3, 2, 2, 2, 2, 1, 0, 1, 2, 0, 3, 1, 4, 1,

In [15]:
# Calculated accuracy using the predicted labels and actual labels from the test data

accuracy_score(y_test,LR_predictions)

0.9660377358490566

In [16]:
precision_score(y_test,LR_predictions,average='weighted')

0.9669808690732602

In [17]:
recall_score(y_test,LR_predictions,average='weighted')

0.9660377358490566

In [18]:
print(classification_report(y_test,LR_predictions))

              precision    recall  f1-score   support

           0       0.93      0.98      0.96       121
           1       1.00      0.96      0.98        90
           2       0.98      0.93      0.95        99
           3       0.96      1.00      0.98       131
           4       0.98      0.94      0.96        89

    accuracy                           0.97       530
   macro avg       0.97      0.96      0.97       530
weighted avg       0.97      0.97      0.97       530



## DecisionTreeClassifier

In [20]:
# Imported Decision Tree Classifier from scikit-learn's tree

from sklearn.tree import DecisionTreeClassifier

In [21]:
#Combine the vectorizer and the model into a pipeline

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dt', DecisionTreeClassifier())
])

In [22]:
# Fitted the text classification pipeline to the training data

pipeline.fit(X_train, y_train)

In [23]:
# Generated predictions on the test data using the text classification pipeline

DT_predictions = pipeline.predict(X_test)
DT_predictions


array([0, 3, 3, 0, 3, 0, 0, 0, 0, 3, 3, 4, 0, 1, 1, 3, 3, 0, 1, 3, 4, 4,
       2, 1, 0, 0, 0, 0, 0, 3, 1, 4, 2, 2, 3, 3, 2, 3, 2, 2, 3, 3, 0, 1,
       3, 0, 3, 0, 2, 3, 0, 4, 0, 1, 3, 3, 4, 1, 0, 2, 1, 2, 3, 1, 1, 4,
       2, 1, 0, 4, 0, 0, 3, 1, 3, 1, 4, 0, 2, 1, 1, 4, 2, 2, 0, 1, 0, 3,
       1, 2, 2, 4, 3, 0, 3, 2, 2, 0, 3, 4, 0, 3, 4, 0, 1, 0, 4, 0, 2, 1,
       3, 3, 0, 3, 2, 0, 3, 1, 4, 3, 0, 2, 4, 4, 1, 3, 2, 3, 0, 3, 2, 2,
       2, 1, 0, 4, 0, 1, 4, 0, 3, 1, 3, 4, 4, 3, 0, 1, 0, 1, 3, 0, 2, 4,
       3, 4, 0, 3, 0, 1, 4, 0, 0, 1, 0, 2, 4, 3, 1, 4, 1, 1, 2, 2, 0, 1,
       3, 0, 3, 3, 4, 1, 1, 2, 2, 3, 1, 0, 2, 0, 1, 1, 4, 1, 2, 4, 4, 3,
       1, 3, 4, 2, 4, 1, 0, 2, 0, 4, 1, 1, 4, 1, 1, 3, 3, 2, 0, 3, 4, 0,
       1, 0, 3, 0, 4, 2, 0, 3, 4, 3, 0, 4, 3, 1, 2, 2, 4, 3, 3, 0, 1, 4,
       3, 2, 0, 0, 1, 0, 0, 4, 0, 0, 1, 2, 0, 0, 1, 0, 3, 0, 4, 4, 1, 3,
       3, 0, 4, 1, 2, 0, 3, 2, 0, 1, 4, 1, 4, 0, 2, 2, 1, 0, 4, 1, 0, 4,
       4, 3, 2, 2, 2, 2, 1, 0, 1, 2, 0, 3, 1, 4, 1,

In [24]:
# Calculated accuracy using the predicted labels and actual labels from the test data

accuracy_score(y_test,DT_predictions)

0.8113207547169812

In [25]:
precision_score(y_test,DT_predictions,average='weighted')

0.8147915119636895

In [26]:
recall_score(y_test,DT_predictions,average='weighted')

0.8113207547169812

In [27]:
print(classification_report(y_test,DT_predictions))

              precision    recall  f1-score   support

           0       0.75      0.85      0.80       121
           1       0.76      0.77      0.76        90
           2       0.79      0.75      0.77        99
           3       0.92      0.86      0.89       131
           4       0.84      0.80      0.82        89

    accuracy                           0.81       530
   macro avg       0.81      0.81      0.81       530
weighted avg       0.81      0.81      0.81       530



## GradientBoostingClassifier

In [29]:
# Imported Gradient Boosting Classifier from scikit-learn's ensemble

from sklearn.ensemble import GradientBoostingClassifier

In [30]:
#Combine the vectorizer and the model into a pipeline

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('gb', GradientBoostingClassifier())
])

In [31]:
# Fitted the text classification pipeline to the training data

pipeline.fit(X_train, y_train)

In [32]:
# Generated predictions on the test data using the text classification pipeline

RF_predictions = pipeline.predict(X_test)
RF_predictions

array([0, 0, 3, 0, 0, 2, 0, 0, 0, 3, 3, 4, 0, 2, 3, 3, 3, 0, 3, 0, 4, 4,
       2, 1, 0, 0, 0, 4, 0, 3, 2, 4, 2, 0, 3, 3, 2, 3, 2, 2, 3, 3, 2, 1,
       3, 3, 1, 0, 2, 3, 0, 4, 0, 3, 3, 3, 4, 1, 2, 2, 1, 2, 3, 1, 1, 4,
       4, 1, 2, 4, 0, 4, 3, 1, 3, 1, 4, 0, 4, 3, 3, 4, 2, 2, 0, 1, 0, 3,
       4, 2, 4, 4, 3, 0, 3, 0, 2, 0, 3, 3, 1, 3, 4, 0, 1, 0, 4, 0, 2, 1,
       3, 3, 0, 3, 2, 0, 3, 1, 4, 3, 0, 2, 4, 4, 1, 3, 2, 3, 0, 3, 2, 1,
       2, 2, 0, 4, 0, 1, 4, 0, 3, 3, 3, 4, 1, 3, 0, 1, 0, 1, 3, 0, 2, 4,
       3, 4, 0, 0, 0, 1, 4, 0, 0, 1, 0, 2, 4, 3, 1, 4, 1, 1, 3, 2, 0, 1,
       3, 2, 3, 3, 4, 1, 3, 2, 2, 3, 1, 0, 2, 0, 1, 1, 4, 1, 2, 1, 4, 3,
       1, 3, 4, 0, 4, 1, 0, 2, 0, 4, 1, 1, 4, 1, 1, 3, 3, 2, 0, 3, 4, 0,
       1, 0, 3, 0, 4, 2, 0, 3, 1, 3, 0, 4, 3, 0, 2, 2, 4, 3, 3, 0, 4, 4,
       3, 2, 2, 0, 1, 0, 2, 4, 1, 0, 1, 2, 0, 0, 1, 0, 3, 0, 4, 4, 1, 3,
       3, 0, 4, 1, 2, 0, 3, 2, 0, 0, 0, 1, 4, 0, 2, 4, 1, 3, 0, 1, 0, 4,
       4, 3, 2, 2, 2, 2, 1, 0, 1, 2, 0, 3, 1, 4, 1,

In [33]:
accuracy_score(y_test,RF_predictions)

0.9471698113207547

In [34]:
precision_score(y_test,RF_predictions,average='weighted')

0.9478130514383555

In [35]:
recall_score(y_test,RF_predictions,average='weighted')

0.9471698113207547

In [36]:
print(classification_report(y_test,RF_predictions))

              precision    recall  f1-score   support

           0       0.91      0.96      0.93       121
           1       0.94      0.92      0.93        90
           2       0.97      0.90      0.93        99
           3       0.97      1.00      0.98       131
           4       0.95      0.93      0.94        89

    accuracy                           0.95       530
   macro avg       0.95      0.94      0.94       530
weighted avg       0.95      0.95      0.95       530



## Naive_Bayes

In [37]:
from sklearn.naive_bayes import MultinomialNB

In [38]:
#Combine the vectorizer and the model into a pipeline

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

In [39]:
# Fitted the text classification pipeline to the training data

pipeline.fit(X_train, y_train)

In [40]:
# Generated predictions on the test data using the text classification pipeline

MNB_predictions = pipeline.predict(X_test)
MNB_predictions

array([0, 0, 3, 0, 0, 2, 0, 0, 0, 3, 3, 4, 0, 2, 3, 3, 3, 0, 3, 0, 4, 4,
       2, 1, 0, 0, 0, 0, 0, 3, 2, 4, 2, 0, 3, 3, 2, 3, 2, 2, 3, 3, 2, 1,
       3, 2, 1, 0, 2, 3, 0, 4, 0, 3, 3, 3, 4, 1, 2, 2, 2, 2, 3, 1, 1, 4,
       0, 1, 2, 2, 0, 4, 3, 1, 3, 1, 4, 0, 4, 3, 3, 4, 2, 2, 0, 1, 0, 3,
       4, 2, 4, 0, 3, 0, 3, 0, 2, 0, 3, 3, 0, 3, 4, 0, 1, 0, 4, 0, 2, 1,
       3, 3, 0, 3, 2, 0, 3, 1, 4, 3, 0, 2, 4, 4, 1, 3, 2, 3, 0, 3, 2, 1,
       2, 2, 0, 4, 0, 1, 4, 2, 3, 3, 3, 3, 1, 3, 0, 2, 0, 1, 3, 0, 2, 4,
       3, 4, 0, 0, 0, 1, 4, 0, 2, 1, 0, 2, 0, 3, 2, 4, 1, 1, 3, 2, 1, 1,
       3, 2, 3, 3, 4, 1, 3, 2, 2, 3, 1, 0, 2, 0, 1, 1, 4, 1, 2, 1, 4, 3,
       1, 3, 4, 0, 4, 1, 0, 2, 0, 4, 1, 0, 0, 1, 1, 3, 3, 2, 0, 3, 4, 0,
       1, 0, 3, 0, 4, 2, 0, 3, 1, 3, 0, 4, 3, 2, 2, 1, 4, 3, 3, 0, 4, 4,
       3, 2, 2, 0, 1, 0, 2, 4, 1, 0, 1, 2, 0, 0, 1, 0, 3, 0, 4, 4, 1, 3,
       3, 0, 4, 1, 2, 0, 3, 2, 0, 0, 0, 1, 4, 0, 2, 4, 1, 3, 0, 1, 0, 4,
       4, 3, 2, 2, 2, 2, 1, 0, 1, 2, 0, 3, 1, 2, 1,

In [41]:
accuracy_score(y_test,MNB_predictions)

0.9490566037735849

In [42]:
precision_score(y_test,MNB_predictions,average='weighted')

0.9514474070144896

In [43]:
recall_score(y_test,MNB_predictions,average='weighted')

0.9490566037735849

In [44]:
print(classification_report(y_test,MNB_predictions))

              precision    recall  f1-score   support

           0       0.89      0.98      0.94       121
           1       1.00      0.91      0.95        90
           2       0.93      0.95      0.94        99
           3       0.97      1.00      0.98       131
           4       0.97      0.87      0.92        89

    accuracy                           0.95       530
   macro avg       0.95      0.94      0.95       530
weighted avg       0.95      0.95      0.95       530



## Support Vector Classification

In [46]:
from sklearn.svm import LinearSVC

# Create a pipeline with TF-IDF vectorizer and Linear SVM classifier
pipeline_svm = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', LinearSVC(random_state=0, C=1,dual='auto'))
])

In [47]:
# Fit the pipeline on the training data
pipeline_svm.fit(X_train, y_train)

In [48]:
SVM_predictions = pipeline_svm.predict(X_test)
SVM_predictions

array([0, 0, 3, 0, 0, 2, 0, 0, 0, 3, 3, 4, 4, 2, 3, 3, 3, 0, 3, 0, 4, 4,
       2, 1, 0, 0, 0, 0, 0, 3, 2, 4, 2, 0, 3, 3, 2, 3, 2, 2, 3, 3, 2, 1,
       3, 3, 1, 0, 2, 3, 0, 4, 0, 3, 3, 3, 4, 1, 2, 2, 1, 2, 3, 1, 1, 4,
       4, 1, 2, 4, 0, 4, 3, 1, 3, 1, 4, 0, 4, 3, 3, 4, 2, 2, 0, 1, 0, 3,
       4, 2, 4, 0, 3, 0, 3, 0, 2, 0, 3, 3, 0, 3, 4, 0, 1, 0, 4, 0, 2, 1,
       3, 3, 0, 3, 2, 0, 3, 1, 4, 3, 0, 2, 4, 4, 1, 3, 2, 3, 0, 3, 2, 1,
       2, 2, 0, 4, 0, 1, 4, 2, 3, 3, 3, 4, 1, 3, 0, 1, 0, 1, 3, 0, 2, 4,
       3, 4, 0, 0, 0, 1, 4, 0, 2, 1, 0, 2, 4, 3, 2, 4, 1, 1, 3, 2, 1, 1,
       3, 2, 3, 3, 4, 1, 3, 2, 2, 3, 1, 0, 2, 0, 1, 1, 4, 1, 2, 1, 4, 3,
       1, 3, 4, 0, 4, 1, 0, 2, 0, 4, 1, 0, 4, 1, 1, 3, 3, 2, 0, 3, 4, 0,
       1, 0, 3, 0, 4, 2, 0, 3, 1, 3, 0, 4, 3, 2, 2, 1, 4, 3, 3, 0, 4, 4,
       3, 2, 2, 0, 1, 0, 2, 4, 1, 0, 1, 2, 0, 0, 1, 0, 3, 0, 4, 4, 1, 3,
       3, 0, 4, 1, 2, 0, 3, 2, 0, 0, 0, 1, 4, 0, 2, 4, 1, 3, 0, 1, 0, 4,
       4, 3, 2, 2, 2, 2, 1, 0, 1, 2, 0, 3, 1, 4, 1,

In [49]:
accuracy_score(y_test,SVM_predictions)

0.9792452830188679

In [50]:
precision_score(y_test,SVM_predictions,average='weighted')

0.9794123451507649

In [51]:
recall_score(y_test,SVM_predictions,average='weighted')

0.9792452830188679

In [52]:
print(classification_report(y_test,SVM_predictions))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       121
           1       1.00      0.98      0.99        90
           2       0.98      0.95      0.96        99
           3       0.98      1.00      0.99       131
           4       0.98      0.98      0.98        89

    accuracy                           0.98       530
   macro avg       0.98      0.98      0.98       530
weighted avg       0.98      0.98      0.98       530



## K-Nearest Neighbors (KNN)

In [53]:
from sklearn.neighbors import KNeighborsClassifier

# Create a pipeline with TF-IDF vectorizer and K-Nearest Neighbors classifier
pipeline_knn = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier(n_neighbors=5))
])

In [54]:
# Fit the pipeline on the training data
pipeline_knn.fit(X_train, y_train)

In [55]:
KNN_predictions = pipeline_knn.predict(X_test)
KNN_predictions

array([0, 0, 3, 0, 0, 0, 0, 0, 2, 3, 3, 4, 4, 2, 3, 3, 3, 0, 3, 0, 4, 4,
       2, 1, 0, 0, 0, 0, 0, 3, 2, 4, 2, 0, 3, 3, 2, 3, 2, 2, 3, 0, 2, 1,
       3, 2, 1, 0, 2, 3, 0, 4, 2, 3, 3, 3, 4, 1, 2, 2, 1, 2, 3, 1, 1, 4,
       4, 1, 0, 4, 0, 4, 3, 1, 3, 1, 4, 0, 4, 3, 3, 4, 2, 2, 0, 1, 0, 3,
       4, 2, 4, 0, 3, 0, 3, 0, 2, 0, 3, 3, 2, 3, 4, 0, 1, 0, 4, 0, 2, 1,
       3, 3, 0, 3, 2, 0, 3, 1, 4, 3, 0, 2, 4, 4, 1, 3, 2, 3, 0, 3, 2, 1,
       2, 2, 0, 4, 0, 1, 4, 2, 3, 3, 3, 4, 1, 3, 0, 0, 0, 0, 3, 0, 2, 4,
       3, 4, 0, 0, 2, 1, 4, 0, 2, 1, 0, 2, 4, 3, 2, 4, 1, 1, 3, 2, 1, 1,
       3, 2, 3, 3, 4, 1, 3, 2, 2, 3, 1, 0, 2, 0, 1, 1, 4, 1, 2, 1, 4, 3,
       1, 3, 4, 0, 4, 1, 4, 0, 0, 4, 1, 0, 4, 1, 1, 3, 3, 2, 0, 3, 4, 0,
       1, 0, 3, 0, 4, 0, 0, 3, 1, 3, 0, 4, 3, 2, 2, 1, 4, 3, 3, 0, 1, 4,
       3, 2, 2, 0, 1, 0, 2, 4, 1, 0, 1, 2, 0, 0, 1, 0, 3, 0, 4, 4, 1, 3,
       3, 0, 4, 1, 2, 0, 3, 2, 0, 0, 4, 1, 4, 0, 2, 4, 1, 3, 0, 1, 0, 4,
       4, 3, 2, 2, 2, 2, 1, 0, 1, 2, 0, 3, 2, 4, 1,

In [56]:
accuracy_score(y_test,KNN_predictions)

0.9415094339622642

In [57]:
precision_score(y_test,KNN_predictions,average='weighted')

0.9428590619307832

In [58]:
recall_score(y_test,KNN_predictions,average='weighted')

0.9415094339622642

In [59]:
print(classification_report(y_test,KNN_predictions))

              precision    recall  f1-score   support

           0       0.91      0.92      0.91       121
           1       0.99      0.89      0.94        90
           2       0.92      0.93      0.92        99
           3       0.98      0.98      0.98       131
           4       0.91      0.98      0.94        89

    accuracy                           0.94       530
   macro avg       0.94      0.94      0.94       530
weighted avg       0.94      0.94      0.94       530

