In [None]:
import pickle
import re

import numpy as np
import pandas as pd
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from transformers import BertTokenizer, BertModel

In [None]:
# Read the raw articles table from the CSV file into a DataFrame.
# The path is a placeholder and must point at the real dataset before running.
df = pd.read_csv(filepath_or_buffer='path_to_your_csv_file.csv')

# Preprocessing
def clean_text(text):
    """Normalise one raw text cell for downstream embedding.

    Strips HTML-like tags, collapses every run of whitespace to a single
    space, trims the ends and lower-cases the result.

    Parameters
    ----------
    text : str or None or float
        Cell value taken from the dataset. Missing values (``None`` or a
        float NaN, which is what ``pd.read_csv`` yields for empty cells) are
        accepted; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text, or ``''`` when the input is missing.
    """
    # Missing cells would otherwise make re.sub raise TypeError.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs (incl. newlines/tabs) to one space
    return text.strip().lower()

# Run clean_text over both free-text columns, overwriting each column in place.
for text_column in ('Heading', 'Full_Article'):
    df[text_column] = df[text_column].apply(clean_text)

# Swap the Article_Type labels for the integer codes produced by LabelEncoder.
# The fitted encoder object is kept so the codes can be mapped back later.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['Article_Type'])
df['Article_Type'] = encoded_labels


In [None]:
class BertVectorizer(BaseEstimator, TransformerMixin):
    """scikit-learn style transformer that embeds texts with a pretrained BERT model.

    Each text is tokenised, passed through the model, and the final hidden
    states are averaged over the token axis to give one vector per text.

    Parameters
    ----------
    pretrained_model_name : str, default='bert-base-uncased'
        Identifier handed to ``BertTokenizer.from_pretrained`` and
        ``BertModel.from_pretrained``.

    Attributes
    ----------
    tokenizer : BertTokenizer
        Tokenizer loaded in ``__init__``.
    model : BertModel
        Model loaded in ``__init__``.
    """

    def __init__(self, pretrained_model_name='bert-base-uncased'):
        # BaseEstimator.get_params() reads every __init__ argument back with
        # getattr(self, <name>); without this attribute, get_params(), repr()
        # and sklearn.base.clone() raise AttributeError.
        self.pretrained_model_name = pretrained_model_name
        # Loaded eagerly so `tokenizer` / `model` exist right after construction.
        # NOTE(review): set_params(pretrained_model_name=...) will not reload them.
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
        self.model = BertModel.from_pretrained(pretrained_model_name)

    def embed_text(self, text):
        """Return the mean-pooled last-hidden-state embedding of ``text`` as a NumPy array.

        The input is truncated to at most 512 tokens. Inference runs under
        ``torch.no_grad()`` so no gradients are tracked.
        """
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Average over dim 1 (the token axis), then drop size-1 dimensions.
        # Presumably a 1-D vector of hidden size for a single string - TODO confirm.
        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    def fit(self, X, y=None):
        """No-op fit: nothing is learned from ``X``; returns ``self``."""
        return self

    def transform(self, X):
        """Embed every text in ``X`` one at a time and stack the results into an array."""
        return np.array([self.embed_text(text) for text in X])

# Target vector: the Article_Type column.
y = df['Article_Type']

# Model input: heading and article body joined by a single space.
X = df['Heading'] + ' ' + df['Full_Article']

# Embed every document with BERT. fit() learns nothing, so fit(...).transform(...)
# is just the spelled-out form of fit_transform.
bert_vectorizer = BertVectorizer()
X_bert = bert_vectorizer.fit(X).transform(X)


In [None]:
# Hold out 20% of the embedded samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_bert,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Baseline classifier: logistic regression on the BERT embeddings,
# with the solver allowed up to 1000 iterations.
clf = LogisticRegression(
    max_iter=1000,
)
clf.fit(X=X_train, y=y_train)

In [None]:
# Grid-search the regularisation strength C of a linear-kernel SVM using
# 5-fold cross-validation on the training split.
# SVC comes from `from sklearn.svm import SVC` in the import cell; without that
# import this cell fails with NameError.
param_grid = {'C': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(
    estimator=SVC(kernel='linear'),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Estimator refit with the best-scoring C.
# NOTE(review): the later evaluation and pickling cells use `clf`, not
# `best_clf` - confirm which model is meant to be reported and saved.
best_clf = grid_search.best_estimator_

In [None]:
# Score the logistic-regression model on the held-out split.
y_pred = clf.predict(X_test)

# Compute each metric once, then print them in a fixed order.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {test_accuracy}")
print(f"Classification Report:\n{test_report}")
print(f"Confusion Matrix:\n{test_confusion}")

In [None]:
# Serialise the trained logistic-regression model to disk.
# Only `clf` is written; the label encoder and the BERT vectoriser are not part of this file.
# Only unpickle this file from a trusted source: pickle.load can execute arbitrary code.
with open('text_classifier.pkl', mode='wb') as pickle_handle:
    pickle.dump(clf, pickle_handle)