In [2]:
import os
import pickle
import re

import numpy as np
import pandas as pd
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from transformers import BertTokenizer, BertModel

In [4]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that maps raw texts to BERT feature vectors.

    Every document goes through three steps: ``clean_text`` (digits and
    punctuation removed, lower-cased), ``remove_stopwords`` and
    ``bert_embedding`` (``bert-base-uncased``, last hidden state averaged
    over dimension 1).

    Parameters
    ----------
    stopwords : collection of str
        Words dropped after cleaning. Membership is tested with ``in``, and
        the text is already lower-cased at that point, so the entries should
        be lower-case as well.
    """

    def __init__(self, stopwords):
        self.stopwords = stopwords
        # Both objects are loaded eagerly and stored on the instance, so they
        # are serialised together with the estimator when it is pickled.
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.bert_model = BertModel.from_pretrained("bert-base-uncased")

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    @staticmethod
    def _is_missing(value):
        """Return True when ``value`` is None or a scalar missing marker (NaN, pd.NA, NaT)."""
        if value is None:
            return True
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # Containers make pd.isna return an array whose truth value is
            # ambiguous; such a value is not a scalar missing marker.
            return False

    def clean_text(self, text):
        """Remove digits and punctuation from ``text`` and lower-case it.

        Non-string input no longer raises ``TypeError``: missing values
        (None, NaN, ``pd.NA``) become the empty string and anything else is
        converted with ``str()`` before cleaning.
        """
        if not isinstance(text, str):
            # Spreadsheet columns routinely hold NaN for blank cells and
            # numbers for numeric-looking cells; re.sub accepts neither.
            text = "" if self._is_missing(text) else str(text)
        text = re.sub(r"\d+", "", text)
        text = re.sub(r"[^\w\s]", "", text)
        return text.lower()

    def remove_stopwords(self, text):
        """Split ``text`` on whitespace, drop stop words, re-join with single spaces."""
        kept = [word for word in text.split() if word not in self.stopwords]
        return " ".join(kept)

    def bert_embedding(self, text):
        """Encode one text and return the mean of the last hidden state as a 1-D array."""
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = self.bert_model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
        return embeddings.flatten()

    def transform(self, X, y=None):
        """Return one embedding row per document in ``X``.

        Parameters
        ----------
        X : iterable of str, or a single str
            Documents to encode. A bare string is treated as one document
            instead of being iterated character by character.
        y : ignored

        Returns
        -------
        numpy.ndarray
            One row per document. An empty ``X`` yields an array of shape
            ``(0, hidden_size)`` so downstream steps still receive a 2-D
            feature matrix.
        """
        if isinstance(X, str):
            X = [X]
        vectors = [
            self.bert_embedding(self.remove_stopwords(self.clean_text(text)))
            for text in X
        ]
        if not vectors:
            # np.array([]) would be 1-D with shape (0,).
            return np.empty((0, self.bert_model.config.hidden_size), dtype=np.float32)
        return np.array(vectors)

In [6]:
# Import the corpus reader under an alias so that binding the word set to
# `stopwords` below does not shadow the NLTK module of the same name.
from nltk.corpus import stopwords as nltk_stopwords

# English stop words as a set, for fast membership tests during filtering.
# The NLTK "stopwords" corpus must already be installed locally; if it is not,
# run nltk.download("stopwords") once.
stopwords = set(nltk_stopwords.words("english"))

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
# End-to-end model: raw text -> BERT vectors -> standardised features -> classifier.
pipeline = Pipeline(
    steps=[
        ("preprocessor", TextPreprocessor(stopwords=stopwords)),
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression()),
    ]
)

In [14]:
# Location of the labelled-reviews workbook. Set the SENTIMENT_DATA_PATH
# environment variable to run the notebook on another machine; the fallback
# is the original machine-specific path.
DATA_PATH = os.environ.get(
    "SENTIMENT_DATA_PATH",
    r"C:\Users\701540\CONDA\Sentiment_Analysis\sentiment.xlsx",
)
df = pd.read_excel(DATA_PATH)
# Peek at two random rows to sanity-check the columns.
df.sample(2)

Unnamed: 0,REVIEW,sentiment,sentiment_label
376,Full Satis Fied,0.35,POSITIVE
687,"VERY GOOD , GOOD STAFF",0.805,POSITIVE


In [15]:
# Features are the raw review texts and targets are the sentiment labels.
# The whole frame is used for training; no hold-out split is made here.
X_train, y_train = df["REVIEW"], df["sentiment_label"]

In [16]:
# Fit every pipeline step in order on the training data. Each review is
# embedded one at a time by the preprocessor, so this cell can be slow.
pipeline.fit(X_train, y_train)

In [17]:
# Persist the fitted pipeline (preprocessor, scaler and classifier) to disk
# in the current working directory.
with open("Sentiment_analysis_pipeline.pkl", mode="wb") as out_file:
    pickle.dump(pipeline, out_file)

In [31]:
# Reload the fitted pipeline. The file name must match the one written by the
# save cell exactly: the previous lower-case "sentiment_..." spelling only
# worked on case-insensitive file systems.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
# The TextPreprocessor class must also be defined before unpickling.
with open("Sentiment_analysis_pipeline.pkl", "rb") as file:
    loaded_pipeline = pickle.load(file)

# Use the pipeline directly for prediction
predictions = loaded_pipeline.predict(["there are no collection. Need to improve the collections"])
print(predictions)

['POSITIVE']
