<a href="https://colab.research.google.com/github/Nanditha-V/text_summary_classification/blob/master/text_summary_classify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import numpy as np
import pandas as pd
from transformers import pipeline, BartTokenizer, BartForConditionalGeneration


class ZeroShotClassifier:
    def __init__(self):
        # Load zero-shot classification pipeline
        self.classifier = pipeline("zero-shot-classification")

    def classify(self, text, labels):
        # Perform zero-shot classification
        result = self.classifier(text, labels)
        return result

class BertTextSummarizer:
    def __init__(self):
        # Load pre-trained BART model and tokenizer
        model_name = 'facebook/bart-large-cnn'
        self.tokenizer = BartTokenizer.from_pretrained(model_name)
        self.model = BartForConditionalGeneration.from_pretrained(model_name)

    def summarize(self, text):
        # Tokenize the text
        inputs = self.tokenizer.encode_plus(text, truncation=True, padding='longest', return_tensors='pt')

        # Generate the summary using BART
        summary_ids = self.model.generate(inputs['input_ids'], num_beams=4, length_penalty=2.0, max_length=142, min_length=56, early_stopping=True)
        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        return summary

# Main program
if __name__ == "__main__":
    df_1 = pd.read_csv("/content/IMDB Dataset.csv",encoding='utf-8')
    #print(df_1.head())
    df_2 = df_1.iloc[:10]
    labels = ["action", "comedy","triller","drama","sci-fi","romance"]

    zero_shot_classifier = ZeroShotClassifier()
    bert_text_summarizer = BertTextSummarizer()

    summarized_texts = []
    classify_text = []
    for index, row in df_2.iterrows():
      text = row['review']
      classification_result = zero_shot_classifier.classify(text, labels)
      classify_text.append(classification_result)
      summary = bert_text_summarizer.summarize(text)
      summarized_texts.append(summary)
        #print(f"\nSummary for text {index + 1}:")
        #print(summary)
        #print("="*50)
    df_2['summarized'] = summarized_texts
    df_2['classified'] = classify_text




No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2['summarized'] = summarized_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2['classified'] = classify_text


In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.3 MB/s[0m eta [36m0:00:0

# New section

In [None]:
df_1 = pd.read_csv("/content/IMDB Dataset.csv",encoding="utf-8")

In [None]:
df_1.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [12]:
df_2.head()

Unnamed: 0,review,sentiment,summarized,classified
0,One of the other reviewers has mentioned that ...,positive,The first thing that struck me about Oz was it...,{'sequence': 'One of the other reviewers has m...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming tec...,{'sequence': 'A wonderful little production. <...
2,I thought this was a wonderful way to spend ti...,positive,This was the most I'd laughed at one of Woody'...,{'sequence': 'I thought this was a wonderful w...
3,Basically there's a family where a little boy ...,negative,A little boy (Jake) thinks there's a zombie in...,{'sequence': 'Basically there's a family where...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is...","{'sequence': 'Petter Mattei's ""Love in the Tim..."
