<a href="https://colab.research.google.com/github/KAILASHVenkat/T3-PREPROCESSING/blob/main/final_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install transformers pandas datasets

Collecting datasets
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [2]:
import pandas as pd
from transformers import pipeline
from datasets import load_dataset

In [3]:
# Fetch GLUE/MRPC from the Hugging Face hub and keep only the training split's
# sentence pairs, exposed as input_text / target_text columns.
mrpc_dataset = load_dataset('glue', 'mrpc')
mrpc_df = pd.DataFrame(
    {
        'input_text': mrpc_dataset['train']['sentence1'],
        'target_text': mrpc_dataset['train']['sentence2'],
    }
)

Downloading data:   0%|          | 0.00/649k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [4]:
# Text-classification pipeline backed by a BERT model fine-tuned on GoEmotions.
emotion_classifier = pipeline(
    'text-classification',
    model='bhadresh-savani/bert-base-go-emotion',
)

# Intensity category -> fine-grained emotion labels belonging to that category.
emotion_mapping = dict(
    high_negative=['anger', 'disgust', 'fear', 'grief', 'sadness'],
    low_negative=['nervousness', 'annoyance', 'disappointment', 'embarrassment', 'remorse', 'disapproval'],
    neutral=['confusion', 'curiosity', 'realization', 'surprise', 'neutral'],
    low_positive=['approval', 'caring', 'desire', 'relief'],
    high_positive=['amusement', 'excitement', 'pride', 'optimism', 'gratitude', 'joy', 'admiration', 'love'],
)

config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [5]:
# Pick the classifier's highest-scoring emotion for a piece of text
def get_dominant_emotion_label(text):
    """Return the label of the top-scoring prediction for ``text``.

    The text is passed to the module-level ``emotion_classifier``; among the
    predictions it returns, the one with the largest ``'score'`` wins.

    Returns:
        The winning prediction's ``'label'`` when its score is strictly
        greater than 0.5, otherwise the string ``'not_classified'``.
    """
    predictions = emotion_classifier(text)
    best = max(predictions, key=lambda prediction: prediction['score'])
    # Only trust predictions the model is more than 50% confident about.
    if best['score'] > 0.5:
        return best['label']
    return 'not_classified'

# Label both sides of every sentence pair with its dominant emotion.
mrpc_df['input_emotion'] = mrpc_df['input_text'].apply(get_dominant_emotion_label)
mrpc_df['target_emotion'] = mrpc_df['target_text'].apply(get_dominant_emotion_label)


In [None]:
# Snapshot the emotion-labelled frame to disk, then hand the file to the
# browser through Colab's download helper (only available on Google Colab).
mrpc_df.to_csv(
    '/content/2.csv',
    index=False,
)
from google.colab import files
files.download(
    '/content/2.csv'
)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
# Filter and Process DataFrame
# The emotion columns hold the classifier's fine-grained labels ('anger',
# 'joy', ...), while the keys of `emotion_mapping` are intensity categories
# ('high_negative', ...). Build the reverse lookup once so labels can be
# validated and compared by category. (Testing the labels against
# `emotion_mapping.keys()` directly never matches and empties the frame.)
label_to_category = {
    label: category
    for category, labels in emotion_mapping.items()
    for label in labels
}

# Filter out rows with 'not_classified' or 'neutral' emotion in either input or
# target, as well as any row whose label is unknown to `emotion_mapping`.
known_labels = set(label_to_category) - {'not_classified', 'neutral'}
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below never write into a slice.
mrpc_df = mrpc_df[
    mrpc_df['input_emotion'].isin(known_labels) & mrpc_df['target_emotion'].isin(known_labels)
].copy()


def _is_decreasing_intensity(row):
    """Return True when both texts share a category and the input scores higher.

    The classifier is only invoked when the two labels fall into the same
    intensity category; the comparison uses the score of the top prediction
    returned for each text.
    """
    if label_to_category[row['input_emotion']] != label_to_category[row['target_emotion']]:
        return False
    input_score = emotion_classifier(row['input_text'])[0]['score']
    target_score = emotion_classifier(row['target_text'])[0]['score']
    return input_score > target_score


# Create decreasing intensity pairs
# The mask is built from a list (not DataFrame.apply(axis=1)) so that an empty
# frame still produces a valid, empty boolean Series.
decreasing_mask = pd.Series(
    [_is_decreasing_intensity(row) for _, row in mrpc_df.iterrows()],
    index=mrpc_df.index,
    dtype=bool,
)
decreasing_intensity_pairs = mrpc_df[decreasing_mask].copy()

# Create increasing intensity pairs
increasing_intensity_pairs = mrpc_df[~decreasing_mask].copy()
# Swap the two text columns. The right-hand side is converted to a raw array
# so the assignment is positional; assigning a frame whose index was reset
# would align on index labels and scramble (or NaN-fill) the rows.
increasing_intensity_pairs[['input_text', 'target_text']] = (
    increasing_intensity_pairs[['target_text', 'input_text']].to_numpy()
)
# NOTE(review): the emotion columns are not swapped along with the texts, and
# these rows receive no "<emotion> to <emotion>: " prefix - confirm that this
# is intended.

# Prefix the decreasing pairs' input with "<input emotion> to <target emotion>: ".
decreasing_intensity_pairs['input_text'] = (
    decreasing_intensity_pairs['input_emotion']
    + ' to '
    + decreasing_intensity_pairs['target_emotion']
    + ': '
    + decreasing_intensity_pairs['input_text']
)

# Concatenate DataFrames
final_mrpc_dataset = pd.concat([decreasing_intensity_pairs, increasing_intensity_pairs], ignore_index=True)

# Save to CSV
final_mrpc_dataset.to_csv('output_mrpc_dataset.csv', index=False)