In [None]:
I have Python cells that are all merged together; I want you to split them properly.

1. Importing the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import re 
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
from collections import Counter
import numpy as np
import random
2. Loading the dataset
# Read the dataset from text.csv (path is relative to the working directory).
df = pd.read_csv("text.csv")
# Preview the first rows.
df.head()
3. Preprocessing the data
3.1 checking for missing values
# Number of missing values in each column.
df.isnull().sum()
3.2 checking for duplicates
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()
3.3 displaying dataframe shape
# Report the (rows, columns) shape, then preview the first rows.
print("Shape of the data: ", df.shape)
df.head()
3.4 removing unnecessary columns
# When the first column is the "Unnamed: 0" column, rename it to "index"
# and use it as the DataFrame index.
if df.columns[0] == "Unnamed: 0":
    df = df.rename(columns={"Unnamed: 0": "index"}).set_index("index")
df.head()
3.5 displaying class distribution
Since the dataset turned out to be imbalanced, we will use synonym replacement to balance it.
# Shared WordNet lemmatizer instance, used by preprocess_text.
lemmatizer = WordNetLemmatizer()
# Module-level token list; it starts out empty.
tokens = []
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet POS constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    return wordnet.NOUN
3.6 Synonym replacement
def get_synonyms(word):
    """Return the distinct lower-cased lemma names of all WordNet synsets of *word*.

    No filtering is applied, so *word* itself can appear in the result.
    The list comes from a set, so its order carries no meaning.
    """
    return list({
        lemma.name().lower()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    })

3.7 Lowercasing the text, keeping only alphabetic tokens, removing stopwords, and then lemmatizing.
# English stopwords held in a set for fast membership tests.
sw = set(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one document into a space-joined string of lemmas.

    Steps: lower-case, tokenize, keep purely alphabetic tokens that are not
    stopwords, tokenize the joined survivors again, POS-tag, and lemmatize
    each token with the WordNet POS derived from its tag.
    """
    kept = [
        tok
        for tok in word_tokenize(text.lower())
        if tok.isalpha() and tok not in sw
    ]
    # The surviving tokens are re-joined and tokenized a second time before
    # tagging, exactly as the pipeline has always done.
    retokenized = word_tokenize(" ".join(kept))
    return ' '.join(
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(retokenized)
    )

# Replace every raw text with its preprocessed (lemmatized) form.
df["text"] = df["text"].apply(preprocess_text)
# Print the resulting frame for inspection.
print(df)
3.8 Identifying the POS of a word.
# Map the numeric class ids to readable emotion names in a single pass.
label_names = {
    0: 'Sadness',
    1: 'Joy',
    2: 'Love',
    3: 'Anger',
    4: 'Fear',
    5: 'Surprise',
}
df['label'] = df['label'].replace(label_names)

# Number of rows per emotion, most frequent first.
count = df['label'].value_counts()

# Create a figure with two side-by-side subplots
fig, axs = plt.subplots(1, 2, figsize=(12, 6), facecolor='white')

palette = sns.color_palette("viridis")
sns.set_palette(palette)

# Bar chart of absolute counts on the left subplot
sns.barplot(x=count.index, y=count.values, ax=axs[0], palette="viridis")
axs[0].set_title('Count of Categories')

# Pie chart of relative shares on the right subplot
axs[1].pie(count, labels=count.index, autopct='%1.1f%%', startangle=140)
axs[1].set_title('Distribution of Categories')

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()
3.9: Augmenting the Minority Class to Balance the Dataset
df.head()


def get_synonyms(word):
    """Return the distinct WordNet lemma names for *word*, excluding the word itself.

    Lemmas whose lower-cased name equals the lower-cased *word* are dropped.
    Underscores in the remaining names are replaced by spaces. The list comes
    from a set, so its order carries no meaning.
    """
    target = word.lower()
    return list({
        lemma.name().replace('_', ' ')
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
        if lemma.name().lower() != target
    })


# Quick look at what the lookup returns for a sample word.
get_synonyms('sad')

def augment_sentence(text, num_augments=2, target_words=None):
    """Create synonym-replaced variants of *text*.

    Args:
        text: Sentence whose whitespace-separated words may be replaced.
        num_augments: Number of augmented variants to generate.
        target_words: Optional collection of lower-case words that are
            eligible for replacement. ``None`` (the default) makes every
            word eligible.

    Returns:
        A list whose first element is the original *text*, followed by
        ``num_augments`` variants. In each variant every eligible word that
        has at least one synonym is replaced by a randomly chosen synonym;
        all other words are kept.
    """
    words = text.split()
    new_sentences = [text]
    # Eligibility used to be decided by `word.lower() in text`, a substring
    # test against the sentence the word came from. That never singled out
    # particular words, so eligibility is now an explicit, optional filter.
    synonym_cache = {}
    for _ in range(num_augments):
        new_words = []
        for word in words:
            if word not in synonym_cache:
                eligible = target_words is None or word.lower() in target_words
                # Look synonyms up once per distinct word, not once per variant.
                synonym_cache[word] = get_synonyms(word) if eligible else []
            options = synonym_cache[word]
            new_words.append(random.choice(options) if options else word)
        new_sentences.append(' '.join(new_words))
    return new_sentences
# Extra rows generated for the 'Surprise' class.
new_rows = []

df_surprise = df[df['label'] == 'Surprise'].copy()

for _, row in df_surprise.iterrows():
    # augment_sentence returns the original text as its first element; that
    # row is already in df, so only the generated variants are added.
    for sent in augment_sentence(row['text'])[1:]:
        new_rows.append({
            'text': sent,
            'label': 'Surprise',
        })

# Class distribution before and after adding the augmented rows.
print(df['label'].value_counts())
df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
print(df['label'].value_counts())
# For every row collect the original text plus two augmented variants, with
# the row's label repeated once per collected text.
augmented_texts = []
augmented_labels = []

for row_text, row_label in zip(df['text'], df['label']):
    variants = augment_sentence(row_text, num_augments=2)
    augmented_texts += variants
    augmented_labels += [row_label] * len(variants)
df.head()

# The module-level `tokens` list starts empty and nothing in the visible code
# fills it, so gather the tokens from the preprocessed text column here.
tokens = [tok for text in df['text'] for tok in text.split()]

# Keep the 7000 most frequent tokens; a value_counts index is already unique.
vc = pd.Series(tokens, dtype="object").value_counts().head(7000)
words = list(vc.index)
print(len(words))

# Ids start at 1 and follow frequency order.
word_to_index = {word: position for position, word in enumerate(words, start=1)}
# Next free id, kept under the name the original loop left behind.
index = len(words) + 1
print("Vocabulary:", word_to_index)

# 1️⃣  Build / extend the vocabulary
special_tokens = ["<PAD>", "<UNK>", "<SOS>", "<EOS>"]

# `sentences` must be a list of token lists. It is not defined anywhere in
# the visible code, so derive it from the space-joined text column.
sentences = [text.split() for text in df["text"]]

word_to_index = {}             # or load your existing dict here
for tok in special_tokens:      # specials go first so they get the lowest ids
    word_to_index.setdefault(tok, len(word_to_index))

# Corpus words get the next free id the first time they are seen
for sent in sentences:
    for word in sent:
        word_to_index.setdefault(word, len(word_to_index))

# 2️⃣  Keep the index of <UNK> handy
UNK_IDX = word_to_index["<UNK>"]

# 3️⃣  Robust mapping function
def sentence_to_sequence(sentence, word_to_index, unk_index=UNK_IDX):
    """Map each token of *sentence* to its id in *word_to_index*.

    Tokens that are absent from the mapping are encoded as *unk_index*.
    """
    lookup = word_to_index.get
    return [lookup(token, unk_index) for token in sentence]

# 4️⃣  Vectorise every sentence
sequences = [sentence_to_sequence(sent, word_to_index) for sent in sentences]
# Show the first ten encoded sentences as a sanity check.
print("Sequences:", sequences[:10])

# NOTE(review): clean_text is not defined in the visible part of this file —
# confirm it exists and returns an iterable of tokens.
all_words = []
for sentence in df['text']:
    all_words.extend(clean_text(sentence))

# Frequency of every distinct word; iteration follows first-occurrence order.
word_freq = Counter(all_words)

# Number the distinct words consecutively from 2. Enumerating all_words
# directly (repeats included) assigned each word the position of its last
# occurrence, leaving gaps and ids as large as the corpus length.
word2idx = {word: idx for idx, word in enumerate(word_freq, start=2)}

# Ids 0 and 1 are reserved for the padding and unknown-word tokens.
word2idx['<PAD>'] = 0
word2idx['<UNK>'] = 1

def text_to_sequence(text):
    """Encode *text* as a list of word2idx ids; unseen words get the '<UNK>' id."""
    unknown_id = word2idx['<UNK>']
    return [word2idx.get(token, unknown_id) for token in clean_text(text)]


# Store the id sequence of every text in a new 'sequence' column.
df['sequence'] = df['text'].apply(text_to_sequence)
df['sequence']

def pad_sequences(sequences, max_len):
    """Pack variable-length id sequences into a fixed-width integer array.

    Returns an array of shape ``(len(sequences), max_len)``. Each row holds
    the first ``max_len`` items of the corresponding sequence; shorter
    sequences are filled with zeros on the right.
    """
    result = np.zeros((len(sequences), max_len), dtype=int)
    for row, seq in zip(result, sequences):
        clipped = seq[:max_len]
        # `row` is a view into `result`, so this writes in place.
        row[:len(clipped)] = clipped
    return result

# Example: set max_len (you can use 50, 100, or based on your data)
max_len = 50  # try different values if needed

# Apply padding
# pad_sequences truncates sequences longer than max_len and zero-fills shorter ones.
padded_array = pad_sequences(df['sequence'], max_len)
padded_array
# One text bucket per emotion, plus one for rows whose label is unrecognised.
list_sadness = []
list_joy = []
list_love = []
list_anger = []
list_fear = []
list_surprise = []
list_unlabeled = []

# Numeric code of each emotion name. Earlier in this file the label column is
# rewritten from codes to names, so both forms have to be recognised.
_EMOTION_CODES = {
    'Sadness': 0,
    'Joy': 1,
    'Love': 2,
    'Anger': 3,
    'Fear': 4,
    'Surprise': 5,
}

# Destination bucket for each numeric code.
_EMOTION_BUCKETS = {
    0: list_sadness,
    1: list_joy,
    2: list_love,
    3: list_anger,
    4: list_fear,
    5: list_surprise,
}


def categorize_emotions(row):
    """Append ``row['text']`` to the bucket matching ``row['label']``.

    The label may be a numeric code (0-5) or the corresponding emotion name.
    Rows with any other label go to ``list_unlabeled``.

    Returns:
        *row* itself, unchanged, so the function can be used with
        ``DataFrame.apply(..., axis=1)``.
    """
    label = row['label']
    # Names are translated to their code; codes pass through untouched.
    code = _EMOTION_CODES.get(label, label)
    _EMOTION_BUCKETS.get(code, list_unlabeled).append(row['text'])
    return row
# Route every row's text into its per-emotion list; the function hands each
# row back as-is.
df = df.apply(categorize_emotions, axis=1)

# One frame per emotion, each tagged with that emotion's numeric code 0-5.
sadness, joy, love, anger, fear, surprise = (
    pd.DataFrame({'text': texts, 'label': code})
    for code, texts in enumerate(
        [list_sadness, list_joy, list_love, list_anger, list_fear, list_surprise]
    )
)

# Stack the class frames in code order under a fresh 0..n-1 index.
sorted_df = pd.concat(
    [sadness, joy, love, anger, fear, surprise],
    ignore_index=True,
)

print(sorted_df)
# Flat list of whitespace-split tokens for each emotion class.
sadness_tokens = []
joy_tokens = []
love_tokens = []
anger_tokens = []
fear_tokens = []
surprise_tokens = []


def _collect_tokens(row, bucket):
    """Split ``row['text']`` on whitespace, extend *bucket*, and return *row*."""
    bucket.extend(row['text'].split())
    return row


def tokenize_sadness(row):
    """Add the tokens of *row* to ``sadness_tokens``; return *row* unchanged."""
    return _collect_tokens(row, sadness_tokens)


def tokenize_joy(row):
    """Add the tokens of *row* to ``joy_tokens``; return *row* unchanged."""
    return _collect_tokens(row, joy_tokens)


def tokenize_love(row):
    """Add the tokens of *row* to ``love_tokens``; return *row* unchanged."""
    return _collect_tokens(row, love_tokens)


def tokenize_anger(row):
    """Add the tokens of *row* to ``anger_tokens``; return *row* unchanged."""
    return _collect_tokens(row, anger_tokens)


def tokenize_fear(row):
    """Add the tokens of *row* to ``fear_tokens``; return *row* unchanged."""
    return _collect_tokens(row, fear_tokens)


def tokenize_surprise(row):
    """Add the tokens of *row* to ``surprise_tokens``; return *row* unchanged."""
    return _collect_tokens(row, surprise_tokens)
# Run each class frame through its own collector, in class order. The
# collectors return rows unchanged; their effect is filling the token lists.
sadness, joy, love, anger, fear, surprise = (
    frame.apply(collector, axis=1)
    for frame, collector in (
        (sadness, tokenize_sadness),
        (joy, tokenize_joy),
        (love, tokenize_love),
        (anger, tokenize_anger),
        (fear, tokenize_fear),
        (surprise, tokenize_surprise),
    )
)

# Token frequency table for each class.
(
    sadness_words_vc,
    joy_words_vc,
    love_words_vc,
    anger_words_vc,
    fear_words_vc,
    surprise_words_vc,
) = (
    pd.Series(bucket).value_counts()
    for bucket in (
        sadness_tokens,
        joy_tokens,
        love_tokens,
        anger_tokens,
        fear_tokens,
        surprise_tokens,
    )
)
# Building blocks for a single sigmoid output trained with squared error.


def sigmoid(SOP):
    """Logistic function of the sum of products *SOP*."""
    return 1 / (1 + np.exp(-SOP))


def error(predicted, target):
    """Squared difference between *predicted* and *target*."""
    difference = predicted - target
    return np.power(difference, 2)


def error_predicted_derivative(predicted, target):
    """Derivative of the squared error with respect to *predicted*."""
    return (predicted - target) * 2


def activation_sop_derivative(SOP):
    """Derivative of the sigmoid with respect to *SOP*."""
    activation = sigmoid(SOP)
    return activation * (1 - activation)


def sop_w_deriv(x):
    """Derivative of the sum of products with respect to a weight: its input *x*."""
    return x


def update_weight(weight, learning_rate, grad):
    """Take one gradient-descent step: move *weight* against *grad*."""
    step = learning_rate * grad
    return weight - step

# Toy problem: a sigmoid neuron with 10 random inputs and 10 random weights
# is trained by gradient descent towards a fixed scalar target.
input_layer = np.random.rand(10)
input_size = len(input_layer)
weights = np.random.rand(input_size)
target = 0.3
learning_rate = 0.0005
x = float  # NOTE(review): binds the float type itself and is unused in the visible code
b = np.random.rand(1)  # NOTE(review): created but never used in the forward pass or updated
print(input_layer)
print("target: ", target)
predictions = []
errors = []
printed = 0

for i in range(80000):
    # Forward pass: weighted sum of the inputs through the sigmoid.
    sop = np.sum(weights * input_layer)
    predicted = sigmoid(sop)
    err = error(predicted, target)
    # Every weight shares the same upstream gradient; only d(SOP)/d(w_j) = x_j
    # differs, so the whole weight vector is updated in one array operation
    # instead of looping over the inputs in Python.
    grad = (
        error_predicted_derivative(predicted, target)
        * activation_sop_derivative(sop)
        * sop_w_deriv(input_layer)
    )
    weights = update_weight(weights, learning_rate, grad)
    predictions.append(predicted)
    errors.append(err)
    # Report only the first iteration at which the error drops below the threshold.
    if err < 0.0001 and not printed:
        print("Got the right answer after {} iterations.".format(i))
        printed = 1
        
    
# Draw one line chart per recorded series, each in its own figure:
# first the predictions, then the errors.
for series, title, y_label in (
    (predictions, "Predictions over Iterations", "Prediction"),
    (errors, "Error Rate over Iterations", "Error"),
):
    plt.figure()
    plt.plot(series)
    plt.title(title)
    plt.xlabel("Iteration")
    plt.ylabel(y_label)
    plt.grid(True)
    plt.show()