In [1]:
!pip install pandas numpy nltk tensorflow scikit-learn



In [2]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
from google.colab import files
uploaded = files.upload()

Saving dataset.csv to dataset.csv


In [5]:
data = pd.read_csv(next(iter(uploaded)))

print("Dataset shape:", data.shape)
print(data.head())

Dataset shape: (6164, 2)
                                                text  label
0  the real reason why you're sad? you're attache...      1
1      my biggest problem is overthinking everything      1
2  the worst sadness is the sadness you've taught...      1
3  i cannot make you understand. i cannot make an...      1
4  i don't think anyone really understands how ti...      1


In [6]:
data = data[['text','label']]
print(data.head())

                                                text  label
0  the real reason why you're sad? you're attache...      1
1      my biggest problem is overthinking everything      1
2  the worst sadness is the sadness you've taught...      1
3  i cannot make you understand. i cannot make an...      1
4  i don't think anyone really understands how ti...      1


In [7]:
data.drop_duplicates(inplace=True)
data.dropna(subset=['text'], inplace=True)

print("After cleaning:", data.shape)

After cleaning: (5810, 2)


In [8]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)

    # Remove punctuation & special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Convert to lowercase
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]

    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return " ".join(tokens)

In [9]:
data['clean_text'] = data['text'].apply(preprocess_text)

print(data[['text','clean_text']].head())

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [10]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
data['clean_text'] = data['text'].apply(preprocess_text)
print(data[['text','clean_text']].head())

                                                text  \
0  the real reason why you're sad? you're attache...   
1      my biggest problem is overthinking everything   
2  the worst sadness is the sadness you've taught...   
3  i cannot make you understand. i cannot make an...   
4  i don't think anyone really understands how ti...   

                                          clean_text  
0  real reason your sad your attach peopl distant...  
1                  biggest problem overthink everyth  
2                     worst sad sad youv taught hide  
3  make understand make anyon understand happen i...  
4  dont think anyon realli understand tire act ok...  


In [12]:
data = data[data['clean_text'].str.strip() != ""]
print("After removing empty rows:", data.shape)

After removing empty rows: (5743, 3)


In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['clean_text'])

sequences = tokenizer.texts_to_sequences(data['clean_text'])

print("Example sequence:", sequences[0])

Example sequence: [225, 161, 28, 104, 28, 1038, 3, 2105, 28, 353, 569, 3, 276, 13, 21, 3, 135, 28, 23, 3, 23, 215, 39, 66, 3, 11]


In [14]:
max_length = 100  # can adjust based on dataset

padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post'
)

print("Shape after padding:", padded_sequences.shape)

Shape after padding: (5743, 100)


In [15]:
X = padded_sequences
y = data['label']

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (5743, 100)
y shape: (5743,)


In [16]:
print("Dataset shape:", data.shape)
print("Sample cleaned text:", data['clean_text'].iloc[0])
print("Padded shape:", padded_sequences.shape)
print("Label distribution:\n", data['label'].value_counts())

Dataset shape: (5743, 3)
Sample cleaned text: real reason your sad your attach peopl distant your pay attent peopl ignor make time peopl busi your care peopl care less come let peopl go
Padded shape: (5743, 100)
Label distribution:
 label
0    2961
1    2782
Name: count, dtype: int64


In [17]:
print("Rows removed:", 6164 - len(data))

Rows removed: 421


In [18]:
min_count = min(data['label'].value_counts())

balanced_data = pd.concat([
    data[data['label'] == 0].sample(min_count, random_state=42),
    data[data['label'] == 1].sample(min_count, random_state=42)
])

balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

print("Balanced shape:", balanced_data.shape)
print("Balanced distribution:\n", balanced_data['label'].value_counts())

Balanced shape: (5564, 3)
Balanced distribution:
 label
0    2782
1    2782
Name: count, dtype: int64


In [19]:
print(balanced_data['clean_text'].isnull().sum())

0


In [20]:
print((balanced_data['clean_text'].str.strip() == "").sum())

0


In [21]:
print(balanced_data['label'].value_counts())

label
0    2782
1    2782
Name: count, dtype: int64


In [22]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(balanced_data['clean_text'])

# Vocabulary size
print("Vocabulary size:", len(tokenizer.word_index))

Vocabulary size: 5231


In [23]:
sequences = tokenizer.texts_to_sequences(balanced_data['clean_text'])

print("Example sequence:", sequences[0])

Example sequence: [1007, 1102, 173, 65, 5, 4, 76, 273, 2070, 927, 322, 53, 855]


In [24]:
max_length = 100   # same length for all sentences

padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post'
)

print("Shape after padding:", padded_sequences.shape)

Shape after padding: (5564, 100)


In [25]:
X = padded_sequences
y = balanced_data['label']

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Label distribution:\n", y.value_counts())

X shape: (5564, 100)
y shape: (5564,)
Label distribution:
 label
0    2782
1    2782
Name: count, dtype: int64


In [26]:
print("Any null in X:", np.isnan(X).any())
print("Any null in y:", y.isnull().sum())

Any null in X: False
Any null in y: 0
