In [4]:
# imports
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

In [5]:
# Read the raw training CSV into a DataFrame.
# The file is decoded as ISO-8859-1 (Latin-1).
data_path = '../../data/sentiment-analysis/train.csv'
df = pd.read_csv(data_path, encoding='ISO-8859-1')

# Show the first few rows of the two columns used downstream.
preview = df.loc[:, ['text', 'sentiment']].head()
print("Sample data:")
print(preview)

# Overall size of the frame and the class balance of the target column.
print(f"\nDataset shape: {df.shape}")
print("\nSentiment counts:")
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

Sample data:
                                                text sentiment
0                I`d have responded, if I were going   neutral
1      Sooo SAD I will miss you here in San Diego!!!  negative
2                          my boss is bullying me...  negative
3                     what interview! leave me alone  negative
4   Sons of ****, why couldn`t they put them on t...  negative

Dataset shape: (27481, 10)

Sentiment counts:
sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64


In [6]:
# Text cleaning function
def clean_text(text):
    """Normalize a raw text value into lowercase a-z/0-9 tokens.

    Steps, in order: lowercase, strip URLs, map apostrophe look-alikes
    (backtick, acute accent, curly single quotes) to ``'``, drop every
    character that is not a-z, 0-9, whitespace or an apostrophe, then
    collapse runs of whitespace and trim.

    Args:
        text: Raw value to clean. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as missing.

    Returns:
        str: The cleaned text, or ``''`` when the input is missing or
        nothing survives cleaning.
    """
    # Missing values must not go through str(): that would produce the
    # literal words "none" / "nan", which look like real text downstream.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()  # Lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    # The data writes contractions with a backtick ("I`d", "couldn`t").
    # Unify such variants first so the whitelist below keeps them instead
    # of collapsing the words into "id" / "couldnt".
    text = re.sub('[`\u00b4\u2018\u2019]', "'", text)
    text = re.sub(r'[^a-z0-9\s\']', '', text)  # Keep letters, digits, spaces, apostrophes
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

In [7]:
# Apply cleaning.
# Rows whose raw text is missing are dropped first: passing NaN through
# str() would otherwise yield the literal string "nan", which is non-empty
# and would slip past the length filter below.
has_text = df['text'].notna()
df = df[has_text].assign(clean_text=lambda d: d['text'].apply(clean_text))

# Remove rows whose cleaned text is empty. The explicit .copy() makes the
# result an independent frame, so later column assignments on `df` do not
# raise SettingWithCopyWarning.
df = df[df['clean_text'].str.len() > 0].copy()

In [8]:
# Encode sentiment labels as integers: negative=0, neutral=1, positive=2.
label_map = {'positive': 2, 'neutral': 1, 'negative': 0}

# Keep only rows with a known sentiment, then attach the numeric label.
# .assign() returns a new frame, so the column is never written into a
# filtered slice of the previous frame (avoids SettingWithCopyWarning).
known_sentiment = df['sentiment'].isin(label_map.keys())
df = df[known_sentiment].assign(label=lambda d: d['sentiment'].map(label_map))

In [9]:
# Hold out 15% of the rows for validation. The split is stratified on the
# numeric label and seeded so it is repeatable (test_size is adjustable).
train_df, val_df = train_test_split(
    df,
    test_size=0.15,
    random_state=42,
    stratify=df['label'],
)

print(f"\nTraining samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")


Training samples: 23356
Validation samples: 4122


In [10]:
# Write both splits to CSV next to the raw data, without the index column.
for frame, out_path in (
    (train_df, '../../data/sentiment-analysis/processed_train.csv'),
    (val_df, '../../data/sentiment-analysis/processed_val.csv'),
):
    frame.to_csv(out_path, index=False)

print("Saved.")

Saved.
