In [1]:
# Imports
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

In [2]:
# Read the raw training CSV (decoded as ISO-8859-1), keep only the first
# occurrence of each distinct `text`, then discard rows where either
# `text` or `sentiment` is missing.
data_path = '../../data/sentiment-analysis/train.csv'
df = (
    pd.read_csv(data_path, encoding='ISO-8859-1')
    .drop_duplicates(subset=['text'])
    .dropna(subset=['text', 'sentiment'])
)

# Show the first rows of the two columns used by the rest of the notebook.
preview = df.loc[:, ['text', 'sentiment']].head()
print("Sample data:")
print(preview)

Sample data:
                                                text sentiment
0                I`d have responded, if I were going   neutral
1      Sooo SAD I will miss you here in San Diego!!!  negative
2                          my boss is bullying me...  negative
3                     what interview! leave me alone  negative
4   Sons of ****, why couldn`t they put them on t...  negative


In [3]:
# Text cleaning
def clean_text(text):
    """Normalise one raw text value into a lowercase, punctuation-free string.

    Steps, in order:
      1. Missing values (``None`` or a float NaN) yield ``''`` instead of
         being stringified into the spurious tokens ``'none'`` / ``'nan'``.
      2. The value is converted with ``str()`` and lowercased.
      3. ``http(s)://...`` and ``www....`` URLs are removed.
      4. Apostrophe look-alikes (backtick, acute accent, curly single
         quotes) are rewritten to ``'``.
      5. Every character other than ``a-z``, ``0-9``, whitespace and ``'``
         is dropped.
      6. Whitespace runs collapse to a single space and the ends are trimmed.

    Args:
        text: The raw value; any object accepted by ``str()``.

    Returns:
        The cleaned string, which may be empty.
    """
    # `text != text` is only true for NaN; the isinstance guard keeps the
    # comparison away from objects with non-boolean inequality results.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # The raw data writes contractions with a backtick (e.g. "I`d",
    # "couldn`t"). Without this step the filter below would strip the
    # backtick and fuse them into "id" / "couldnt".
    text = re.sub(r"[`\u00b4\u2018\u2019]", "'", text)
    text = re.sub(r"[^a-z0-9\s']", '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Store the normalised text in a new column, then keep only the rows whose
# cleaned text is non-empty.
df['clean_text'] = df['text'].map(clean_text)
has_content = df['clean_text'].str.len().gt(0)
df = df.loc[has_content]

In [4]:
# Map each sentiment name to an integer class id. Rows whose sentiment is not
# one of the three known names are dropped before the `label` column is added.
label_map = dict(positive=2, neutral=1, negative=0)
known_sentiment = df['sentiment'].isin(list(label_map))
df = df.loc[known_sentiment].assign(
    label=lambda frame: frame['sentiment'].map(label_map)
)

In [5]:
# Hold out 15% of the rows for validation. Stratifying on `label` keeps the
# class proportions the same in both parts; the fixed seed makes the split
# repeatable.
split_options = dict(test_size=0.15, stratify=df['label'], random_state=42)
train_df, val_df = train_test_split(df, **split_options)

# Report the size and per-class row counts of each part.
train_label_counts = train_df['label'].value_counts()
val_label_counts = val_df['label'].value_counts()
print("Training samples:", train_df.shape[0])
print("Validation samples:", val_df.shape[0])
print("Train label counts:\n", train_label_counts)
print("Val label counts:\n", val_label_counts)

Training samples: 23355
Validation samples: 4122
Train label counts:
 label
1    9448
2    7294
0    6613
Name: count, dtype: int64
Val label counts:
 label
1    1667
2    1288
0    1167
Name: count, dtype: int64


In [6]:
# Stratified split -- sanity checks
# The split itself is done once, in the preceding cell. Calling
# train_test_split again here with the same arguments and seed would only
# rebind train_df / val_df to an identical split, so this cell verifies the
# existing split instead of recomputing it.
assert len(train_df) + len(val_df) == len(df), "split does not cover every row exactly once"
assert train_df.index.intersection(val_df.index).empty, "train and validation rows overlap"
assert set(train_df['label']) == set(val_df['label']) == set(df['label']), \
    "a class is missing from one side of the split"

# Same report as before, so the recorded output stays valid.
print("Training samples:", len(train_df))
print("Validation samples:", len(val_df))
print("Train label counts:\n", train_df['label'].value_counts())
print("Val label counts:\n", val_df['label'].value_counts())

Training samples: 23355
Validation samples: 4122
Train label counts:
 label
1    9448
2    7294
0    6613
Name: count, dtype: int64
Val label counts:
 label
1    1667
2    1288
0    1167
Name: count, dtype: int64


In [7]:
# Write each part of the split to its own CSV, without the index column.
for frame, out_path in (
    (train_df, '../../data/sentiment-analysis/processed_train.csv'),
    (val_df, '../../data/sentiment-analysis/processed_val.csv'),
):
    frame.to_csv(out_path, index=False)
print("Saved.")

Saved.
