In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split

In [15]:
# Load the raw tweets with the fastparquet engine and preview the first rows.
raw_tweets_path = '../data/raw/Many_Tweets.parquet'
tweets = pd.read_parquet(raw_tweets_path, engine='fastparquet')
tweets.head()

Unnamed: 0,Text,Label
0,is upset that he can't update his Facebook by ...,negative
1,@Kenichan I dived many times for the ball. Man...,negative
2,my whole body feels itchy and like its on fire,negative
3,"@nationwideclass no, it's not behaving at all....",negative
4,@Kwesidei not the whole crew,negative


In [16]:
# Class balance of the raw data: count the rows for each distinct value of
# 'Label' (value_counts lists the most frequent label first).
tweets['Label'].value_counts()

Label
positive    1048516
negative    1044145
neutral      198586
Name: count, dtype: int64

In [17]:
# Build a class-balanced subset of `tweets` by undersampling every label to
# (at most) an equal share of the target size, then shuffle the result.

# Desired total number of rows across all classes.
# NOTE: the name is misspelled ("target"); it is left unchanged here because
# cells outside this one may reference it.
targuet_total = 200_000

# Compute the distinct labels once and reuse them for the count and the loop.
class_labels = tweets['Label'].unique()
num_classes = len(class_labels)
samples_per_class = targuet_total // num_classes

# Collect the per-class samples in a list and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop copies all accumulated
# rows on every iteration, and starting from an empty pd.DataFrame() relies on
# pandas ignoring empty frames when resolving dtypes.
sampled_parts = []
for label in class_labels:
    df_label = tweets[tweets['Label'] == label]
    # A class with fewer rows than the quota contributes everything it has.
    n_samples = min(samples_per_class, len(df_label))
    df_downsampled = df_label.sample(n=n_samples, random_state=8)
    sampled_parts.append(df_downsampled)

# Shuffle so the classes are interleaved, and discard the original row index.
undersampled = (
    pd.concat(sampled_parts)
    .sample(frac=1, random_state=8)
    .reset_index(drop=True)
)

In [18]:
# Data Split
# Hold out 20% of the balanced data as the test set, then take 20% of the
# remaining rows as the validation set; what is left is the training set.
features, targets = undersampled['Text'], undersampled['Label']

X_train_val, X_test, y_train_val, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=8
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=8
)

# Recombine text and label into one frame per split, each with a fresh
# 0..n-1 index.
train, val, test = (
    pd.concat(pair, axis=1).reset_index(drop=True)
    for pair in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [19]:
# Exploratory Analysis
# Print the per-column count of missing values for each split, in the order
# train, test, val.
for split in (train, test, val):
    print(split.isnull().sum())

Text     0
Label    0
dtype: int64
Text     0
Label    0
dtype: int64
Text     0
Label    0
dtype: int64


In [20]:
# Cleaning null values
# Discard every row that has a missing value in any column, split by split.
train, test, val = (split.dropna() for split in (train, test, val))

In [21]:
# Text Cleaning
import sys
import os

# Put the parent of the current working directory at the front of the import
# path, unless it is already listed there.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
already_importable = project_root in sys.path
if not already_importable:
    sys.path.insert(0, project_root)

from src.data.preprocessing import process_dataset_parallel

In [22]:
# Clean the 'Text' column of the training split with the project's
# process_dataset_parallel helper (imported from src.data.preprocessing).
# NOTE(review): the helper's return type is not visible here — presumably one
# cleaned text per input row, in row order; confirm before relying on it.
training_clean = process_dataset_parallel(train,'Text')

  0%|          | 0/127998 [00:00<?, ?it/s]

100%|██████████| 127998/127998 [00:06<00:00, 19046.39it/s]


In [23]:
# Clean the 'Text' column of the test split with the project's
# process_dataset_parallel helper (imported from src.data.preprocessing).
# NOTE(review): the helper's return type is not visible here — presumably one
# cleaned text per input row, in row order; confirm before relying on it.
test_clean = process_dataset_parallel(test,'Text')

100%|██████████| 40000/40000 [00:02<00:00, 17078.54it/s]


In [24]:
# Clean the 'Text' column of the validation split with the project's
# process_dataset_parallel helper (imported from src.data.preprocessing).
# NOTE(review): the helper's return type is not visible here — presumably one
# cleaned text per input row, in row order; confirm before relying on it.
val_clean = process_dataset_parallel(val,'Text')

100%|██████████| 32000/32000 [00:01<00:00, 21751.38it/s]


In [25]:
# Replace the raw text of each split with the cleaned text computed for it.
train['Text'] = training_clean
test['Text'] = test_clean
val['Text'] = val_clean

# Integer encoding of the sentiment classes.
labels_mapped = {
    'negative':0,
    'positive':1,
    'neutral':2
}


def _encode_labels(labels, mapping):
    """Map string labels to integer ids, failing loudly on unknown values.

    ``Series.map`` turns every value that has no entry in ``mapping`` into
    NaN. Without a check those NaN labels would be written to disk silently,
    for example when this cell is executed a second time and the labels have
    already been converted to integers.

    Parameters
    ----------
    labels : pandas.Series
        Label column to encode.
    mapping : dict
        Lookup from original label value to integer id.

    Returns
    -------
    pandas.Series
        The encoded labels with dtype ``int64`` and the index of ``labels``.

    Raises
    ------
    ValueError
        If any value of ``labels`` has no entry in ``mapping``.
    """
    encoded = labels.map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        raise ValueError(
            f"Labels without an entry in the mapping: {labels[unmapped].unique().tolist()}"
        )
    return encoded.astype('int64')


# Map class labels
train['Label'] = _encode_labels(train['Label'], labels_mapped)
test['Label'] = _encode_labels(test['Label'], labels_mapped)
val['Label'] = _encode_labels(val['Label'], labels_mapped)

# Export the processed splits (the DataFrame index is not written).
train.to_parquet('../data/processed/clean_train_tweets.parquet',index=False)
test.to_parquet('../data/processed/clean_test_tweets.parquet',index=False)
val.to_parquet('../data/processed/clean_val_tweets.parquet',index=False)