# Sentiment Analysis Dataset Cleaning

## Prepare and Inspect Dataset

### Preparation and Dataset Loading

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the CSV paths in the loading cell are relative rather than
# under /content/drive -- confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Library, Framework ...
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Load the raw train and test splits as UTF-8 CSV files.
# The paths are relative, so both files must be in the current working directory.
trainDF = pd.read_csv("sentiment_analysis_train.csv", encoding='utf-8')
testDF = pd.read_csv("sentiment_analysis_test.csv", encoding='utf-8')

In [None]:
# Preview up to the first 18 rows of the raw training set.
trainDF.head(18)

Unnamed: 0,text,labels,preds,feedback,retrain_labels,retrained_preds
0,that might sound silly but if i havent pre-ord...,0,,,,
1,googledofhey - i have no idea where else to tu...,1,,,,
2,i just achieved the mana sponge achievement,1,,,,
3,i go through this often when in a high up loca...,0,,,,
4,mr paul christoph in sandrock best pubg teamma...,2,,,,
5,inscreen please dont pull us out of the game f...,2,,,,
6,best gaming pc build under 50k pubg 80fps gt...,1,,,,
7,this really hits home thanks for sharing somet...,0,,,,
8,internshipunk,1,,,,
9,its definitely ocd the simple fact its bugging...,0,,,,


In [None]:
# Preview up to the first 18 rows of the raw test set.
testDF.head(18)

Unnamed: 0,text,labels,preds,feedback,retrain_labels,retrained_preds
0,2 dead redemption 2 shit posts saying things m...,0,,,,
1,dont take it lightly sending e-mails do not ta...,1,,,,
2,yeah it was a relief because before i was conv...,0,,,,
3,lets shoot some football,2,,,,
4,punk 2077 is postponed itt 2fymgr7,1,,,,
5,phenomenal ending 22,2,,,,
6,i thought our generation had agreed to make fu...,1,,,,
7,rainbow6game can you fucks lower the ranked re...,0,,,,
8,rt report walking around india is the worst thing,1,,,,
9,everytime google is sued i am being robbed bli...,0,,,,


In [None]:
# Show column dtypes and non-null counts for the raw training set.
trainDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   text             79917 non-null  object 
 1   labels           80000 non-null  int64  
 2   preds            0 non-null      float64
 3   feedback         0 non-null      float64
 4   retrain_labels   0 non-null      float64
 5   retrained_preds  0 non-null      float64
dtypes: float64(4), int64(1), object(1)
memory usage: 3.7+ MB


In [None]:
# Show column dtypes and non-null counts for the raw test set.
testDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   text             19976 non-null  object 
 1   labels           20000 non-null  int64  
 2   preds            0 non-null      float64
 3   feedback         0 non-null      float64
 4   retrain_labels   0 non-null      float64
 5   retrained_preds  0 non-null      float64
dtypes: float64(4), int64(1), object(1)
memory usage: 937.6+ KB


### Check for missing values

In [None]:
def report_missing(df, name):
    """Print the missing-value counts of the ``text`` and ``labels`` columns.

    Args:
        df: DataFrame that contains ``text`` and ``labels`` columns.
        name: Dataset label used in the printed heading.

    Returns:
        None. A heading, the per-column NaN counts and the total row count
        are written to stdout.
    """
    print(f"\n{name} missing values:")
    key_columns = df.loc[:, ["text", "labels"]]
    print(key_columns.isna().sum())
    print(f"Total rows: {len(df)}")

# Report missing text/labels for both raw splits before any cleaning.
report_missing(trainDF, "Train")
report_missing(testDF, "Test")


Train missing values:
text      83
labels     0
dtype: int64
Total rows: 80000

Test missing values:
text      24
labels     0
dtype: int64
Total rows: 20000


### Check for duplicates

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
# NOTE(review): these duplicates are only reported; no cell visible in this
# notebook drops them -- confirm that keeping them is intentional.
print(f"Number of duplication in training set:{trainDF.duplicated().sum()}")
print(f"Number of duplication in testing set:{testDF.duplicated().sum()}")

Number of duplication in training set:4505
Number of duplication in testing set:585


### Check for broken Unicode

In [None]:
# Single characters this notebook treats as signs of broken Unicode.
# The list is joined into ONE regex character class below, so every entry must
# be a single character and every entry needs its own trailing comma.
# NOTE(review): letters such as "â", "Ã" or "Å" are matched wherever they
# occur -- confirm that is acceptable if genuine accented text is expected.
SUSPICIOUS_CHARS = [
    "â", "ð", "Ÿ", "¤",
    # The comma after "Æ" was missing, so Python silently concatenated it with
    # the next literal into the two-character entry "Æ¢".
    "Â", "Ã", "Å", "Æ",
    "¢", "©", "®", "™",
    # NOTE(review): this entry is an empty string and contributes nothing to
    # the pattern. If it was meant to hold an invisible/control character,
    # write it as an explicit escape sequence instead.
    "",
]

# Character class matching any single suspicious character.
BAD_CHAR_RE = re.compile(
    "[" + re.escape("".join(SUSPICIOUS_CHARS)) + "]"
)

# Flag training rows whose text contains at least one suspicious character.
# astype(str) turns missing text into the string "nan", which never matches.
trainDF["has_bad_chars"] = trainDF["text"].astype(str).apply(
    lambda x: bool(BAD_CHAR_RE.search(x))
)

bad_count = trainDF["has_bad_chars"].sum()
total = len(trainDF)

print(f"Bad rows: {bad_count} / {total} ({bad_count / total * 100:.2f}%)")

# Show up to 10 flagged examples. The sample size is capped at the number of
# flagged rows because sample() raises ValueError when asked for more rows
# than are available.
trainDF.loc[trainDF["has_bad_chars"], "text"].sample(
    n=min(10, int(bad_count)), random_state=42
).tolist()


Bad rows: 53 / 80000 (0.07%)


['enjoy the dystopian corporatocratic future with the refreshing taste our rockstar™ energy',
 'battlefield ™ v - not doing this again - stationary streak youtube  uiya4t5hqzc via  youtube',
 'exciting news been finally upgrading my gaming pc intel core™ i9-9900kf optimus prime z390-a 32 gigs intel gskill tridentz ddr4 4266 leading the way will take around a month to pull everything in but im really hyped eyeballing those 30 series nvidia cards vr awesome new accessories too',
 'tylenol cold head line congestion area severe tylenol com  health products  tylen and … copyright © mcneil 2010 consumer public healthcare mcneil - ppc inc © 2017 johnson  johnson publications consumer hospital inc all rights reserved tylenol',
 'exciting news i am finally upgrading my gaming pc intel core ™ i9-9900kf asus prime z390-a 32 gigs gskill tridentz ddr4 4266 on the road will take over a month to get it all in but i am really hyped',
 'ocd attacks the things we love the most to make us feel crazy its 

In [None]:
# Flag test rows whose text contains at least one character matched by
# BAD_CHAR_RE (defined in the training-set detection cell).
testDF["has_bad_chars"] = testDF["text"].astype(str).apply(
    lambda x: bool(BAD_CHAR_RE.search(x))
)

bad_count = testDF["has_bad_chars"].sum()
total = len(testDF)

print(f"Bad rows: {bad_count} / {total} ({bad_count / total * 100:.2f}%)")

# Show up to 10 flagged examples. The sample size is capped at the number of
# flagged rows because sample() raises ValueError when asked for more rows
# than are available.
testDF.loc[testDF["has_bad_chars"], "text"].sample(
    n=min(10, int(bad_count)), random_state=42
).tolist()

Bad rows: 12 / 20000 (0.06%)


['minnie in red opened 1 achievement in battlefield ™ v and increased their scoring from 30 to 225 in that game and 7325 in total',
 'minnie in red unlocked 1 achievement in battlefield™ v and increased their gamerscore with 30 to 225 in that game and 7325 total',
 'starting your ai journey with a solid partnership hpe apollo wekafs™ and nvidia gds dysidj3my',
 'battlefield v fans who never failed in history are simply mad that the modern game has women in play it © theverge com  2018  5  24  1738 … yours via verge okay i know this is a nice two year old tweet buts thats really dumb because wanda gertz was a polish looking soldier and this scene obviously has nothing to do with poland its british',
 'enjoy the dystopian corporatocratic future with the refreshing taste of rockstar™ energy',
 'minnie in red unlocked first achievement in battlefield™ v and increased their power with 30 to 225 in that game and 7325 total',
 'enjoy the dystopian corporate future with the refreshing taste of

## Data Cleaning

### Drop unused columns

In [None]:
# Remove the prediction/feedback columns, which info() reports as entirely
# null in both splits.
trainDF = trainDF.drop(
    ['preds', 'feedback', 'retrain_labels', 'retrained_preds'],
    axis="columns",
)
testDF = testDF.drop(
    ['preds', 'feedback', 'retrain_labels', 'retrained_preds'],
    axis="columns",
)

In [None]:
# Preview the training set after dropping the unused columns.
trainDF.head(5)

Unnamed: 0,text,labels,has_bad_chars
0,that might sound silly but if i havent pre-ord...,0,False
1,googledofhey - i have no idea where else to tu...,1,False
2,i just achieved the mana sponge achievement,1,False
3,i go through this often when in a high up loca...,0,False
4,mr paul christoph in sandrock best pubg teamma...,2,False


In [None]:
# Preview the test set after dropping the unused columns.
testDF.head(5)

Unnamed: 0,text,labels,has_bad_chars
0,2 dead redemption 2 shit posts saying things m...,0,False
1,dont take it lightly sending e-mails do not ta...,1,False
2,yeah it was a relief because before i was conv...,0,False
3,lets shoot some football,2,False
4,punk 2077 is postponed itt 2fymgr7,1,False


### Missing Value Cleaning

In [None]:
def clean_missing(df):
    """Return a copy of ``df`` without rows that lack usable text or a label.

    A row is removed when ``text`` or ``labels`` is NaN, or when ``text`` is
    empty or whitespace-only. ``text`` is converted to ``str`` in the result.
    The input frame is left untouched and the result gets a fresh 0..n-1 index.

    Args:
        df: DataFrame that contains ``text`` and ``labels`` columns.

    Returns:
        The filtered DataFrame.
    """
    present = (
        df.dropna(subset=["text", "labels"])
        .assign(text=lambda frame: frame["text"].astype(str))
    )

    # Keep only rows whose text still has content once whitespace is stripped.
    has_content = present["text"].str.strip().ne("")
    return present.loc[has_content].reset_index(drop=True)

# Replace both frames with versions that have no missing or blank text rows.
trainDF = clean_missing(trainDF)
testDF = clean_missing(testDF)

In [None]:
# Re-run the missing-value report to confirm the cleaning removed every gap.
report_missing(trainDF, "Train (after cleaning)")
report_missing(testDF, "Test (after cleaning)")


Train (after cleaning) missing values:
text      0
labels    0
dtype: int64
Total rows: 79917

Test (after cleaning) missing values:
text      0
labels    0
dtype: int64
Total rows: 19976


### Broken Unicode Cleaning

In [None]:
def clean_bad_chars(text):
    """Strip characters matched by ``BAD_CHAR_RE`` and tidy the whitespace.

    Args:
        text: Value to clean. Anything that is not a ``str`` is returned
            unchanged.

    Returns:
        The text with every ``BAD_CHAR_RE`` match deleted, each run of
        whitespace collapsed to a single space, and leading/trailing
        whitespace removed.
    """
    if isinstance(text, str):
        # Delete the flagged characters outright (they are not replaced by a space).
        without_bad = BAD_CHAR_RE.sub("", text)
        # Deletions can leave doubled or dangling spaces; normalise them.
        return re.sub(r"\s+", " ", without_bad).strip()

    return text

# Apply the character cleanup to every text value in both splits.
trainDF["text"] = trainDF["text"].astype(str).apply(clean_bad_chars)
testDF["text"]  = testDF["text"].astype(str).apply(clean_bad_chars)

In [None]:
# Re-run the detection on the cleaned training text; the count should now be 0.
trainDF["has_bad_chars"] = [
    BAD_CHAR_RE.search(value) is not None
    for value in trainDF["text"].astype(str)
]

total = len(trainDF)
bad_count = trainDF["has_bad_chars"].sum()

print(f"Bad rows: {bad_count} / {total} ({bad_count / total * 100:.2f}%)")

Bad rows: 0 / 79917 (0.00%)


In [None]:
# Re-run the detection on the cleaned test text; the count should now be 0.
testDF["has_bad_chars"] = [
    BAD_CHAR_RE.search(value) is not None
    for value in testDF["text"].astype(str)
]

total = len(testDF)
bad_count = testDF["has_bad_chars"].sum()

print(f"Bad rows: {bad_count} / {total} ({bad_count / total * 100:.2f}%)")

Bad rows: 0 / 19976 (0.00%)


In [None]:
# Preview the training set after the character cleanup.
trainDF.head(18)

Unnamed: 0,text,labels,has_bad_chars
0,that might sound silly but if i havent pre-ord...,0,False
1,googledofhey - i have no idea where else to tu...,1,False
2,i just achieved the mana sponge achievement,1,False
3,i go through this often when in a high up loca...,0,False
4,mr paul christoph in sandrock best pubg teamma...,2,False
5,inscreen please dont pull us out of the game f...,2,False
6,best gaming pc build under 50k pubg 80fps gta ...,1,False
7,this really hits home thanks for sharing somet...,0,False
8,internshipunk,1,False
9,its definitely ocd the simple fact its bugging...,0,False


In [None]:
# Preview the test set after the character cleanup.
testDF.head(18)

Unnamed: 0,text,labels,has_bad_chars
0,2 dead redemption 2 shit posts saying things m...,0,False
1,dont take it lightly sending e-mails do not ta...,1,False
2,yeah it was a relief because before i was conv...,0,False
3,lets shoot some football,2,False
4,punk 2077 is postponed itt 2fymgr7,1,False
5,phenomenal ending 22,2,False
6,i thought our generation had agreed to make fu...,1,False
7,rainbow6game can you fucks lower the ranked re...,0,False
8,rt report walking around india is the worst thing,1,False
9,everytime google is sued i am being robbed bli...,0,False


### Drop too-short and too-long rows (too short: fewer than 2 characters; too long: more than 999 characters)

In [None]:
# The helper flag column has served its purpose; remove it before export.
trainDF = trainDF.drop(columns=['has_bad_chars'])
testDF = testDF.drop(columns=['has_bad_chars'])

def filter_by_text_length(df, text_col="text", min_len=2, max_len=999):
    """Keep only rows whose text length lies within ``[min_len, max_len]``.

    Args:
        df: Input DataFrame; it is not modified.
        text_col: Name of the column holding the text.
        min_len: Smallest length (inclusive) a text may have to be kept.
        max_len: Largest length (inclusive) a text may have to be kept.

    Returns:
        A new DataFrame with ``text_col`` converted to ``str``, out-of-range
        rows removed and the index reset to 0..n-1.
    """
    result = df.copy()
    result[text_col] = result[text_col].astype(str)

    # between() is inclusive on both ends, matching the documented bounds.
    within_bounds = result[text_col].str.len().between(min_len, max_len)

    return result[within_bounds].reset_index(drop=True)

# Remember the row counts so the number of dropped rows can be reported.
before_train = len(trainDF)
before_test = len(testDF)

# Apply the default bounds (2 to 999 characters, inclusive) to both splits.
trainDF = filter_by_text_length(trainDF)
testDF  = filter_by_text_length(testDF)

print(f"Train dropped: {before_train - len(trainDF)} rows")
print(f"Test dropped: {before_test - len(testDF)} rows")


Train dropped: 674 rows
Test dropped: 160 rows


## Results for the transformed dataset

In [None]:
# Preview the fully cleaned training set.
trainDF.head(20)

Unnamed: 0,text,labels
0,that might sound silly but if i havent pre-ord...,0
1,googledofhey - i have no idea where else to tu...,1
2,i just achieved the mana sponge achievement,1
3,i go through this often when in a high up loca...,0
4,mr paul christoph in sandrock best pubg teamma...,2
5,inscreen please dont pull us out of the game f...,2
6,best gaming pc build under 50k pubg 80fps gta ...,1
7,this really hits home thanks for sharing somet...,0
8,internshipunk,1
9,i played a game where a player has to go afk f...,1


In [None]:
# Preview the fully cleaned test set.
testDF.head(20)

Unnamed: 0,text,labels
0,2 dead redemption 2 shit posts saying things m...,0
1,dont take it lightly sending e-mails do not ta...,1
2,yeah it was a relief because before i was conv...,0
3,lets shoot some football,2
4,punk 2077 is postponed itt 2fymgr7,1
5,phenomenal ending 22,2
6,i thought our generation had agreed to make fu...,1
7,rainbow6game can you fucks lower the ranked re...,0
8,rt report walking around india is the worst thing,1
9,everytime google is sued i am being robbed bli...,0


## Download the cleaned dataset

In [None]:
def final_sanity_check(df, name):
    """Print a short pre-export summary of the ``text`` column.

    Args:
        df: DataFrame that contains a ``text`` column of strings.
        name: Dataset label used in the printed heading.

    Returns:
        None. The row count, the number of empty/whitespace-only texts and
        the number of missing texts are written to stdout.
    """
    heading = f"\n{name} sanity check"
    divider = "-" * 30
    print(heading)
    print(divider)
    print("Rows:", len(df))

    text_values = df["text"]
    print("Empty text rows:", text_values.str.strip().eq("").sum())
    print("Missing text:", text_values.isna().sum())

# Run the pre-export summary for both cleaned splits.
final_sanity_check(trainDF, "Train")
final_sanity_check(testDF, "Test")


Train sanity check
------------------------------
Rows: 79243
Empty text rows: 0
Missing text: 0

Test sanity check
------------------------------
Rows: 19816
Empty text rows: 0
Missing text: 0


In [None]:
# Write the cleaned splits as plain UTF-8 CSV files, without the index column.
train_path = "train_cleaned.csv"
test_path = "test_cleaned.csv"

trainDF.to_csv(train_path, encoding="utf-8", index=False)
testDF.to_csv(test_path, encoding="utf-8", index=False)

print("Saved:", train_path, test_path, sep="\n")


Saved:
train_cleaned.csv
test_cleaned.csv


In [None]:
# Download the cleaned CSV files from the Colab runtime through the browser.
from google.colab import files

files.download("train_cleaned.csv")
files.download("test_cleaned.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Download for Excel

In [None]:
# Write Excel-oriented copies: the "utf-8-sig" codec prepends a byte-order
# mark to the UTF-8 output.
train_path = "train_cleaned_excel.csv"
test_path = "test_cleaned_excel.csv"

trainDF.to_csv(train_path, encoding="utf-8-sig", index=False)
testDF.to_csv(test_path, encoding="utf-8-sig", index=False)

print("Saved:", train_path, test_path, sep="\n")

Saved:
train_cleaned_excel.csv
test_cleaned_excel.csv


In [None]:
# Download the Excel-oriented CSV files from the Colab runtime through the browser.
from google.colab import files

files.download("train_cleaned_excel.csv")
files.download("test_cleaned_excel.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Download trial data

In [None]:
# Small trial subsets: the first 100 rows of each cleaned split, in their
# existing order (this is not a random or stratified sample).
train_trial = trainDF.head(100)
test_trial  = testDF.head(100)

In [None]:
# Write the 100-row trial subsets as UTF-8 CSV files, without the index column.
train_trial_path = "train_trial_100.csv"
test_trial_path  = "test_trial_100.csv"

train_trial.to_csv(train_trial_path, index=False, encoding="utf-8")
test_trial.to_csv(test_trial_path, index=False, encoding="utf-8")

print("Saved trial files:")
print(train_trial_path)
print(test_trial_path)

# Confirm the expected (rows, columns) shape of each subset.
print(train_trial.shape)
print(test_trial.shape)

# Show how many rows each label has in each trial subset.
print(train_trial["labels"].value_counts())
print(test_trial["labels"].value_counts())

Saved trial files:
train_trial_100.csv
test_trial_100.csv
(100, 2)
(100, 2)
labels
1    41
2    30
0    29
Name: count, dtype: int64
labels
0    42
2    30
1    28
Name: count, dtype: int64


In [None]:
# Download the trial CSV files from the Colab runtime through the browser.
from google.colab import files

files.download("train_trial_100.csv")
files.download("test_trial_100.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>