In [1]:
import pandas as pd

# The file is read with header=None, so the six column labels are supplied here.
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Load the CSV, decoding its bytes as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    'sentiment_140 Dataset.csv',
    names=column_names,
    header=None,
    encoding='ISO-8859-1',
)


In [2]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1600000, 6)

In [3]:
# Show the number of missing values in each column ...
print(df.isna().sum())
# ... then keep only the rows that have no missing value in any column.
df = df.dropna()

target    0
ids       0
date      0
flag      0
user      0
text      0
dtype: int64


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,target,ids
count,1600000.0,1600000.0
mean,2.0,1998818000.0
std,2.000001,193576100.0
min,0.0,1467810000.0
25%,0.0,1956916000.0
50%,2.0,2002102000.0
75%,4.0,2177059000.0
max,4.0,2329206000.0


In [5]:
# Preview the first five rows of the frame.
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
# Lowercase every value in the 'text' column.
# The column is overwritten, so the original mixed-case text is no longer
# available in df after this cell runs.
df['text'] = df['text'].str.lower()


In [7]:
# Spot-check the first rows of 'text' after lowercasing.
print(df['text'].head())


0    @switchfoot http://twitpic.com/2y1zl - awww, t...
1    is upset that he can't update his facebook by ...
2    @kenichan i dived many times for the ball. man...
3      my whole body feels itchy and like its on fire 
4    @nationwideclass no, it's not behaving at all....
Name: text, dtype: object


In [8]:
import re

# Matches an HTML comment, or a tag whose name starts with a letter (optionally
# preceded by "/" for closing tags or "!" for declarations such as <!DOCTYPE>).
# Requiring a letter keeps emoticons and comparisons such as "<3" or
# "1 < 2 > 0" from being mistaken for markup and deleted together with the
# text between them.
_HTML_TAG_RE = re.compile(r'<!--.*?-->|<[!/]?[A-Za-z][^<>]*>', re.DOTALL)


def remove_html(text):
    """Strip HTML tags and comments from ``text``.

    Parameters
    ----------
    text : object
        Value to clean. Only ``str`` values are processed.

    Returns
    -------
    object
        ``text`` with every tag/comment removed when it is a string;
        otherwise the input is returned unchanged.

    Notes
    -----
    Only the markup itself is removed; the content between an opening and a
    closing tag is kept. A ``<`` that is not followed by a tag name (for
    example ``<3``) is left in place.
    """
    if not isinstance(text, str):
        return text
    return _HTML_TAG_RE.sub('', text)

# Run remove_html on every row and overwrite the 'text' column with the result.
df['text'] = df['text'].apply(remove_html)


In [9]:
# Spot-check the first rows of 'text' after HTML removal.
print(df['text'].head())

0    @switchfoot http://twitpic.com/2y1zl - awww, t...
1    is upset that he can't update his facebook by ...
2    @kenichan i dived many times for the ball. man...
3      my whole body feels itchy and like its on fire 
4    @nationwideclass no, it's not behaving at all....
Name: text, dtype: object


In [10]:
def remove_extra_whitespace(text):
    """Normalise whitespace in ``text``.

    Every run of whitespace characters is replaced by one space, and
    whitespace at the start and end of the string is dropped. Values that
    are not strings are handed back untouched.
    """
    if not isinstance(text, str):
        return text
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Run remove_extra_whitespace on every row and overwrite the 'text' column.
df['text'] = df['text'].apply(remove_extra_whitespace)



In [11]:
import string

# Deletion table for every character in string.punctuation (ASCII only).
# Built once here instead of on every call, since the function is applied to
# each row of the frame.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Delete ASCII punctuation characters from ``text``.

    Parameters
    ----------
    text : object
        Value to clean. Only ``str`` values are processed.

    Returns
    -------
    object
        ``text`` without any character from ``string.punctuation`` when it
        is a string; otherwise the input is returned unchanged.

    Notes
    -----
    Characters are deleted, not replaced by a space, so the pieces on either
    side of a punctuation mark are joined ("can't" -> "cant") and a
    punctuation mark surrounded by spaces leaves two adjacent spaces behind.
    Non-ASCII punctuation is not affected.
    """
    if not isinstance(text, str):
        return text
    return text.translate(_PUNCTUATION_TABLE)

# Run remove_punctuation on every row and overwrite the 'text' column.
df['text'] = df['text'].apply(remove_punctuation)


In [12]:
# Spot-check the first rows of 'text' after punctuation removal.
print(df['text'].head())

0    switchfoot httptwitpiccom2y1zl  awww thats a b...
1    is upset that he cant update his facebook by t...
2    kenichan i dived many times for the ball manag...
3       my whole body feels itchy and like its on fire
4    nationwideclass no its not behaving at all im ...
Name: text, dtype: object


In [14]:
from spellchecker import SpellChecker

# Single SpellChecker instance, created with its default arguments and
# looked up by get_misspelled on every call.
spell = SpellChecker()

def get_misspelled(text):
    """Return the words of ``text`` that the spell checker does not know.

    Parameters
    ----------
    text : object
        Value to inspect. It is split on whitespace and the resulting words
        are passed to the module-level ``spell`` object.

    Returns
    -------
    list of str or str
        * a sorted list of the distinct unknown words, when there are any;
        * the string ``"No misspelled words"`` when every word is known;
        * the string ``"Invalid input"`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return "Invalid input"
    unknown_words = spell.unknown(text.split())
    if not unknown_words:
        return "No misspelled words"
    # spell.unknown() yields an unordered collection; sorting makes the
    # result identical from one kernel run to the next.
    return sorted(unknown_words)

# Run get_misspelled on every row and store the result in a new 'misspelled'
# column. Each cell holds either a list of words or one of the function's two
# message strings, so the column contains mixed types.
df['misspelled'] = df['text'].apply(get_misspelled)


In [22]:
import emoji
from collections import Counter

# Helper that pulls the emoji characters out of a piece of text
def extract_emojis(text):
    """List the characters of ``text`` that ``emoji.is_emoji`` accepts.

    The string is examined one character at a time, in order, and
    duplicates are kept. Anything that is not a string produces an empty
    list.
    """
    found = []
    if isinstance(text, str):
        for ch in text:
            if emoji.is_emoji(ch):
                found.append(ch)
    return found

# Collect the emoji characters found in each row of the 'text' column
emoji_lists = df['text'].apply(extract_emojis)

# Merge the per-row lists into one flat list
all_emojis = []
for row_emojis in emoji_lists:
    all_emojis.extend(row_emojis)

# Tally how often each distinct character occurs
emoji_freq = Counter(all_emojis)

# Report the totals and the ten most frequent entries, or that nothing matched
# NOTE(review): the recorded output lists only © and ®, which emoji.is_emoji
# accepts - confirm whether those should count as emojis for this analysis.
if not all_emojis:
    print("✅ No emojis found in any review.")
else:
    print(f" Total emojis found: {len(all_emojis)}")
    print(f" Unique emojis used: {len(emoji_freq)}")
    print(" Top 10 emojis used:")
    for em, count in emoji_freq.most_common(10):
        print(f"{em} : {count}")


 Total emojis found: 1510
 Unique emojis used: 2
 Top 10 emojis used:
© : 1021
® : 489


In [21]:
import re

# Unicode ranges treated as "standard" emojis, as (first, last) code points
_EMOJI_RANGES = (
    ("\U0001F600", "\U0001F64F"),  # Emoticons (😀 - 🙏)
    ("\U0001F300", "\U0001F5FF"),  # Symbols & pictographs
    ("\U0001F680", "\U0001F6FF"),  # Transport & map symbols
    ("\U0001F1E0", "\U0001F1FF"),  # Flags
    ("\U00002700", "\U000027BF"),  # Dingbats
    ("\U0001F900", "\U0001F9FF"),  # Supplemental symbols
    ("\U00002600", "\U000026FF"),  # Misc symbols
)

# One character class covering all ranges; the trailing "+" makes each match a
# run of one or more consecutive characters from those ranges.
emoji_pattern = re.compile(
    "[" + "".join(f"{first}-{last}" for first, last in _EMOJI_RANGES) + "]+"
)

# Helper that returns only the matches of emoji_pattern
def extract_proper_emojis(text):
    """Return every ``emoji_pattern`` match found in ``text``.

    The result is whatever ``emoji_pattern.findall`` yields for the string,
    in order of appearance. Anything that is not a string produces an empty
    list.
    """
    if isinstance(text, str):
        return emoji_pattern.findall(text)
    return []

# Scan every row of the 'text' column with the range-based matcher
emoji_list_filtered = df['text'].apply(extract_proper_emojis)

# Merge the per-row matches into one list and tally them
all_emojis_filtered = []
for row_matches in emoji_list_filtered:
    all_emojis_filtered.extend(row_matches)
emoji_freq_filtered = Counter(all_emojis_filtered)

# NOTE(review): df was loaded with encoding='ISO-8859-1', which decodes every
# byte to a code point at or below U+00FF, while emoji_pattern only covers
# U+2600 and above. A count of 0 is therefore expected here regardless of what
# the original text contained - confirm that this is the intended check.
print(f"✅ Proper emojis found: {len(all_emojis_filtered)}")
print(f"🔁 Unique proper emojis: {len(emoji_freq_filtered)}")
print("🔥 Top emojis:")
for em, count in emoji_freq_filtered.most_common(10):
    print(f"{em} : {count}")


✅ Proper emojis found: 0
🔁 Unique proper emojis: 0
🔥 Top emojis:
