In [4]:
import pandas as pd

In [5]:
# Load the training data
train_df = pd.read_csv('Train.csv')

In [6]:
# Shape of training data
print(f" Shape: {train_df}")

 Shape:        Unnamed: 0                                               TEXT  Label
0               0  Vacation wasted ! #vacation2017 #photobomb #ti...      0
1               1  Oh Wynwood, you’re so funny! : @user #Wynwood ...      1
2               2  Been friends since 7th grade. Look at us now w...      2
3               3  This is what it looks like when someone loves ...      3
4               4  RT @user this white family was invited to a Bl...      3
...           ...                                                ...    ...
69995       69995  Yes, I call Galina "my Bubie" Go follow my bea...      3
69996       69996    I SEA you, Seattle @ Ballard Seafood Festival\n     16
69997       69997  If one of my daughters is wearing this and ask...      2
69998       69998  Guess who whoop people on THEIR homecoming?! #...      3
69999       69999  We Love you Robbie @ Heritage Memorial Cemeter...     14

[70000 rows x 3 columns]


In [7]:
# First 5 rows
print(train_df.head())

   Unnamed: 0                                               TEXT  Label
0           0  Vacation wasted ! #vacation2017 #photobomb #ti...      0
1           1  Oh Wynwood, you’re so funny! : @user #Wynwood ...      1
2           2  Been friends since 7th grade. Look at us now w...      2
3           3  This is what it looks like when someone loves ...      3
4           4  RT @user this white family was invited to a Bl...      3


In [8]:
# Load the mapping data
mapping_df = pd.read_csv('Mapping.csv')
print(mapping_df.head())

   Unnamed: 0 emoticons  number
0           0         😜       0
1           1         📸       1
2           2         😍       2
3           3         😂       3
4           4         😉       4


In [9]:
# Rename columns to align for merging
mapping_df = mapping_df.rename(columns={'number': 'Label', 'emoticons': 'emoji'})

In [10]:
# Merge label → emoji mapping into train_df
train_df = train_df.merge(mapping_df[['Label', 'emoji']], on='Label')

# Preview of the new dataframe
print(train_df.head())

   Unnamed: 0                                               TEXT  Label emoji
0           0  Vacation wasted ! #vacation2017 #photobomb #ti...      0     😜
1           1  Oh Wynwood, you’re so funny! : @user #Wynwood ...      1     📸
2           2  Been friends since 7th grade. Look at us now w...      2     😍
3           3  This is what it looks like when someone loves ...      3     😂
4           4  RT @user this white family was invited to a Bl...      3     😂


In [15]:
import re

# Define a cleaning function
def clean_text(text):
    text = text.lower()                                # Lowercase
    text = re.sub(r'@[\w_]+', '', text)                # Remove @mentions
    text = re.sub(r'#(\w+)', r'\1', text)              # Remove # but keep the word
    text = re.sub(r'http\S+', '', text)                # Remove links
    text = re.sub(r'[^\w\s]', '', text)                # Remove punctuation
    text = re.sub(r'\d+', '', text)                    # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()           # Remove extra whitespace
    return text

# Apply cleaning
train_df['clean_text'] = train_df['TEXT'].apply(clean_text)

# Preview
train_df[['TEXT', 'clean_text']].head()


Unnamed: 0,TEXT,clean_text
0,Vacation wasted ! #vacation2017 #photobomb #ti...,vacation wasted vacation photobomb tired vacat...
1,"Oh Wynwood, you’re so funny! : @user #Wynwood ...",oh wynwood youre so funny wynwood art itwasam ...
2,Been friends since 7th grade. Look at us now w...,been friends since th grade look at us now we ...
3,This is what it looks like when someone loves ...,this is what it looks like when someone loves ...
4,RT @user this white family was invited to a Bl...,rt this white family was invited to a black ba...


### We ran a code which cleaned each tweet in the TEXT column by removing mentions, hashtags, links, punctuation, numbers, and extra spaces, and saves the result in a new column called clean_text. ✅

In [16]:
# Clean dataframe 
train_df.head()

Unnamed: 0.1,Unnamed: 0,TEXT,Label,emoji,clean_text
0,0,Vacation wasted ! #vacation2017 #photobomb #ti...,0,😜,vacation wasted vacation photobomb tired vacat...
1,1,"Oh Wynwood, you’re so funny! : @user #Wynwood ...",1,📸,oh wynwood youre so funny wynwood art itwasam ...
2,2,Been friends since 7th grade. Look at us now w...,2,😍,been friends since th grade look at us now we ...
3,3,This is what it looks like when someone loves ...,3,😂,this is what it looks like when someone loves ...
4,4,RT @user this white family was invited to a Bl...,3,😂,rt this white family was invited to a black ba...


In [17]:
train_df.drop(columns=['TEXT'], inplace=True)


In [18]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,Label,emoji,clean_text
0,0,0,😜,vacation wasted vacation photobomb tired vacat...
1,1,1,📸,oh wynwood youre so funny wynwood art itwasam ...
2,2,2,😍,been friends since th grade look at us now we ...
3,3,3,😂,this is what it looks like when someone loves ...
4,4,3,😂,rt this white family was invited to a black ba...


### Now we'll convert text to numbers. We'll->
##### 1)Convert the cleaned text into numeric sequences using Tokenizer

##### 2)Pad those sequences so they’re all the same length

##### 3)Convert the emoji Label column into categorical one-hot labels for training


In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Set a vocabulary size
vocab_size = 5000
oov_token = "<OOV"

# Create and fit tokenize
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_token)
tokenizer.fit_on_texts(train_df['clean_text'])

# Converting text to sequences 
sequences = tokenizer.texts_to_sequences(train_df['clean_text'])

# Pad the sequences to same length 
max_length = max(len(seq) for seq in sequences)
padded_sequence = pad_sequences(sequences, maxlen = max_length, padding = 'post')

In [25]:
# One-Hot Encode the Labels

from tensorflow.keras.utils import to_categorical

# Check how many unique labels you have 
num_classes = train_df['Label'].nunique()
print(f"Number of classes : {num_classes}")

one_hot_labels = to_categorical(train_df['Label'], num_classes=num_classes)


Number of classes : 20


In [27]:
# This will show you all 20 emoji–label pairs being used in the dataset
train_df[['Label', 'emoji']].drop_duplicates().sort_values('Label')


Unnamed: 0,Label,emoji
0,0,😜
1,1,📸
2,2,😍
3,3,😂
5,4,😉
6,5,🎄
7,6,📷
9,7,🔥
10,8,😘
11,9,❤
