In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the datasets
train_df = pd.read_csv('Train.csv')
test_df = pd.read_csv('Test.csv')
mapping_df = pd.read_csv('Mapping.csv')
# NOTE(review): a bare `OutputFormats_df` expression used to stand here. The
# name is never assigned in this notebook, so the cell raised NameError on a
# fresh kernel. It has been removed; load that file explicitly if it is needed.

# Download necessary NLTK data
nltk.download('punkt')
# Newer NLTK releases load the Punkt tokenizer from 'punkt_tab' instead of the
# pickled 'punkt' package, so fetch both to keep word_tokenize working.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Define a function to clean the text
def clean_text(text):
    """Normalise a raw tweet-like string for tokenisation.

    Removes URLs, @mentions, #hashtags and punctuation, then lower-cases the
    result. Whitespace left behind by the removals is kept as is.

    Parameters
    ----------
    text : str or None or float
        Raw text. ``None`` and NaN (what pandas yields for empty CSV cells)
        are treated as empty text; any other non-string value is converted
        with ``str`` before cleaning.

    Returns
    -------
    str
        The cleaned, lower-cased text (``''`` for missing input).
    """
    # re.sub raises TypeError on non-strings, so handle missing values first.
    # NaN is detected with `text != text`, the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'#\w+', '', text)     # Remove hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()                  # Convert to lowercase
    return text

# Clean, tokenize and strip stopwords from both datasets.
# Each frame gets a 'tokens' column (list of kept words) and a 'cleaned_text'
# column (those tokens joined by spaces, ready for the TF-IDF vectorizer).
stop_words = set(stopwords.words('english'))


def _remove_stopwords(tokens):
    """Return the tokens that are not English stopwords, order preserved."""
    return [word for word in tokens if word not in stop_words]


for frame in (train_df, test_df):
    frame['cleaned_text'] = frame['TEXT'].apply(clean_text)
    frame['tokens'] = frame['cleaned_text'].apply(word_tokenize).apply(_remove_stopwords)
    frame['cleaned_text'] = frame['tokens'].apply(' '.join)

# Encode the labels: numeric Label -> emoticon (via Mapping.csv) -> integer class id
number_to_emoticon = dict(zip(mapping_df['number'], mapping_df['emoticons']))
train_df['emoticon'] = train_df['Label'].map(number_to_emoticon)

# .map() silently yields NaN for labels missing from the mapping; fail loudly
# here instead of letting LabelEncoder choke on mixed str/NaN values later.
unmapped_labels = train_df.loc[train_df['emoticon'].isna(), 'Label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Labels without an entry in Mapping.csv: {sorted(unmapped_labels)}")

label_encoder = LabelEncoder()
train_df['emoji_label'] = label_encoder.fit_transform(train_df['emoticon'])

# Split the training data into training and validation sets.
# The split is done on the text (before vectorizing) and stratified on the
# label so the imbalanced emoji classes keep their proportions in both parts.
# Note: stratify requires every class to have at least two rows.
train_text_split, val_text, y_train_split, y_val = train_test_split(
    train_df['cleaned_text'],
    train_df['emoji_label'],
    test_size=0.2,
    random_state=42,
    stratify=train_df['emoji_label'],
)

# Create TF-IDF features.
# The vectorizer is fit on the training split only, so the vocabulary and IDF
# weights are not influenced by validation or test text (no data leakage).
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_split = tfidf_vectorizer.fit_transform(train_text_split)
X_val = tfidf_vectorizer.transform(val_text)

# Features for the full training frame and the test frame, using the same
# fitted vocabulary (X_train is kept for any later cell that refers to it).
X_train = tfidf_vectorizer.transform(train_df['cleaned_text'])
X_test = tfidf_vectorizer.transform(test_df['cleaned_text'])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE; the validation split is left
# untouched. The seed is fixed so the resampling is repeatable.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(
    X_train_split,
    y_train_split,
)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
import time

# Search space for the random forest. Values are deliberately small
# (few estimators, shallow trees) to keep the search time manageable.
param_dist = {
    'n_estimators': [50, 100],  # Reduced number of estimators
    'max_depth': [10, 20],      # Reduced max depth
    'min_samples_split': [2, 5]
}

# Initialize RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)

# Stratified folds keep the class proportions equal across folds.
# shuffle=True matters here: the resampled data holds the original rows first
# and SMOTE's synthetic rows after them, and an unshuffled StratifiedKFold
# fills folds in row order, so the folds would differ systematically in their
# real-vs-synthetic make-up. Shuffling (seeded) mixes them evenly.
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# NOTE(review): X_train_res was oversampled *before* cross-validation, so
# synthetic points interpolated from a fold's validation rows can end up in
# that fold's training rows and the CV scores are optimistic. Moving SMOTE
# into an imblearn.pipeline.Pipeline that is passed to the search would
# resample inside each fold instead; the held-out report below is unaffected.

# RandomizedSearchCV for hyperparameter tuning (5 sampled settings x 3 folds)
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=5,
    cv=stratified_kfold,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Time the search with a monotonic clock (unaffected by system clock changes)
start = time.perf_counter()

random_search.fit(X_train_res, y_train_res)

end = time.perf_counter()
print(f"RandomizedSearchCV took {end - start:.2f} seconds")

# Get the best estimator (refit on all of X_train_res by RandomizedSearchCV)
best_rf_model = random_search.best_estimator_

# Predict on the validation set
y_val_pred = best_rf_model.predict(X_val)

# Evaluate the model.
# Passing `labels` explicitly keeps the rows aligned with target_names even if
# some class never occurs in y_val / y_val_pred; LabelEncoder assigns ids
# 0..n_classes-1 in the order of label_encoder.classes_.
print(classification_report(
    y_val,
    y_val_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
))


Fitting 3 folds for each of 5 candidates, totalling 15 fits
RandomizedSearchCV took 117.78 seconds
              precision    recall  f1-score   support

           ☀       0.17      0.58      0.26       370
           ✨       0.27      0.12      0.17       644
           ❤       0.38      0.16      0.23      3049
          🇺🇸       0.38      0.41      0.39       509
           🎄       0.43      0.70      0.53       387
           💕       0.13      0.14      0.13       728
           💙       0.14      0.10      0.12       466
           💜       0.11      0.04      0.06       358
           💯       0.08      0.14      0.10       356
           📷       0.06      0.05      0.05       431
           📸       0.06      0.40      0.10       531
           🔥       0.35      0.32      0.33       875
           😁       0.09      0.04      0.05       355
           😂       0.32      0.20      0.24      1384
           😉       0.05      0.06      0.06       372
           😊       0.07      0.03   

In [None]:
def predict_emoji(text, model, vectorizer, encoder):
    """Predict the emoji for a single piece of raw text.

    The text goes through the same steps as the training data: `clean_text`,
    `word_tokenize`, removal of words found in the module-level `stop_words`,
    and re-joining with spaces before vectorizing.

    Parameters
    ----------
    text : raw input string.
    model : fitted classifier exposing ``predict``.
    vectorizer : fitted vectorizer exposing ``transform``.
    encoder : fitted label encoder exposing ``inverse_transform``.

    Returns
    -------
    The decoded label for the model's prediction on `text`.
    """
    words = word_tokenize(clean_text(text))
    document = ' '.join([w for w in words if w not in stop_words])
    features = vectorizer.transform([document])
    predicted_label = model.predict(features)[0]
    return encoder.inverse_transform([predicted_label])[0]

# Try the predictor on a few hand-written sentences
new_texts = [
    "I love spending time with my family! ",
    "This is so frustrating ",
    "Can't wait for the weekend! ",
    "Just finished a great workout "
]

# Print each input followed by its predicted emoji and a blank separator line
for text in new_texts:
    print(f"Text: {text}")
    predicted_emoji = predict_emoji(text, best_rf_model, tfidf_vectorizer, label_encoder)
    print(f"Predicted Emoji: {predicted_emoji}")
    print()


Text: I love spending time with my family! 
Predicted Emoji: ❤

Text: This is so frustrating 
Predicted Emoji: 📸

Text: Can't wait for the weekend! 
Predicted Emoji: 😉

Text: Just finished a great workout 
Predicted Emoji: 😊

