In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np
import emoji
from xgboost import XGBClassifier
import pickle


# Function to extract emojis from text
def extract_emojis(text):
    return [char for char in text if char in emoji.EMOJI_DATA]

# Function to generate unique substrings of emojis
def printSubStrings(emoji_str):
    d = set()
    for i in range(len(emoji_str)):
        subStr = ""
        for j in range(i, len(emoji_str)):
            subStr += emoji_str[j]
            d.add(subStr)
    return d

# Drug-related emoji string and non-drug-related emoji string
drug_emoji_str = "🐡🍼💎‍❌☁🌳🍁🌲🍯💣👅🐌🪜🍦🦜⛰Ⓜ🐶📱🤑🍀🔌🧈🚌🪨🌱😌🎿🅿🍌🐲🌠🔵💯🌨⛄❎🌈🎫❤☃🎱💉🥡😛🚬🐍❄🧊🐝🌿🍫💥💵⚡🐎💙🐉🪟🚀⛽💰🔥✈🧪🔮💀🚆🍭🏈👽🍪🫘🍄🐴💧🪴🔑💊🍳🛒📦🍨🥥🥦🍇🕯💠🌴🤎🌹🍬🍃🪂💜"
non_drug_emoji_str = "🌻🤪🎧😁🤣😅😭🎨😷🤓😘👊🥳🧡😐💔🏆🥺♀🍕🤘😴🙏🦋🤞🙌💕🎈😇😬✨🤠🥶😚🎁🐱🎤😡🤩🎃😂👩🎸👍🤗🧩💬🤦😕♂🎉😎🥵💖💪🤔🎯🕊🎂😏🕺🙃🍰🤷🌞🌟😵🎅💚📸😢😜😊🥰🎀🙅👏😍🎶🥂"

# Generate emoji substrings
drug_emoji_substrings = list(printSubStrings(drug_emoji_str))
non_drug_emoji_substrings = list(printSubStrings(non_drug_emoji_str))

# Create DataFrame for drug-related and non-drug-related emoji substrings
df_drug = pd.DataFrame({
    'emojis': drug_emoji_substrings,
    'target': 1  # Drug-related
})

df_non_drug = pd.DataFrame({
    'emojis': non_drug_emoji_substrings,
    'target': 0  # Non-drug-related
})

# Combine both DataFrames
df = pd.concat([df_drug, df_non_drug]).reset_index(drop=True)

# Apply Label Encoder to the target variable
X = df['emojis'].values
y = df['target'].values

label_encoder_y = LabelEncoder()
y_encoded = label_encoder_y.fit_transform(y)

# Apply TF-IDF Vectorizer to the emoji dataset (using character n-grams for better pattern recognition)
# Further reduce max_features to limit complexity
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), max_features=200)
X_vectorized = vectorizer.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y_encoded, test_size=0.25, stratify=y_encoded, random_state=42
)

# Initialize XGBoost Classifier with increased regularization
xgb = XGBClassifier(
    random_state=42, 
    eval_metric='mlogloss', 
    reg_alpha=0.5,  # Increase L1 regularization to reduce overfitting
    reg_lambda=1.5  # Increase L2 regularization
)

# Perform GridSearchCV to find the best hyperparameters
param_grid = {
    'learning_rate': [0.05],  # Fewer options
    'max_depth': [3],  
    'n_estimators': [50],  
    'min_child_weight': [3]  # Increase min_child_weight for greater regularization
}

# Perform GridSearchCV with a single job to prevent overload
grid_search = GridSearchCV(xgb, param_grid, cv=2, scoring='accuracy', n_jobs=1)  # Using 2 folds and 1 core
grid_search.fit(X_train, y_train)

# Sample a smaller subset for further tuning
X_train_small, _, y_train_small, _ = train_test_split(X_train, y_train, train_size=0.1, stratify=y_train, random_state=42)
grid_search.fit(X_train_small, y_train_small)

# Get the best model from GridSearch
best_xgb = grid_search.best_estimator_

# Train the best XGBoost model on the training set
best_xgb.fit(X_train, y_train)

# Predict on the test set
y_predict_xgb = best_xgb.predict(X_test)

# Evaluate the model
print("XGBoost Classifier Accuracy:", accuracy_score(y_test, y_predict_xgb))
print(confusion_matrix(y_test, y_predict_xgb))
print(classification_report(y_test, y_predict_xgb))
# ... (previous code)

# Train the best XGBoost model on the training set
best_xgb.fit(X_train, y_train)

# Predict on the test set
y_predict_xgb = best_xgb.predict(X_test)

# Evaluate the model
print("XGBoost Classifier Accuracy:", accuracy_score(y_test, y_predict_xgb))
print(confusion_matrix(y_test, y_predict_xgb))
print(classification_report(y_test, y_predict_xgb))

# Save the best XGBoost model to a pickle file
with open('best_xgb_model.pkl', 'wb') as model_file:
    pickle.dump(best_xgb, model_file)

# Save the vectorizer
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Save the label encoder
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder_y, encoder_file)

print("Models saved successfully!")

# Function to predict the target for a given emoji sequence

XGBoost Classifier Accuracy: 0.92806484295846
[[ 810    0]
 [ 142 1022]]
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       810
           1       1.00      0.88      0.94      1164

    accuracy                           0.93      1974
   macro avg       0.93      0.94      0.93      1974
weighted avg       0.94      0.93      0.93      1974

XGBoost Classifier Accuracy: 0.92806484295846
[[ 810    0]
 [ 142 1022]]
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       810
           1       1.00      0.88      0.94      1164

    accuracy                           0.93      1974
   macro avg       0.93      0.94      0.93      1974
weighted avg       0.94      0.93      0.93      1974

Models saved successfully!
