<a href="https://colab.research.google.com/github/KirkUgeX/Tweet-Engagement-Optimization-Project/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub
import pandas as pd
import os

In [None]:
# Download the Kaggle dataset (or reuse a cached copy); the returned value is
# used below as the local directory to search.
path = kagglehub.dataset_download("thedevastator/tweets-and-user-engagement")
print("Path to dataset files:", path)

# Pick the FIRST CSV whose name contains 'Twitterdatainsheets'. next() stops
# the directory walk as soon as a match is found, so a later directory can
# never silently replace the file that was reported.
csv_file = next(
    (
        os.path.join(root, file)
        for root, _dirs, files in os.walk(path)
        for file in files
        if file.endswith('.csv') and 'Twitterdatainsheets' in file
    ),
    None,
)

if csv_file:
    print(f"Found CSV file at: {csv_file}")
    tweets_df = pd.read_csv(csv_file)
    print(f"Loaded dataset with {len(tweets_df)} tweets")
    print("\nDataset columns:")
    print(tweets_df.columns.tolist())
    print("\nSample data:")
    print(tweets_df.head(3))
else:
    print("Could not find the dataset CSV file")

In [None]:
# Load the tweets CSV from a fixed '/kaggle/input' path.
# NOTE(review): the previous cell already reads a 'Twitterdatainsheets' CSV
# into `tweets_df` via kagglehub; this hard-coded path presumably exists only
# inside a Kaggle kernel — confirm which of the two loads is intended.
df=pd.read_csv("/kaggle/input/tweets-and-user-engagement/Twitterdatainsheets.csv")

In [None]:
# Strip leading/trailing whitespace from every column name
# (presumably the CSV header contains padded names — verify against the file).
df.columns = df.columns.str.strip()


In [None]:
# Keep only the first 99,998 rows (positional slice).
# NOTE(review): the reason for this exact cutoff is not visible here — confirm.
df = df.iloc[:99998]


In [None]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus and keep the English list as a set so that
# the membership test in clean_text is a hash lookup.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a tweet into a lower-case, stop-word-free string.

    The input is stringified and lower-cased, then URLs and @mentions are
    removed, every character other than a-z, whitespace and '#' is dropped,
    whitespace runs are collapsed, and English stop words are filtered out.

    Args:
        text: The raw tweet; any object is accepted and passed through str().

    Returns:
        The cleaned text with tokens joined by single spaces.
    """
    cleaned = str(text).lower()
    # Applied in order: URLs, mentions, disallowed characters, whitespace runs.
    substitutions = (
        (r"http\S+|www\S+|https\S+", ''),
        (r"@\w+", ''),
        (r"[^a-z\s#]", ''),
        (r"\s+", ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    tokens = cleaned.strip().split()
    return ' '.join(filter(lambda token: token not in stop_words, tokens))

def extract_text_features(df):
    """Return a copy of ``df`` with cleaned text and simple text-shape features.

    Added columns: ``text_clean`` (output of clean_text), ``word_count``
    (whitespace-separated tokens of the raw text, 0 for missing values) and
    the 0/1 flags ``has_question``, ``has_exclaim``, ``has_emoji``,
    ``has_hashtag`` and ``has_mention``.

    Note that the ``has_emoji`` pattern matches any character that is not a
    word character, whitespace or one of ``# @ , . - : /``, so characters such
    as '!' or '?' also set that flag.

    Args:
        df: DataFrame with a ``text`` column; it is not modified.

    Returns:
        A new DataFrame containing the original columns plus the features.
    """
    flag_patterns = {
        'has_question': r'\?',
        'has_exclaim': r'!',
        'has_emoji': r'[^\w\s#@,.\-:/]',
        'has_hashtag': r'#\w+',
        'has_mention': r'@\w+',
    }

    enriched = df.copy()
    raw_text = enriched['text']

    enriched['text_clean'] = raw_text.apply(clean_text)
    enriched['word_count'] = raw_text.apply(
        lambda value: 0 if pd.isna(value) else len(str(value).split())
    )
    # Missing text counts as "no match" for every flag.
    for column, pattern in flag_patterns.items():
        enriched[column] = raw_text.str.contains(pattern, regex=True, na=False).astype(int)

    return enriched
def extract_text_features_from_json(data):
    """Compute the text features for JSON-style input and return records.

    Args:
        data: Either a single tweet as a dict (must contain ``text``) or a
            DataFrame with a ``text`` column.

    Returns:
        A list of dicts (``DataFrame.to_dict(orient="records")``), one per
        input row, holding the original fields plus ``text_clean``,
        ``word_count`` and the ``has_*`` flags.
    """
    if isinstance(data, dict):
        data = pd.DataFrame([data])

    # Delegate to extract_text_features so both entry points always produce
    # the same columns; it works on a copy, so a DataFrame passed in by the
    # caller is left untouched.
    return extract_text_features(data).to_dict(orient="records")



In [None]:
# Add text_clean, word_count and the has_* flag columns.
# NOTE(review): the dataset-preparation cell further down calls
# extract_text_features again after dropping rows with missing values, so this
# pass looks redundant — confirm before removing.
df = extract_text_features(df)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np

In [None]:
class TextSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns ``X[key]`` from its input."""

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the entry of ``X`` stored under ``self.key``."""
        return X[self.key]

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the ``text_key`` column from ``X``."""

    def __init__(self, text_key):
        self.text_key = text_key

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` without the ``text_key`` column (``X`` is not modified)."""
        return X.drop(columns=[self.text_key])


In [None]:
original_targets = ['Likes', 'RetweetCount', 'Reach']
log_target = 'Reach_log'
target_cols = ['Likes', 'RetweetCount', log_target]
from sklearn.preprocessing import LabelEncoder
# –ü–æ–¥–≥–æ—Ç–æ–≤–∏–º –¥–∞—Ç–∞—Å–µ—Ç
df = df.dropna(subset=['text', 'Weekday', 'Hour'] + original_targets)
df['Weekday'] = LabelEncoder().fit_transform(df['Weekday'])
df['Reach_log'] = np.log1p(df['Reach'])
df = extract_text_features(df)



In [None]:
categorical_features = ['Weekday']
numerical_features = ['Hour', 'word_count', 'has_question', 'has_exclaim', 'has_emoji', 'has_hashtag', 'has_mention']


In [None]:
target_cols = ['Likes', 'RetweetCount', 'Reach']
df = df.dropna(subset=['text_clean', 'Weekday', 'Hour'] + target_cols)
from sklearn.preprocessing import LabelEncoder
df['Weekday'] = LabelEncoder().fit_transform(df['Weekday'])



In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# log1p-transformed copies of the three raw engagement counts.
for log_col, raw_col in (
    ('Reach_log', 'Reach'),
    ('Likes_log', 'Likes'),
    ('Retweet_log', 'RetweetCount'),
):
    df[log_col] = np.log1p(df[raw_col])

# 1 when the lower-cased text starts with "rt", else 0.
lowered_text = df['text'].str.lower()
df['is_retweet'] = lowered_text.str.startswith('rt').astype(int)

# 1 when the tweet has zero likes, else 0.
df['is_dead'] = df['Likes'].eq(0).astype(int)

# Weighted blend of the log counts: the retweet term is zeroed for tweets
# flagged as retweets, and zero-like tweets take a fixed penalty.
likes_term = 0.6 * df['Likes_log']
retweet_term = 0.2 * df['Retweet_log'] * (1 - df['is_retweet'])
reach_term = 0.2 * df['Reach_log']
dead_penalty = 1.5 * df['is_dead']
df['engagement_score_raw'] = likes_term + retweet_term + reach_term - dead_penalty

# Rescale the raw score onto a 0-10 range; this becomes the regression target.
scaler_minmax = MinMaxScaler(feature_range=(0, 10))
df['engagement_score'] = scaler_minmax.fit_transform(df[['engagement_score_raw']])
y = df['engagement_score']

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pretrained BERT tokenizer and encoder into module-level names.
# NOTE(review): BertEmbedder loads its own copies, and `tokenizer`/`model` are
# rebound to T5 objects further down, so this pair looks unused — confirm
# before removing.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
from transformers import BertTokenizer, BertModel
from sklearn.base import BaseEstimator, TransformerMixin
class BertEmbedder(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that maps texts to BERT [CLS] embeddings.

    Args:
        model_name: Hugging Face checkpoint used for both tokenizer and model.
        batch_size: Number of texts sent through the model per forward pass.

    The tokenizer and model are loaded eagerly in ``__init__`` and the model
    is moved to CUDA when available. Changing ``model_name`` afterwards via
    ``set_params`` does not reload them.
    """

    def __init__(self, model_name='bert-base-uncased', batch_size=32):
        # Constructor arguments are stored under their own names because
        # BaseEstimator.get_params() (used by clone() and repr()) reads them
        # back with getattr().
        self.model_name = model_name
        self.batch_size = batch_size
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.eval()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Embed every text in ``X``.

        Args:
            X: Iterable of strings (e.g. a pandas Series of cleaned tweets).

        Returns:
            A 2-D numpy array with one row per input text, holding the last
            hidden state of the first ([CLS]) token. Empty input yields an
            array with zero rows and ``hidden_size`` columns so it can still
            be stacked with other feature blocks.
        """
        texts = list(X)
        if not texts:
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        chunk_size = max(1, int(self.batch_size))
        chunks = []
        for start in range(0, len(texts), chunk_size):
            batch = texts[start:start + chunk_size]
            # padding=True pads to the longest text of the batch; the attention
            # mask returned by the tokenizer keeps the padding out of the
            # computation.
            inputs = self.tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=128)
            # Move the encoded batch to the same device as the model.
            inputs = {key: value.to(self.device) for key, value in inputs.items()}
            with torch.no_grad():
                outputs = self.model(**inputs)
            # Use the [CLS] token as the representation of each text and bring
            # it back to the CPU as numpy.
            chunks.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
        return np.vstack(chunks)

In [None]:
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Tabular preprocessing: standardise the numeric columns and one-hot encode
# the categorical ones. handle_unknown='ignore' makes categories that were not
# seen during fit encode as all zeros instead of raising.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

In [None]:

# Combined feature extractor: BERT embeddings of 'text_clean' concatenated
# with the preprocessed tabular columns.
full_pipeline = FeatureUnion([
    ('text', Pipeline([
        ('selector', TextSelector('text_clean')),
        ('bert_embed', BertEmbedder())
    ])),
    ('meta', Pipeline([
        # Remove the text column before the ColumnTransformer sees the frame.
        ('drop_text', ColumnDropper('text_clean')),
        ('preprocessor', preprocessor)
    ]))
])


In [None]:

# Model inputs: the cleaned text plus the posting-time and text-shape columns.
X = df[['text_clean', 'Weekday', 'Hour', 'word_count', 'has_question', 'has_exclaim', 'has_emoji', 'has_hashtag', 'has_mention']]
y = df['engagement_score']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the feature pipeline on the training rows only, then apply the fitted
# pipeline to the held-out rows.
X_train_trans = full_pipeline.fit_transform(X_train)
X_test_trans = full_pipeline.transform(X_test)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram (50 bins) with a KDE overlay of the engagement_score target.
sns.histplot(y, bins=50, kde=True)
plt.title('–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ engagement_score')
plt.show()


In [None]:
# Show the ten highest-scoring tweets next to their raw engagement counts.
df.sort_values('engagement_score', ascending=False).head(10)[['text_clean', 'engagement_score','Likes', 'RetweetCount', 'Reach']]


In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Carve a validation set (20%) out of the already-transformed training rows.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_trans, y_train, test_size=0.2, random_state=42)

# Gradient-boosted regressor for the 0-10 engagement score.
lgbm = LGBMRegressor(
    n_estimators=1200,
    learning_rate=0.01,
    num_leaves=128,
    max_depth=10,
    min_child_samples=10,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.05,
    reg_lambda=0.05,
    random_state=42,
    device='cpu'
)
lgbm.fit(X_train_split, y_train_split)

# Evaluate on the validation split only.
# NOTE(review): X_test_trans is computed earlier but is not scored in this cell.
y_pred = lgbm.predict(X_val_split)
r2 = r2_score(y_val_split, y_pred)
mae = mean_absolute_error(y_val_split, y_pred)

print(f"‚úÖ R¬≤: {r2:.4f}")
print(f"üìâ MAE: {mae:.4f}")


In [None]:
# Save the model in text format.
lgbm.booster_.save_model("lgbm_model.txt")


In [None]:
import lightgbm as lgb
# Reload the saved model from disk. This rebinds `lgbm` from the fitted
# LGBMRegressor to a raw lightgbm Booster.
lgbm = lgb.Booster(model_file="lgbm_model.txt")

In [None]:
import numpy as np

# Score the validation split with the reloaded model.
y_pred = lgbm.predict(X_val_split)
y_true = y_val_split

y_pred = np.array(y_pred)
y_true = np.array(y_true)

# Share of predictions whose absolute error is at most 1 point on the
# 0-10 engagement scale.
within_range = np.abs(y_pred - y_true) <= 1
count_within_range = np.sum(within_range)
total = len(y_true)
percent_within_range = (count_within_range / total) * 100

print(f"‚úÖ –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø–æ–ø–∞–¥–∞–Ω–∏–π –≤ ¬±1: {count_within_range}/{total} ({percent_within_range:.2f}%)")

In [None]:
!pip install transformers datasets accelerate


In [None]:
from datasets import load_dataset, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import json

def load_jsonl_for_finetune(path):
    """Read a JSON-Lines file and build the seq2seq fine-tuning dataset.

    Every non-blank line must be a JSON object with the keys
    ``original_tweet`` (str), ``suggestions`` (list of str) and
    ``enhanced_tweet`` (str).

    Args:
        path: Location of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A ``datasets.Dataset`` with the columns ``input_text`` (the prompt)
        and ``target_text`` (the enhanced tweet).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Skip blank or whitespace-only lines (e.g. extra newlines at the end
        # of the file) instead of failing in json.loads.
        data = [json.loads(line) for line in f if line.strip()]
    return Dataset.from_list([
        {
            "input_text": f"""Original tweet: "{ex['original_tweet']}"\nSuggestions:\n- {chr(10).join(ex['suggestions'])}\n\nRewrite the tweet to maximize engagement.""",
            "target_text": ex["enhanced_tweet"]
        }
        for ex in data
    ])

# Build the fine-tuning dataset from a JSONL file under '/content'
# (hard-coded path; the file must be present there before this cell runs).
dataset = load_jsonl_for_finetune("/content/diverse_emojified_tweets_500.jsonl")


In [None]:
# Base checkpoint for the tweet-rewriting model.
# NOTE: this rebinds the module-level `tokenizer` and `model` names that held
# the BERT objects earlier in the notebook.
model_name = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

emoji_list = ["üî•", "üöÄ", "üéâ", "üì¢", "üëâ", "üòé", "üíº", "üìå", "üîî", "üß†", "üóìÔ∏è", "üìÖ", "üí°", "üë®‚Äçüíª", "‚ö°", "üåê"]

# Register the emoji strings as extra tokens in the tokenizer vocabulary.
tokenizer.add_tokens(emoji_list)

# Resize the embedding matrix to the tokenizer's new vocabulary size.
model.resize_token_embeddings(len(tokenizer))

def preprocess(example):
    """Tokenize prompts and targets; called by Dataset.map with batched=True."""
    return tokenizer(example['input_text'], text_target=example['target_text'], truncation=True)
tokenized = dataset.map(preprocess, batched=True)

# Data collator used by the trainer to assemble seq2seq batches.
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Fine-tuning configuration: 5 epochs, batch size 4 per device, a checkpoint
# at the end of every epoch, mixed precision, and no external reporting or
# Hub upload.
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-enhanced-tweet-model",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    learning_rate=5e-5,
    save_strategy="epoch",
    logging_steps=10,
    fp16=True,
    push_to_hub=False,
    report_to="none"
)

# Seq2SeqTrainer is the trainer class this notebook imports (together with
# Seq2SeqTrainingArguments); the plain `Trainer` name is never imported, so
# using it here would raise NameError.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    eval_dataset=tokenized,
    data_collator=collator)

trainer.train()

# Persist the model and its (extended) tokenizer side by side so they can be
# reloaded from the same directory.
trainer.save_model("./flan-t5-enhanced-tweet-model")
tokenizer.save_pretrained("./flan-t5-enhanced-tweet-model")

In [None]:
from transformers import pipeline

# Directory written by the fine-tuning cell (model and tokenizer).
model_path = "./flan-t5-enhanced-tweet-model"

# text2text-generation pipeline backed by the fine-tuned checkpoint.
generator = pipeline("text2text-generation", model=model_path, tokenizer=model_path)
def generate_enhanced_tweet(original_tweet, suggestions, model=generator):
    """Ask a text2text model to rewrite a tweet for higher engagement.

    Args:
        original_tweet: The tweet text to rewrite.
        suggestions: Iterable of suggestion strings; they are joined with
            newlines after a leading "- " in the prompt.
        model: Callable taking the prompt plus generation keyword arguments
            and returning a list of dicts with a ``generated_text`` key.

    Returns:
        The ``generated_text`` of the first returned candidate.
    """
    prompt = f"""Original tweet: "{original_tweet}"

Suggestions:
- {chr(10).join(suggestions)}

Rewrite the tweet to maximize engagement, keeping the original meaning, and including suggested enhancements like excitement, hashtags, and engagement features such as questions or emojis."""
    generation_kwargs = {
        'max_length': 150,
        'num_beams': 5,
        'do_sample': True,
        'temperature': 0.9,
    }
    candidates = model(prompt, **generation_kwargs)
    return candidates[0]['generated_text']

In [None]:
import copy
def suggest_enhancement(tweet_data, model=lgbm, pipeline=full_pipeline, step=3, text_model=generator):
    """Score a tweet, search for a better posting slot and draft improvement tips.

    Args:
        tweet_data: Dict describing one tweet. Must contain ``text`` plus every
            column the fitted ``pipeline`` expects (for example the output of
            ``extract_text_features_from_json``); the optional ``has_*`` flags
            drive the textual suggestions.
        model: Fitted regressor exposing ``predict``.
        pipeline: Fitted feature transformer exposing ``transform``.
        step: Hour increment of the posting-time grid (0, step, 2*step, ...).
        text_model: Optional text2text callable. When falsy no rewrite is
            generated and both ``prompt`` and ``Enhanced Tweet`` are ``None``.

    Returns:
        Dict with the keys ``text``, ``prompt``, ``Predicted Engagement
        Score``, ``Best Posting Time``, ``Best Score Estimate``, ``Suggested
        Enhancements`` and ``Enhanced Tweet``.
    """
    import re  # local import so the function does not depend on cell order

    current_pred = model.predict(pipeline.transform(pd.DataFrame([tweet_data])))[0]

    suggestions = []

    # The best day and time: score every (weekday, hour) slot in one batch
    # instead of running the feature pipeline once per slot.
    # NOTE(review): the day names only influence the score if the pipeline's
    # one-hot encoder was fitted on day-name strings — verify against the
    # training preprocessing.
    weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    candidates = [(day, hour) for day in weekdays for hour in range(0, 24, step)]

    best_score = -np.inf
    best_hour = None
    best_day = None
    if candidates:
        grid_df = pd.DataFrame(
            [{**tweet_data, 'Weekday': day, 'Hour': hour} for day, hour in candidates]
        )
        scores = np.asarray(model.predict(pipeline.transform(grid_df))).ravel()
        # argmax returns the first maximum, i.e. the earliest slot on ties.
        top = int(np.argmax(scores))
        best_score = scores[top]
        best_day, best_hour = candidates[top]

    if best_score > current_pred:
        suggestions.append(
            f"📅 Consider posting on {best_day} around {best_hour}:00 — model predicts higher engagement."
        )

    text = tweet_data['text']
    if "apply" in text.lower():
        suggestions.append("✅ Try a more exciting tone'")

    # Count the hashtags in the text itself: the has_hashtag feature is only a
    # 0/1 flag, so comparing it with 2 would fire for every tweet.
    hashtag_count = len(re.findall(r'#\w+', text))
    if hashtag_count == 0:
        suggestions.append("✅ Add relevant hashtags")
    elif hashtag_count < 2:
        suggestions.append("✅ Use at least 2–3 hashtags.")

    if tweet_data.get("has_emoji", 0) == 0:
        suggestions.append("✅ Add emojis to grab attention 🎯🔥💼")
    if tweet_data.get("has_exclaim", 0) == 0:
        suggestions.append("✅ Use exclamation marks to boost excitement!")
    if tweet_data.get("has_question", 0) == 0:
        suggestions.append("✅ Ask a question to engage your audience.")

    # Both stay None when no text model is supplied, so the returned dict can
    # always be built.
    prompt = None
    enhanced_text = None
    if text_model:
        prompt = f"""Original tweet: "{text}"

Suggestions:
- {chr(10).join(suggestions)}

Now, please rewrite the tweet to make it more engaging and exciting!
- Include emojis to grab attention
- Add relevant hashtags
- Use action verbs to create urgency and excitement
- Try using exclamation marks to energize the tone
- Consider adding a question to engage the audience

Ensure the tweet remains true to the original message but make it more attractive and interactive!"""

        enhanced_text = text_model(prompt, max_length=150, num_beams=5, do_sample=True, temperature=1.3)[0]['generated_text']

    return {
        "text": text,
        "prompt": prompt,
        "Predicted Engagement Score": round(float(current_pred), 2),
        "Best Posting Time": f"{best_day} at {best_hour}:00",
        "Best Score Estimate": round(float(best_score), 2),
        "Suggested Enhancements": suggestions,
        "Enhanced Tweet": enhanced_text
    }

In [None]:
# End-to-end demo: a raw tweet with its posting time ...
example = {
    "text": "New job opening at our company! Apply here: https://vk.com #hiring #jobs",
    "Sentiment": "Positive",
    "Weekday": "Monday",
    "Hour": 18.0
}
# ... is enriched with the text features (returned as a one-element list of
# records) ...
sg=extract_text_features_from_json(example)
print(sg)
# ... and the single record is scored and rewritten.
suggest_enhancement(sg[0],text_model=generator)
