In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers datasets accelerate --quiet
!pip install transformers torch
!pip install pandas transformers sentence-transformers
!pip install datasets
!pip install --upgrade transformers
!pip install transformers pandas
!pip install transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [27]:
!pip install transformers accelerate bitsandbytes


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.5


In [3]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
import torch
from google.colab import drive
import os
import re

In [None]:
import pandas as pd
import re

class SocialPostGenerator:
    def __init__(self, dataset_path):
        self.df = pd.read_csv(dataset_path)
        self.df['keywords'] = self.df['caption'].apply(self.extract_keywords)
        self.available_topics = self.df['topic'].str.lower().unique()

    def extract_keywords(self, text):
        return set(re.findall(r'\b[a-zA-Z]{4,}\b', text.lower()))

    def detect_topic(self, prompt, min_score_threshold=1):  # 👈 CHANGEMENT ICI
        prompt_keywords = self.extract_keywords(prompt)
        topic_scores = {}

        for topic in self.available_topics:
            topic_posts = self.df[self.df['topic'].str.lower() == topic]
            topic_keywords = set().union(*topic_posts['keywords'])
            score = len(prompt_keywords & topic_keywords)
            topic_scores[topic] = score

        # DEBUG: Afficher les scores
        print("Topic scores:", topic_scores)

        best_match = max(topic_scores, key=topic_scores.get)
        if topic_scores[best_match] >= min_score_threshold:
            return best_match

        # Fallback : vérifier si le mot-clé du sujet est présent dans le prompt brut
        for topic in self.available_topics:
            if topic in prompt.lower():
                print(f"⚠️ Fallback: '{topic}' détecté directement dans le prompt.")
                return topic

        return None

    def generate_posts(self, prompt, num_options=2):
        prompt_keywords = self.extract_keywords(prompt)
        self.df['score'] = self.df['keywords'].apply(
            lambda x: len(x & prompt_keywords)
        )

        topic = self.detect_topic(prompt)
        if not topic:
            return "❌ Aucun sujet pertinent détecté pour ce prompt."

        relevant_posts = self.df[self.df['topic'].str.lower() == topic]

        # Trier et supprimer les doublons de caption
        relevant_posts = relevant_posts.sort_values('score', ascending=False)
        relevant_posts = relevant_posts.drop_duplicates(subset='caption')
        selected_posts = relevant_posts.head(num_options).to_dict('records')

        output = []
        for i, post in enumerate(selected_posts, 1):
            output.append(
                f"Option {i}\n"
                f"{post['caption']} {post['hashtags']}\n"
                f"CTA: {post['cta']}\n"
                f"Tone: {post['tone']} | Platform: {post['platform']}\n"
            )

        return "\n".join(output) if output else "No matching posts found."


# Example usage

generator = SocialPostGenerator("/content/drive/MyDrive/Post generation/generated_social_posts_large.csv")

prompts = [
    "Looking for an Instagram caption to promote my sustainable fashion brand.",
    "Promote my latest wireless earbuds. Focus on sound quality and design.",
    "Announce the new iphone release ",
    "I want a social post to promote my skincare serum for glowing skin."
]

for prompt in prompts:
    print(f"\n{'='*50}\nPrompt: {prompt}\n{'='*50}")
    print(generator.generate_posts(prompt))



Prompt: Looking for an Instagram caption to promote my sustainable fashion brand.
Topic scores: {'beauty': 0, 'fashion': 0, 'electronics': 0, 'tech': 0}
⚠️ Fallback: 'fashion' détecté directement dans le prompt.
Option 1
Step up your style game with these must-have looks 👗 🧥 #StyleInspo #OOTD #FashionTrends
CTA: See what’s trending!
Tone: trendy | Platform: TikTok

Option 2
Style that speaks louder than words 👠 👗 #Trendy #OOTD #RunwayLook #FashionTrends
CTA: Get inspired!
Tone: trendy | Platform: Instagram


Prompt: Promote my latest wireless earbuds. Focus on sound quality and design.
Topic scores: {'beauty': 0, 'fashion': 0, 'electronics': 1, 'tech': 1}
Option 1
🔋 📷 Experience sound like never before – new headphones review up now! #GadgetLover #Electronics #SmartDevices
CTA: Check it out!
Tone: detailed | Platform: Facebook

Option 2
🔋 📺 Experience sound like never before – new headphones review up now! #SmartDevices #TechGear #GadgetLover
CTA: Check it out!
Tone: enthusiastic | Pl

**GP2**

In [None]:
import pandas as pd
import re
from transformers import pipeline, set_seed

class HybridSocialPostGenerator:
    def __init__(self, dataset_path, model_name='gpt2'):
        self.df = pd.read_csv(dataset_path)
        self.df['keywords'] = self.df['caption'].apply(self.extract_keywords)
        self.available_topics = self.df['topic'].str.lower().unique()
        self.generator = pipeline('text-generation', model=model_name)
        set_seed(42)

    def extract_keywords(self, text):
        return set(re.findall(r'\b[a-zA-Z]{4,}\b', text.lower()))

    def detect_topic(self, prompt):
        prompt_keywords = self.extract_keywords(prompt)
        topic_scores = {}
        for topic in self.available_topics:
            topic_posts = self.df[self.df['topic'].str.lower() == topic]
            topic_keywords = set().union(*topic_posts['keywords'])
            score = len(prompt_keywords & topic_keywords)
            topic_scores[topic] = score
        best_match = max(topic_scores, key=topic_scores.get)
        return best_match if topic_scores[best_match] > 0 else None

    def enhance_caption(self, prompt, base_caption, tone, platform):
        input_prompt = (
            f"You're writing a social media post for {platform} with a {tone} tone. "
            f"The original post was:\n'{base_caption}'\n"
            f"Make it more engaging, keep it under 280 characters, and add emojis/hashtags if needed.\nNew version:"
        )
        output = self.generator(input_prompt, max_length=100, num_return_sequences=1)
        return output[0]['generated_text'].split("New version:")[-1].strip()

    def generate_custom_posts(self, prompt, max_options=3):
        prompt_keywords = self.extract_keywords(prompt)
        topic = self.detect_topic(prompt)

        if not topic:
            return "No topic found for this prompt."

        self.df['score'] = self.df['keywords'].apply(lambda x: len(x & prompt_keywords))
        relevant_posts = self.df[self.df['topic'].str.lower() == topic]
        relevant_posts = relevant_posts.sort_values('score', ascending=False)
        unique_posts = relevant_posts.drop_duplicates(subset=['caption', 'hashtags', 'cta'])
        selected = unique_posts.head(max_options).to_dict('records')

        results = []
        for i, post in enumerate(selected, 1):
            enhanced = self.enhance_caption(prompt, post['caption'], post['tone'], post['platform'])
            results.append(
                f"Option {i}\n"
                f"{enhanced}\n"
                f"Original CTA: {post['cta']}\n"
                f"Tone: {post['tone']} | Platform: {post['platform']}\n"
            )

        return "\n".join(results) if results else "No variations generated."

# Example usage
generator = HybridSocialPostGenerator("/content/drive/MyDrive/generated_social_posts_large.csv")

prompts = [
    "Launching our new AI email assistant",
    "Promo for summer skincare line",
    "Back to school campaign for students"
]

for prompt in prompts:
    print(f"\n{'='*60}\nPrompt: {prompt}\n{'='*60}")
    print(generator.generate_custom_posts(prompt))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Prompt: Launching our new AI email assistant
No topic found for this prompt.

Prompt: Promo for summer skincare line


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Option 1
'Instagram: @szomjulg @sympress @crismycris @jdg @bronco @harr
Original CTA: Try it now!
Tone: friendly | Platform: TikTok

Option 2
The most popular Twitter: 140 characters

Twitter: 140 characters Facebook: 90,000 followers, 500 followers, 400 likes
Original CTA: Your glow starts here!
Tone: friendly | Platform: YouTube

Option 3
'Miguel del Toro is going to be in for a challenge from day one… We'll put on a show!'
'This is what you did
Original CTA: Watch the tutorial!
Tone: glamorous | Platform: TikTok


Prompt: Back to school campaign for students
No topic found for this prompt.


**BART**

In [None]:
import pandas as pd
import re
from transformers import pipeline

class BartSocialPostGenerator:
    def __init__(self, dataset_path, model_name='facebook/bart-large-cnn'):
        self.df = pd.read_csv(dataset_path)
        self.df['keywords'] = self.df['caption'].apply(self.extract_keywords)
        self.available_topics = self.df['topic'].str.lower().unique()
        self.generator = pipeline('text2text-generation', model=model_name)

    def extract_keywords(self, text):
        return set(re.findall(r'\b[a-zA-Z]{4,}\b', text.lower()))

    def detect_topic(self, prompt):
        prompt_keywords = self.extract_keywords(prompt)
        topic_scores = {}
        for topic in self.available_topics:
            topic_posts = self.df[self.df['topic'].str.lower() == topic]
            topic_keywords = set().union(*topic_posts['keywords'])
            topic_scores[topic] = len(prompt_keywords & topic_keywords)
        best_match = max(topic_scores, key=topic_scores.get)
        return best_match if topic_scores[best_match] > 0 else None

    def enhance_post(self, base_caption, tone, platform):
        prompt = (
            f"Rewrite the following social media post for {platform} with a {tone} tone. "
            f"Make it engaging and add hashtags or emojis if suitable:\n\n"
            f"{base_caption}"
        )
        response = self.generator(prompt, max_length=80, clean_up_tokenization_spaces=True)[0]['generated_text']
        return response.strip()

    def generate_custom_posts(self, prompt, max_options=3):
        prompt_keywords = self.extract_keywords(prompt)
        topic = self.detect_topic(prompt)

        if not topic:
            return "No topic found for this prompt."

        self.df['score'] = self.df['keywords'].apply(lambda x: len(x & prompt_keywords))
        relevant_posts = self.df[self.df['topic'].str.lower() == topic]
        relevant_posts = relevant_posts.sort_values('score', ascending=False)
        unique_posts = relevant_posts.drop_duplicates(subset=['caption', 'hashtags', 'cta'])
        selected = unique_posts.head(max_options).to_dict('records')

        results = []
        for i, post in enumerate(selected, 1):
            enhanced = self.enhance_post(post['caption'], post['tone'], post['platform'])
            results.append(
                f"Option {i}\n"
                f"{enhanced}\n"
                f"Original CTA: {post['cta']}\n"
                f"Tone: {post['tone']} | Platform: {post['platform']}\n"
            )

        return "\n".join(results) if results else "No variations generated."


In [None]:
generator = BartSocialPostGenerator("/content/drive/MyDrive/generated_social_posts_large.csv")
prompt = "New skincare product launch for dry skin"
print(generator.generate_custom_posts(prompt))


Device set to use cuda:0


Option 1
Skincare made simple. Your routine just got an upgrade. Rewrite the following social media post for TikTok with a friendly tone. Make it engaging and add hashtags or emojis if suitable. If you have any questions, feel free to contact us at editorial@dailymail.co.uk.
Original CTA: Try it now!
Tone: friendly | Platform: TikTok

Option 2
Skincare made simple. Your routine just got an upgrade. Rewrite the following social media post for YouTube with a friendly tone. Make it engaging and add hashtags or emojis if suitable. If you have a YouTube channel, please share it with us. We would love to hear from you.
Original CTA: Your glow starts here!
Tone: friendly | Platform: YouTube

Option 3
Skincare made simple. Your routine just got an upgrade. Rewrite the following social media post for TikTok with a glamorous tone. Make it engaging and add hashtags or emojis if suitable. If you have any questions, feel free to contact us at editorial@dailymail.co.uk.
Original CTA: Watch the tutor

**google/flan-t5-large**

In [None]:
import pandas as pd
import re
import random
from transformers import pipeline

class EmojiGOOGLEPostGenerator:
    def __init__(self, dataset_path, model_name='google/flan-t5-large'):
        # Load dataset
        self.df = pd.read_csv(dataset_path)

        # Ensure 'emoji' column exists and handle missing values
        if 'emoji' not in self.df.columns:
            self.df['emoji'] = ''
        self.df['emoji'] = self.df['emoji'].fillna('')

        # Extract keywords for each caption
        self.df['keywords'] = self.df['caption'].apply(self.extract_keywords)
        self.available_topics = self.df['topic'].str.lower().unique()

        # Initialize the model
        self.generator = pipeline('text2text-generation', model=model_name, device=0)  # CUDA

    def extract_keywords(self, text):
        return set(re.findall(r'\b[a-zA-Z]{4,}\b', text.lower()))

    def detect_topic(self, prompt):
        prompt_keywords = self.extract_keywords(prompt)
        topic_scores = {}

        # Iterate through each topic and check how well it matches
        for topic in self.available_topics:
            topic_posts = self.df[self.df['topic'].str.lower() == topic]
            topic_keywords = set().union(*topic_posts['keywords'])

            # Score based on the number of keyword matches
            score = len(prompt_keywords & topic_keywords)

            topic_scores[topic] = score

        # If no match found, try a broader match (could be more lenient or use a different strategy)
        if topic_scores:
            best_match = max(topic_scores, key=topic_scores.get)
            return best_match if topic_scores[best_match] > 0 else None
        return None

    def enhance_caption(self, base_caption, tone, platform):
        # Clean and structured prompt for generating a caption
        simple_prompt = f"Create a {tone} social media post for {platform}: {base_caption}"
        response = self.generator(simple_prompt, max_length=80, clean_up_tokenization_spaces=True)[0]['generated_text']

        # Clean up the unwanted text from the response
        clean_response = response.split(f"Create a {tone} social media post for {platform}:")[1] if f"Create a {tone} social media post for {platform}:" in response else response
        return clean_response.strip()

    def generate_clean_posts(self, prompt, max_options=3):
        topic = self.detect_topic(prompt)
        if not topic:
            return "⚠️ No matching topic found."

        print(f"Topic detected: {topic}")

        prompt_keywords = self.extract_keywords(prompt)
        self.df['score'] = self.df['keywords'].apply(lambda x: len(x & prompt_keywords))

        print(f"Prompt keywords: {prompt_keywords}")
        print(f"Keywords for each post: {self.df['keywords']}")

        top_posts = self.df[self.df['topic'].str.lower() == topic].sort_values('score', ascending=False)
        print(f"Posts for '{topic}' topic: {top_posts}")

        unique_posts = top_posts.drop_duplicates(subset=['caption', 'hashtags', 'cta'])
        print(f"Unique posts for '{topic}' topic: {unique_posts.shape[0]}")

        selected = unique_posts.head(max_options).to_dict('records')

        output = []
        for i, post in enumerate(selected, 1):
            # Generate enhanced caption
            new_caption = self.enhance_caption(post['caption'], post['tone'], post['platform'])

            # Inject original emojis (if exist)
            emoji = post['emoji'] if post['emoji'] else self.get_random_emojis()

            # Combine caption with emojis
            final_caption = f"{new_caption} {emoji}".strip()

            # Prepare the structured output
            output.append(f"Option {i}:\n{final_caption}\n\n")

        if output:
            return "".join(output)
        else:
            return "⚠️ No valid posts generated."

    def get_random_emojis(self, count=2):
        all_emojis = self.df['emoji'].dropna().unique().tolist()
        all_emojis = [e for e in all_emojis if e.strip()]
        return " ".join(random.sample(all_emojis, min(count, len(all_emojis)))) if all_emojis else ""


In [None]:
generator = EmojiGOOGLEPostGenerator("/content/drive/MyDrive/generated_social_posts_large.csv")

prompt = "New skincare product launch for dry skin"
print(generator.generate_clean_posts(prompt))


Device set to use cuda:0


Topic detected: beauty
Prompt keywords: {'skin', 'launch', 'skincare', 'product'}
Keywords for each post: 0         {full, with, look, makeup, watch, that, moves}
1      {with, have, your, step, these, style, looks, ...
2         {full, with, look, makeup, watch, that, moves}
3      {than, louder, words, speaks, that, style, tre...
4      {like, never, sound, before, headphones, revie...
                             ...                        
995        {with, hello, changing, future, game, latest}
996    {with, have, your, step, these, style, looks, ...
997        {gadget, blowing, dropped, this, just, minds}
998                           {this, find, best, camera}
999       {full, with, look, makeup, watch, that, moves}
Name: keywords, Length: 1000, dtype: object
Posts for 'beauty' topic:                                                caption   topic emojis  \
43   Skincare made simple 🧴 🧖 Your routine just got...  beauty   🧴, 🧖   
46   Skincare made simple 💋 🧖 Your routine just got

**google/flan-t5-base+all-MiniLM-L6-v2**

In [5]:
import pandas as pd
import re
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

class SocialPostGenerator:
    def __init__(self, dataset_path):
        self.df = pd.read_csv(dataset_path)
        self.df['keywords'] = self.df['caption'].apply(self.extract_keywords)
        self.available_topics = self.df['topic'].str.lower().unique()

        # NLP tools
        self.rewriter = pipeline("text2text-generation", model="google/flan-t5-base")
        self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

        # Precompute topic embeddings
        self.topic_embeddings = {
            topic: self.embedding_model.encode(
                " ".join(self.df[self.df['topic'].str.lower() == topic]['caption'].tolist()),
                convert_to_tensor=True
            )
            for topic in self.available_topics
        }

    def extract_keywords(self, text):
        return set(re.findall(r'\b[a-zA-Z]{4,}\b', text.lower()))

    def reformulate_prompt(self, prompt):
        try:
            reformulated = self.rewriter(
                f"Paraphrase this sentence: {prompt}",
                max_length=60,
                do_sample=False
            )[0]['generated_text']
            print(f"🔁 Reformulated: {reformulated}")
            return reformulated
        except:
            return prompt

    def detect_topic_semantic(self, prompt):
        prompt_emb = self.embedding_model.encode(prompt, convert_to_tensor=True)
        scores = {
            topic: util.cos_sim(prompt_emb, topic_emb).item()
            for topic, topic_emb in self.topic_embeddings.items()
        }
        print("📊 Semantic topic scores:", scores)
        best_topic = max(scores, key=scores.get)
        return best_topic if scores[best_topic] > 0.2 else None

    def detect_topic_keywords(self, prompt, min_score_threshold=1):
        prompt_keywords = self.extract_keywords(prompt)
        topic_scores = {}

        for topic in self.available_topics:
            topic_posts = self.df[self.df['topic'].str.lower() == topic]
            topic_keywords = set().union(*topic_posts['keywords'])
            score = len(prompt_keywords & topic_keywords)
            topic_scores[topic] = score

        print("🔤 Keyword topic scores:", topic_scores)
        best_match = max(topic_scores, key=topic_scores.get)
        if topic_scores[best_match] >= min_score_threshold:
            return best_match

        for topic in self.available_topics:
            if topic in prompt.lower():
                print(f"⚠️ Fallback keyword match: {topic}")
                return topic

        return None

    def generate_posts(self, prompt, num_options=2):
        original_prompt = prompt
        prompt = self.reformulate_prompt(prompt)

        topic = self.detect_topic_semantic(prompt)
        if not topic:
            topic = self.detect_topic_keywords(prompt)

        if not topic:
            return "❌ Aucun sujet pertinent détecté pour ce prompt."

        print(f"✅ Topic détecté: {topic}")
        prompt_keywords = self.extract_keywords(prompt)
        self.df['score'] = self.df['keywords'].apply(
            lambda x: len(x & prompt_keywords)
        )

        relevant_posts = self.df[self.df['topic'].str.lower() == topic]
        relevant_posts = relevant_posts.sort_values('score', ascending=False)
        relevant_posts = relevant_posts.drop_duplicates(subset='caption')
        selected_posts = relevant_posts.head(num_options).to_dict('records')

        output = []
        for i, post in enumerate(selected_posts, 1):
            output.append(
                f"Option {i}\n"
                f"{post['caption']} {post['hashtags']}\n"
                f"CTA: {post['cta']}\n"
                f"Tone: {post['tone']} | Platform: {post['platform']}\n"
            )

        return "\n".join(output) if output else "No matching posts found."


# 🚀 Example usage
generator = SocialPostGenerator("/content/drive/MyDrive/Post_generation/generated_social_posts_large.csv")

prompts = [
    "Our new serum uses microcurrent technology to stimulate your skin cells and boost collagen. Smart skincare starts here.",
    "Write a social caption for a campaign that launches an AI-powered sustainable fashion app targeting Gen Z. The vibe should be rebellious, stylish, eco-conscious, and TikTok-ready. Include a clever tagline, emojis, and 3 on-trend hashtags.",
    "I'm launching a new streetwear collection for summer. I need a catchy post.",
    "Promote my latest wireless earbuds. Focus on sound quality and design.",
    "Generate a caption for launching my new ai tool.",
    "Write a short and glamorous post for my latest lipgloss tutorial.",
    "Teach your skin, not just your brain. AI meets your vanity — and it glows.",
    "This new wand can adapt to every look you want — soft glam, full beat, or even bare glow. Let’s show it off"

]



for prompt in prompts:
    print(f"\n{'='*50}\nPrompt: {prompt}\n{'='*50}")
    print(generator.generate_posts(prompt))


Device set to use cuda:0



Prompt: Our new serum uses microcurrent technology to stimulate your skin cells and boost collagen. Smart skincare starts here.
🔁 Reformulated: Our new serum uses microcurrent technology to stimulate your skin cells and boost collagen. Smart skincare starts here.
📊 Semantic topic scores: {'beauty': 0.328197181224823, 'fashion': 0.028874481096863747, 'electronics': 0.050334807485342026, 'tech': 0.0741412490606308, nan: 0.009272335097193718}
✅ Topic détecté: beauty
Option 1
Skincare made simple 🧖 💋 Your routine just got an upgrade. #BeautyTips #MakeupMagic #GlowUp
CTA: Watch the tutorial!
Tone: friendly | Platform: TikTok

Option 2
Skincare made simple 🌸 🧖 Your routine just got an upgrade. #GlowUp #BeautyHacks #BeautyTips
CTA: Radiate confidence!
Tone: glamorous | Platform: TikTok


Prompt: Write a social caption for a campaign that launches an AI-powered sustainable fashion app targeting Gen Z. The vibe should be rebellious, stylish, eco-conscious, and TikTok-ready. Include a clever ta

**facebook/bart-large-mnli**

In [None]:
import pandas as pd
from transformers import pipeline
from sklearn.preprocessing import LabelEncoder

# Load dataset (replace with your dataset path)
df = pd.read_csv('/content/drive/MyDrive/Post generation/generated_social_posts_large.csv')  # Replace with your dataset path

# Encode topics (This step is optional if you already have the labels)
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['topic'])

# Initialize zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define possible topics
possible_topics = df['topic'].unique().tolist()

# Function to classify text into topics
def classify_text(text):
    result = classifier(text, candidate_labels=possible_topics)
    return result['labels'][0], result['scores'][0]  # Return top label and score

# Example usage with prompts
prompts = [
    "I'm launching a new streetwear collection for summer. I need a catchy post.",
    "Promote my latest wireless earbuds. Focus on sound quality and design.",
    "Announce the beta release of my new machine learning platform.",
    "Write a short and glamorous post for my latest makeup tutorial."
]

for prompt in prompts:
    print(f"\n{'='*50}\nPrompt: {prompt}\n{'='*50}")
    label, score = classify_text(prompt)
    print(f"Predicted Topic: {label} | Confidence Score: {score:.4f}")


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0



Prompt: I'm launching a new streetwear collection for summer. I need a catchy post.
Predicted Topic: fashion | Confidence Score: 0.9711

Prompt: Promote my latest wireless earbuds. Focus on sound quality and design.
Predicted Topic: tech | Confidence Score: 0.7353

Prompt: Announce the beta release of my new machine learning platform.
Predicted Topic: tech | Confidence Score: 0.9585

Prompt: Write a short and glamorous post for my latest makeup tutorial.
Predicted Topic: beauty | Confidence Score: 0.8560


In [None]:
import pandas as pd
import re
import random
from transformers import pipeline
from difflib import get_close_matches

# Load dataset (replace with your dataset path)
df = pd.read_csv('/content/drive/MyDrive/Post_generation/generated_social_posts_large.csv')

# Initialize zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define possible topics (unique from dataset)
possible_topics = df['topic'].dropna().unique().tolist()

# Preprocessing text for better classification
def preprocess_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()

# Classify text and return label + score, or "unknown" if score too low
def classify_text(text, threshold=0.4):
    result = classifier(text, candidate_labels=possible_topics)
    top_label = result['labels'][0]
    top_score = result['scores'][0]
    if top_score < threshold:
        return "unknown", top_score
    return top_label, top_score

# Fallback to closest matching topic using fuzzy matching
def fallback_topic(topic, all_topics):
    match = get_close_matches(topic, all_topics, n=1, cutoff=0.6)
    return match[0] if match else None

# Generate post by filtering dataset and sampling from relevant posts
def generate_post(topic, num_options=2):
    relevant_posts = df[df['topic'].str.lower() == topic.lower()]

    if relevant_posts.empty:
        fallback = fallback_topic(topic, possible_topics)
        if fallback:
            print(f"🔁 Using fallback topic: {fallback}")
            relevant_posts = df[df['topic'].str.lower() == fallback.lower()]
            topic = fallback
        if relevant_posts.empty:
            return "❌ No posts found for this topic."

    sample_size = min(num_options, len(relevant_posts))
    selected_posts = relevant_posts.sample(n=sample_size, random_state=42).to_dict('records')

    output = []
    for i, post in enumerate(selected_posts, 1):
        output.append(
            f"Option {i}\n"
            f"{post['caption']} {post['hashtags']}\n"
            f"CTA: {post['cta']}\n"
            f"Tone: {post['tone']} | Platform: {post['platform']}\n"
        )

    return "\n".join(output)

# Example prompts
prompts = [
    "Write a social caption for a campaign that launches an AI-powered sustainable fashion app targeting Gen Z. The vibe should be rebellious, stylish, eco-conscious, and TikTok-ready. Include a clever tagline, emojis, and 3 on-trend hashtags.",
    "Our new serum uses microcurrent technology to stimulate your skin cells and boost collagen. Smart skincare starts here.",
    "Just one swipe and our AI-powered brush adapts to your skin type in real time. Beauty meets intelligence",
    "This isn’t just a mirror — it’s your personal beauty lab. With facial recognition and skin analysis powered by AI.",
    "I'm launching a new streetwear collection for summer. I need a catchy post.",
    "Promote my latest wireless earbuds. Focus on sound quality and design.",
    "Generate a caption for launching my new ai tool.",
    "Write a short and glamorous post for my latest lipgloss tutorial.",
    "Teach your skin, not just your brain. AI meets your vanity — and it glows.",
    "This new wand can adapt to every look you want — soft glam, full beat, or even bare glow. Let’s show it off"
]

# Run classification + generation
for prompt in prompts:
    print(f"\n{'='*50}\nPrompt: {prompt}\n{'='*50}")

    clean_prompt = preprocess_text(prompt)
    label, score = classify_text(clean_prompt)

    print(f"Predicted Topic: {label} | Confidence Score: {score:.4f}")
    if label == "unknown":
        print("⚠️ Low confidence — result may be unreliable.")

    generated_post = generate_post(label)
    print(f"Generated Post:\n{generated_post}")


Device set to use cuda:0



Prompt: Write a social caption for a campaign that launches an AI-powered sustainable fashion app targeting Gen Z. The vibe should be rebellious, stylish, eco-conscious, and TikTok-ready. Include a clever tagline, emojis, and 3 on-trend hashtags.
Predicted Topic: fashion | Confidence Score: 0.5947
Generated Post:
Option 1
From the runway to your wardrobe 🧥 🕶️ What’s your favorite piece? #StreetStyle #StyleInspo #OOTD
CTA: Your style upgrade!
Tone: inspirational | Platform: TikTok

Option 2
Step up your style game with these must-have looks 🕶️ 👗 #StreetStyle #RunwayLook #FashionTrends
CTA: See what’s trending!
Tone: playful | Platform: TikTok


Prompt: Our new serum uses microcurrent technology to stimulate your skin cells and boost collagen. Smart skincare starts here.
Predicted Topic: tech | Confidence Score: 0.8680
Generated Post:
Option 1
🚀 📱 This gadget just dropped and it’s blowing our minds! #TechNews #FutureTech #Innovation
CTA: Learn how!
Tone: informative | Platform: Twitter


**this is the oneee (not really) its bart + neo fails!**

In [None]:
import pandas as pd
from transformers import pipeline

# Load dataset (replace with your dataset path)
df = pd.read_csv('/content/drive/MyDrive/Post_generation/generated_social_posts_large.csv')  # Replace with your dataset path

# Initialize zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
generator = pipeline("text-generation", model="EleutherAI/gpt-neo-2.7B")  # You can use a different model here if necessary

# Define possible topics (can be extracted from the dataset or manually defined)
possible_topics = df['topic'].unique().tolist()

# Function to classify text into topics
def classify_text(text):
    result = classifier(text, candidate_labels=possible_topics)
    return result['labels'][0], result['scores'][0]  # Return top label and score

# Function to generate posts based on detected topic using few-shot learning
def generate_post_with_few_shot(topic, num_examples=3):
    # Filter dataset by the detected topic
    relevant_posts = df[df['topic'].str.lower() == topic.lower()]

    if relevant_posts.empty:
        return "❌ No posts found for this topic."

    # Select a few example posts
    examples = relevant_posts.sample(n=num_examples, random_state=42)

    # Create a prompt for LLM using these examples
    prompt = "Here are a few examples of social media posts:\n\n"

    for i, post in enumerate(examples.to_dict('records'), 1):
        prompt += f"Example {i}:\n"
        prompt += f"Caption: {post['caption']} {post['hashtags']}\n"
        prompt += f"CTA: {post['cta']}\n"
        prompt += f"Tone: {post['tone']} | Platform: {post['platform']}\n\n"

    prompt += f"Based on these examples, generate a new post for the topic '{topic}':"

    # Generate a new post with LLM based on the few-shot prompt
    generated_post = generator(prompt, max_new_tokens=100, num_return_sequences=1)[0]['generated_text']

    return generated_post.strip()

# Example usage with prompts
prompts = [
"Write a rebellious, stylish, eco-conscious caption for a sustainable fashion app targeting Gen Z. Make it TikTok-ready with bold language, trendy emojis, and 3 eco-friendly hashtags like #SustainableFashion, #EcoChic, #GenZStyle."
 ]

# Generating posts based on detected topics
for prompt in prompts:
    print(f"\n{'='*50}\nPrompt: {prompt}\n{'='*50}")

    # Detect the topic
    label, score = classify_text(prompt)
    print(f"Predicted Topic: {label} | Confidence Score: {score:.4f}")

    # Generate a post based on the detected topic using few-shot learning
    generated_post = generate_post_with_few_shot(label)
    print(f"Generated Post:\n{generated_post}")


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0
Device set to use cuda:0



Prompt: Write a rebellious, stylish, eco-conscious caption for a sustainable fashion app targeting Gen Z. Make it TikTok-ready with bold language, trendy emojis, and 3 eco-friendly hashtags like #SustainableFashion, #EcoChic, #GenZStyle.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicted Topic: fashion | Confidence Score: 0.7257
Generated Post:
Here are a few examples of social media posts:

Example 1:
Caption: From the runway to your wardrobe 🧥 🕶️ What’s your favorite piece? #StreetStyle #StyleInspo #OOTD
CTA: Your style upgrade!
Tone: inspirational | Platform: TikTok

Example 2:
Caption: Step up your style game with these must-have looks 🕶️ 👗 #StreetStyle #RunwayLook #FashionTrends
CTA: See what’s trending!
Tone: playful | Platform: TikTok

Example 3:
Caption: From the runway to your wardrobe 👗 🧥 What’s your favorite piece? #StyleInspo #FashionTrends #RunwayLook
CTA: Level up your look!
Tone: inspirational | Platform: Pinterest

Based on these examples, generate a new post for the topic 'fashion':

Example 1:

I would love to read, what's trending in fashion right now.

CTA: Find something fun to share in your fashion-based story.
Tone: inspirational | Platform: Instagram

Example 2:

I would love to see, what's trending in fashion right now.

This is where

***google flan base 2  + L6-v2 ***

In [3]:
import pandas as pd
import re
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

class SocialPostGenerator:
    def __init__(self, dataset_path):
        self.df = pd.read_csv(dataset_path)
        self.df['keywords'] = self.df['caption'].apply(self.extract_keywords)
        self.available_topics = self.df['topic'].str.lower().unique()

        # NLP tools
        self.rewriter = pipeline("text2text-generation", model="google/flan-t5-base")
        self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        self.enricher = pipeline("text2text-generation", model="google/flan-t5-large")

        # Precompute topic embeddings
        self.topic_embeddings = {
            topic: self.embedding_model.encode(
                " ".join(self.df[self.df['topic'].str.lower() == topic]['caption'].tolist()),
                convert_to_tensor=True
            )
            for topic in self.available_topics
        }

    def extract_keywords(self, text):
        return set(re.findall(r'\b[a-zA-Z]{4,}\b', text.lower()))

    def reformulate_prompt(self, prompt):
        try:
            reformulated = self.rewriter(
                f"Paraphrase this sentence: {prompt}",
                max_length=60,
                do_sample=False
            )[0]['generated_text']
            print(f"🔁 Reformulated: {reformulated}")
            return reformulated
        except:
            return prompt

    def detect_topic_semantic(self, prompt):
        prompt_emb = self.embedding_model.encode(prompt, convert_to_tensor=True)
        scores = {
            topic: util.cos_sim(prompt_emb, topic_emb).item()
            for topic, topic_emb in self.topic_embeddings.items()
        }
        print("📊 Semantic topic scores:", scores)
        best_topic = max(scores, key=scores.get)
        return best_topic if scores[best_topic] > 0.2 else None

    def detect_topic_keywords(self, prompt, min_score_threshold=1):
        prompt_keywords = self.extract_keywords(prompt)
        topic_scores = {}

        for topic in self.available_topics:
            topic_posts = self.df[self.df['topic'].str.lower() == topic]
            topic_keywords = set().union(*topic_posts['keywords'])
            score = len(prompt_keywords & topic_keywords)
            topic_scores[topic] = score

        print("🔤 Keyword topic scores:", topic_scores)
        best_match = max(topic_scores, key=topic_scores.get)
        if topic_scores[best_match] >= min_score_threshold:
            return best_match

        for topic in self.available_topics:
            if topic in prompt.lower():
                print(f"⚠️ Fallback keyword match: {topic}")
                return topic

        return None

    def enrich_post(self, post_data, num_variants=2):
        enriched_versions = []

        prompt_template = (
            "Write a social media post with a {tone} tone. "
            "Use this caption: '{caption}', include these hashtags: {hashtags}, and end with this call to action: '{cta}'. "
            "Make it engaging, natural, and include some fitting emojis."
        )

        for _ in range(num_variants):
            prompt = prompt_template.format(
                tone=post_data['tone'],
                caption=post_data['caption'],
                hashtags=post_data['hashtags'],
                cta=post_data['cta']
            )

            try:
                enriched = self.enricher(prompt, max_length=80, num_return_sequences=1, do_sample=True)[0]['generated_text']
            except:
                enriched = "⚠️ Could not generate enriched post."

            enriched_versions.append(enriched.strip())

        return enriched_versions

    def generate_posts(self, prompt, num_options=2):
        original_prompt = prompt
        prompt = self.reformulate_prompt(prompt)

        topic = self.detect_topic_semantic(prompt)
        if not topic:
            topic = self.detect_topic_keywords(prompt)

        if not topic:
            return "❌ Aucun sujet pertinent détecté pour ce prompt."

        print(f"✅ Topic détecté: {topic}")
        prompt_keywords = self.extract_keywords(prompt)
        self.df['score'] = self.df['keywords'].apply(lambda x: len(x & prompt_keywords))

        relevant_posts = self.df[self.df['topic'].str.lower() == topic]
        relevant_posts = relevant_posts.sort_values('score', ascending=False)
        relevant_posts = relevant_posts.drop_duplicates(subset='caption')
        selected_posts = relevant_posts.head(num_options).to_dict('records')

        output = []
        for i, post in enumerate(selected_posts, 1):
            output.append(
                f"\nOption {i}\n"
                f"{post['caption']} {post['hashtags']}\n"
                f"CTA: {post['cta']}\n"
                f"Tone: {post['tone']}"
            )

            enriched_versions = self.enrich_post(post)
            for j, enriched in enumerate(enriched_versions, 1):
                output.append(f"✨ Enriched Version {j}: {enriched.strip()}")

            # Structured enriched post
            structured_post = f"✨ Structured Enriched Post: {enriched_versions[0]}"
            output.append(structured_post)

        return "\n".join(output) if output else "No matching posts found."


# Example usage
generator = SocialPostGenerator("/content/drive/MyDrive/Post_generation/generated_social_posts_large.csv")

prompts = [
    "Launch our AI fashion app for personalized outfits. Write a trendy, stylish caption with hashtags"
]

for prompt in prompts:
    print(f"\n{'='*50}\nPrompt: {prompt}\n{'='*50}")
    print(generator.generate_posts(prompt))


Device set to use cuda:0
Device set to use cuda:0



Prompt: Launch our AI fashion app for personalized outfits. Write a trendy, stylish caption with hashtags
🔁 Reformulated: The AI fashion app is a fashion app that lets you customize your outfits.
📊 Semantic topic scores: {'beauty': 0.18847548961639404, 'fashion': 0.25414615869522095, 'electronics': 0.03696385771036148, 'tech': 0.16923022270202637, nan: 0.0800614058971405}
✅ Topic détecté: fashion

Option 1
Step up your style game with these must-have looks 👗 🧥 #StyleInspo #OOTD #FashionTrends
CTA: See what’s trending!
Tone: trendy
✨ Enriched Version 1: See what's trending and show your style to the world - all in the #OOTD hashtag.
✨ Enriched Version 2: 'Stay stylish with these must-have looks. You're already following them'
✨ Structured Enriched Post: See what's trending and show your style to the world - all in the #OOTD hashtag.

Option 2
Style that speaks louder than words 👠 👗 #Trendy #OOTD #RunwayLook #FashionTrends
CTA: Get inspired!
Tone: trendy
✨ Enriched Version 1: Wear these

In [4]:
prompts = [
    "Introduce our new luxury skincare serum that hydrates and rejuvenates your skin, leaving it glowing and refreshed. Write an alluring caption with hashtags that highlight its benefits."
]

for prompt in prompts:
    print(f"\n{'='*50}\nPrompt: {prompt}\n{'='*50}")
    print(generator.generate_posts(prompt))


Prompt: Introduce our new luxury skincare serum that hydrates and rejuvenates your skin, leaving it glowing and refreshed. Write an alluring caption with hashtags that highlight its benefits.
🔁 Reformulated: Our new luxury skincare serum hydrates and rejuvenates your skin, leaving it glowing and refreshed.
📊 Semantic topic scores: {'beauty': 0.4566115736961365, 'fashion': 0.05920186638832092, 'electronics': 0.06610680371522903, 'tech': 0.11696355044841766, nan: 0.06638158112764359}
✅ Topic détecté: beauty

Option 1
Skincare made simple 🧖 💋 Your routine just got an upgrade. #BeautyTips #MakeupMagic #GlowUp
CTA: Watch the tutorial!
Tone: friendly
✨ Enriched Version 1: Let's get started.
✨ Enriched Version 2: You’re always looking for a way to pamper yourself, but don’t know where to start? Here are some sex-friendly, affordable, and simple ways to make your skin glow.
✨ Structured Enriched Post: Let's get started.

Option 2
Skincare made simple 🌸 🧖 Your routine just got an upgrade. #Glo

In [7]:
prompts = [
    "Unveil our latest AI-powered home automation system that brings smart living to your doorstep. Write an engaging caption with hashtags to showcase its convenience, security, and advanced features."
]

for prompt in prompts:
    print(f"\n{'='*50}\nPrompt: {prompt}\n{'='*50}")
    print(generator.generate_posts(prompt))


Prompt: Unveil our latest AI-powered home automation system that brings smart living to your doorstep. Write an engaging caption with hashtags to showcase its convenience, security, and advanced features.
🔁 Reformulated: We're introducing our latest AI-powered home automation system that brings smart living to your doorstep. Write an engaging caption with hashtags to showcase its convenience, security, and advanced features.
📊 Semantic topic scores: {'beauty': 0.12985464930534363, 'fashion': 0.18974348902702332, 'electronics': 0.2313966602087021, 'tech': 0.2644815444946289, nan: 0.16089162230491638}
✅ Topic détecté: tech

Option 1
Say hello to the future with 📱 💻 The latest in AI is changing the game. #FutureTech #TechNews #Innovation
CTA: Read now!
Tone: informative
✨ Enriched Version 1: New advancements in cognitive computing and artificial intelligence were revealed during the annual session of the Future Tech Innovation Accelerator at ITG in London.
✨ Enriched Version 2: The lates