Reading the Data

In [1]:
import pandas as pd
df = pd.read_csv('social_media_dataset_50000.csv',encoding='latin1')
print(df[['caption', 'hashtags']].head())

                                             caption  \
0  Adventures await in every corner of the world....   
1         Stepping out in style this weekend. (3640)   
2         Stepping out in style this weekend. (5738)   
3         Dream big, work hard, stay focused. (8319)   
4  Smart budgeting leads to financial freedom. (3...   

                                            hashtags  
0  #wanderlust #travel #adventure #vacation #explore  
1  #ootd #instafashion #fashionblogger #style #fa...  
2  #fashion #fashionblogger #instafashion #style ...  
3  #mindset #success #inspiration #motivation #goals  
4  #investing #money #financialfreedom #finance #...  


Cleaning the Captions

In [2]:
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    return ' '.join(tokens)

df['clean_caption'] = df['caption'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maanh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cleaning Hashtag Column

In [3]:
import ast
def parse_hashtags(raw):
    try:
        return ast.literal_eval(raw)
    except:
        return [tag.strip() for tag in raw.split() if tag.startswith('#')]

df['hashtags'] = df['hashtags'].apply(parse_hashtags)

Vectorizing

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer() 
Y = mlb.fit_transform(df['hashtags'])  
hashtag_labels = mlb.classes_         

Tokenizing,Padding and Splitting 

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['clean_caption'])
X = tokenizer.texts_to_sequences(df['clean_caption'])
X_padded = pad_sequences(X, maxlen=50)
X_train, X_test, Y_train, Y_test = train_test_split(X_padded, Y, test_size=0.2, random_state=42)

Model Building

In [17]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

model = Sequential([                                                                                                            
    Embedding(input_dim=10000, output_dim=64, input_length=50),                                                                 
    LSTM(128),                                                                                                                 
    Dense(len(hashtag_labels), activation='sigmoid')                                                                            
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, Y_train, epochs=5, batch_size=32, validation_split=0.1)

Epoch 1/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 42ms/step - accuracy: 0.1072 - loss: 0.2084 - val_accuracy: 0.1803 - val_loss: 0.0027
Epoch 2/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 47ms/step - accuracy: 0.1928 - loss: 0.0019 - val_accuracy: 0.1095 - val_loss: 6.6894e-04
Epoch 3/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 38ms/step - accuracy: 0.1076 - loss: 5.3049e-04 - val_accuracy: 0.1107 - val_loss: 2.7072e-04
Epoch 4/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 38ms/step - accuracy: 0.1034 - loss: 2.2562e-04 - val_accuracy: 0.1095 - val_loss: 1.3094e-04
Epoch 5/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 41ms/step - accuracy: 0.1025 - loss: 1.1174e-04 - val_accuracy: 0.1105 - val_loss: 6.8362e-05


<keras.src.callbacks.history.History at 0x1b3750db350>

Function to make Predictions,Fun Facts and Sentiment analysis

In [18]:
def predict_hashtags(caption):
    cleaned = clean_text(caption)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=50)
    pred = model.predict(padded)[0]

    threshold = 0.5
    result = [hashtag_labels[i] for i, score in enumerate(pred) if score > threshold]
    return result


In [8]:
import random

social_media_facts = [
    "The first hashtag ever used on Twitter was #barcamp in 2007.",
    "Instagram was originally called Burbn, focused on whiskey tasting.",
    "TikTok was the most downloaded app globally in 2021.",
    "Facebook's blue color scheme was chosen because Mark Zuckerberg is red-green colorblind.",
    "The egg photo that broke Instagram records in 2019 was posted just to beat Kylie Jenner’s likes.",
    "LinkedIn is older than Facebook, Twitter, and Instagram — it launched in 2003.",
    "Snapchat’s ghost mascot is named Ghostface Chillah.",
    "The first YouTube video was titled 'Me at the zoo' and posted in 2005.",
    "Over 80% of Pinterest users are women.",
    "Twitter was almost called 'FriendStalker'.",
    "Instagram hit 1 million users just 2 months after launch.",
    "Reddit’s alien mascot is named Snoo.",
    "Facebook’s first banner featured Al Pacino’s face.",
    "1 in 3 people check social media in the middle of the night.",
    "Over 10 billion emojis are sent daily — 😂 is the most popular."
]

def get_random_fact():
    return random.choice(social_media_facts)

In [14]:
positive_words = ["happy", "great", "amazing", "love", "excited", "fun", "joy", "awesome", "good", "fantastic", "beautiful", "smile","best","Smart"]
negative_words = ["sad", "tired", "angry", "bad", "hate", "bored", "upset", "terrible", "worst", "pain", "cry", "lonely"]
def simple_sentiment(text):
    words = text.lower().split()
    pos_count = sum(1 for word in words if word in positive_words)
    neg_count = sum(1 for word in words if word in negative_words)

    if pos_count > neg_count:
        return "Positive"
    elif neg_count > pos_count:
        return "Negative"
    elif pos_count == 0 and neg_count == 0:
        return "Unknown"
    else:
        return "Neutral"

Testing

In [11]:
#Testing without GUI
caption = "Cuddles with my dog makes me happy"
print(predict_hashtags(caption))
sentiment = simple_sentiment(caption)
if sentiment == "Unknown":
    print("🧠 Sentiment: Couldn’t detect mood from caption.")
else:
    print("🧠 Sentiment:", sentiment)
print("💡 Fun Social Media Fact:")
print(get_random_fact())


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step
['#catsofinstagram', '#doglife', '#petlovers', '#pets', '#puppylove']
🧠 Sentiment: Positive
💡 Fun Social Media Fact:
Over 10 billion emojis are sent daily — 😂 is the most popular.


GUI

In [15]:
# ------------------ GUI Setup ------------------
import tkinter as tk
from tkinter import ttk, font
root = tk.Tk()
root.title("✨ Hashtag Predictor by Harleen ✨")
root.geometry("800x550")
root.configure(bg="#2E2E4F")  

# Fonts
title_font = font.Font(family="Poppins", size=26, weight="bold")
subtitle_font = font.Font(family="Helvetica", size=15, weight="bold")
text_font = font.Font(family="Helvetica", size=12,weight="bold")
output_font = font.Font(family="Helvetica", size=12, weight="bold")

# ------------------ Title ------------------
title_label = tk.Label(
    root,
    text="✨ Social Media Hashtag Generator ✨",
    font=title_font,
    bg="#2E2E4F",
    fg="#FFD369"
)
title_label.pack(pady=(15, 10))

# ------------------ Input Frame ------------------
input_frame = tk.Frame(root, bg="#2E2E4F")
input_frame.pack(pady=10)

caption_label = tk.Label(
    input_frame,
    text="Enter your caption below 📸:",
    font=subtitle_font,
    bg="#2E2E4F",
    fg="#FFD369",
    anchor="w"
)
caption_label.pack(anchor="w", padx=8)

caption_entry = tk.Text(
    input_frame,
    height=6,
    width=100,
    font=text_font,
    wrap="word",
    relief="groove",
    borderwidth=2,
    bg="#393E60",
    fg="white",
    insertbackground="white",
)
caption_entry.pack(padx=20, pady=8)

# ------------------ Output Frame ------------------
output_frame = tk.Frame(root, bg="#2E2E4F")
output_frame.pack(pady=5, fill="x")




# Create output boxes & labels
result_label = tk.Label(root,text="🎯 Hashtags:",font=("Helvetica", 12,"bold"),fg="white",bg="#393E60",anchor="w",padx=20,pady=15,bd=2,relief="ridge")
result_label.pack(fill="x", padx=8, pady=5)


sentiment_label = tk.Label(root, text="🧠 Sentiment:", font=("Helvetica", 12,"bold"), fg="white", bg="#393E60", anchor="w", padx=20, pady=15,bd=2,relief="ridge")
sentiment_label.pack(fill="x", padx=8, pady=5)

fact_label = tk.Label(root, text="💡 Fun fact:", font=("Helvetica", 12,"bold"), fg="white", bg="#393E60", anchor="w", padx=20, pady=15,bd=2,relief="ridge")
fact_label.pack(fill="x", padx=8, pady=5)

# ------------------ Functions ------------------
def on_predict():
    caption = caption_entry.get("1.0", tk.END).strip()
    if caption:
        hashtags = predict_hashtags(caption)
        result = ", ".join(hashtags) if hashtags else "No hashtags predicted 🤷‍♀️"
        result_label.config(text=f"🎯 Hashtags: {result}")

        sentiment = simple_sentiment(caption)
        if sentiment == "Unknown":
            sentiment_label.config(text="🧠 Sentiment: Couldn’t detect mood from caption.")
        else:
            sentiment_label.config(text=f"🧠 Sentiment: {sentiment}")

        fact = get_random_fact()
        fact_label.config(text=f"💡 Social Media Fun Fact: {fact}")
        status_bar.config(text="✅ Prediction complete!")

def clear_all():
    caption_entry.delete("1.0", tk.END)
    result_label.config(text="")
    sentiment_label.config(text="")
    fact_label.config(text="")
    status_bar.config(text="🧹 Cleared. Ready for a new caption!")

# ------------------ Button Styles ------------------
style = ttk.Style()
style.theme_use("clam")
style.configure(
    "My.TButton",
    foreground="black",
    background="#FFD369",
    padding=10,
    font=("Helvetica", 12, "bold")
)
style.map(
    "My.TButton",
    background=[("active", "#E1B74A")]
)

# ------------------ Buttons ------------------
button_frame = tk.Frame(root, bg="#2E2E4F")
button_frame.pack(pady=10)

predict_btn = ttk.Button(button_frame, text="Generate Hashtags", command=on_predict, style="My.TButton")
predict_btn.grid(row=0, column=0, padx=10)

clear_btn = ttk.Button(button_frame, text="Clear", command=clear_all, style="My.TButton")
clear_btn.grid(row=0, column=1, padx=10)

# ------------------ Status Bar ------------------
status_bar = tk.Label(
    root,
    text="Ready",
    bd=1,
    relief="sunken",
    anchor="w",
    bg="#393E60",
    fg="white",
    font=("Helvetica", 10)
)
status_bar.pack(side="bottom", fill="x")

# ------------------ Run App ------------------
root.mainloop()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
