In [143]:

# INSTAPER: END-TO-END AUTOMATION 


import os
import re
from collections import Counter

import joblib
import pandas as pd
import requests
from bs4 import BeautifulSoup


# LOAD ML ARTIFACTS

# Model object; instaper_pipeline calls its .predict() on the feature frame.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so these
# artifact files must come from a trusted source.
ml_model = joblib.load("instaper_personality_model.pkl")
# Column names of the base feature vector; hashtags_to_features iterates
# them and uses them to select/order the columns of the returned frame.
feature_columns = joblib.load("feature_columns.pkl")
# Encoder whose .inverse_transform() turns the predicted code into a label.
label_encoder = joblib.load("label_encoder.pkl")


# LOAD HASHTAG LOOKUP TABLE

# Read the hashtag table and normalise its "hashtag" column to the same form
# the extractor produces: lowercase, no "#", no surrounding whitespace.
hashtag_df = pd.read_csv("Hashtag_Table.csv")
hashtag_df["hashtag"] = (
    hashtag_df["hashtag"]
    .astype(str)
    .str.lower()
    # Drop the "#" before stripping; stripping first would leave behind any
    # whitespace that sat between the "#" and the tag text (e.g. "# travel").
    .str.replace("#", "", regex=False)
    .str.strip()
)

# Lookup keyed by normalised hashtag, exposing only the two columns that
# feature building reads. The index may contain repeated hashtags.
hashtag_lookup = hashtag_df.set_index("hashtag")[["category", "sentiment"]]


# EXTRACT HASHTAGS FROM REEL

# A hashtag is "#" followed by one or more word characters (letters, digits,
# underscore); the capture group excludes the "#" itself.
_HASHTAG_RE = re.compile(r"#(\w+)")


def extract_hashtags_from_reel(url):
    """Fetch a reel page and return the hashtags found in its caption.

    The caption is read from the page's ``og:description`` meta tag. Every
    ``#tag`` occurrence is returned lowercased and without the leading
    ``#``, in order of appearance; repeated hashtags are kept. Hashtags
    glued to other text or to each other (``"#a#b"``) are split correctly
    and surrounding punctuation is not included.

    Args:
        url: Address of the reel page to download.

    Returns:
        A list of hashtag strings. The list is empty when the request
        fails, the server answers with an HTTP error status, the page has
        no ``og:description`` tag, or the caption contains no hashtags.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx answers as "nothing to extract" instead of parsing
        # an error page.
        response.raise_for_status()
    except requests.RequestException:
        # Best-effort scrape: any network/HTTP problem yields no hashtags.
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    desc = soup.find("meta", property="og:description")
    if desc is None:
        return []

    caption = desc.get("content") or ""
    return [tag.lower() for tag in _HASHTAG_RE.findall(caption)]


# HASHTAGS → FEATURES

def hashtags_to_features(hashtags):
    """Build a one-row feature frame from a list of hashtags.

    Each hashtag present in ``hashtag_lookup`` adds one to the
    ``cat_<category>`` counter and one to the ``sent_<sentiment>`` counter
    of its lookup entry. Hashtags absent from the lookup are ignored.

    Args:
        hashtags: Iterable of normalised hashtag strings.

    Returns:
        A single-row ``pandas.DataFrame`` whose columns are exactly
        ``feature_columns``; features with no matching hashtag are 0.
    """
    tallies = Counter()

    for tag in hashtags:
        if tag not in hashtag_lookup.index:
            continue

        match = hashtag_lookup.loc[tag]
        # .loc returns a DataFrame when the tag appears more than once in
        # the index; in that case only the first entry is used.
        repeated = isinstance(match, pd.DataFrame)
        category = match["category"].iloc[0] if repeated else match["category"]
        sentiment = match["sentiment"].iloc[0] if repeated else match["sentiment"]

        tallies[f"cat_{category}"] += 1
        tallies[f"sent_{sentiment}"] += 1

    # Counters for keys outside feature_columns are dropped here.
    row = {column: tallies.get(column, 0) for column in feature_columns}
    return pd.DataFrame([row])[feature_columns]


# STORE RESULT
def store_result(record):
    """Append one result record as a row of ``instaper_results.csv``.

    The header row is written when the file does not exist yet or exists
    but is empty, so the CSV always starts with column names.

    Args:
        record: Mapping of column name to value for the row to append.
    """
    file = "instaper_results.csv"
    # A zero-length file (e.g. created by ``touch``) still needs a header.
    needs_header = not os.path.exists(file) or os.path.getsize(file) == 0
    pd.DataFrame([record]).to_csv(
        file,
        mode="a",
        header=needs_header,
        index=False,
    )

# MAIN PIPELINE
# Weights of the category/sentiment counters that make up each derived trait
# score. Insertion order is the order in which the trait columns are appended
# to the feature frame, so keep it unchanged.
_TRAIT_WEIGHTS = {
    "openness": {
        "cat_Travel & Nature": 1,
        "cat_Art & Creativity": 1,
        "cat_Fashion": 0.5,
        "cat_Technology & Programming": 1,
        "cat_Education": 0.5,
    },
    "extraversion": {
        "cat_Entertainment": 1,
        "cat_Gaming": 1,
        "cat_Sports": 1,
        "cat_Fashion": 0.5,
    },
    "agreeableness": {
        "cat_Pets & Animals": 1,
        "cat_Relationships": 1,
        "cat_Beauty & Personal Care": 0.5,
    },
    "conscientiousness": {
        "cat_Fitness & Health": 1,
        "cat_Business & Finance": 1,
        "cat_Education": 1,
        "cat_Motivation & Self-Improvement": 0.5,
    },
    "neuroticism": {
        "sent_Negative": 1,
        "cat_Motivation & Self-Improvement": 0.5,
    },
}


def _add_trait_scores(X):
    """Append one weighted-sum column per trait in ``_TRAIT_WEIGHTS`` to X."""
    for trait, weights in _TRAIT_WEIGHTS.items():
        X[trait] = sum(weight * X[column] for column, weight in weights.items())
    return X


def instaper_pipeline(reel_url, user_id):
    """Run the full flow for one reel: scrape, featurise, predict, store.

    Steps: extract hashtags from the reel page, turn them into the base
    feature frame, append the five derived trait scores, predict with
    ``ml_model``, decode the prediction with ``label_encoder`` and append
    the result to the results CSV via ``store_result``.

    Args:
        reel_url: Address of the reel to analyse.
        user_id: Accepted for the caller's convenience; it is currently not
            used and not written to the stored record.

    Returns:
        The stored record: a dict with keys ``"reel_url"``, ``"hashtags"``
        (comma-joined) and ``"predicted_personality_label"``.

    Raises:
        KeyError: If the feature frame lacks a column referenced by
            ``_TRAIT_WEIGHTS``.
    """
    print("🔍 Extracting hashtags...")
    hashtags = extract_hashtags_from_reel(reel_url)
    print("Hashtags:", hashtags)

    print("📊 Creating ML features...")
    X = _add_trait_scores(hashtags_to_features(hashtags))
    print("Feature vector:\n", X)

    print("🤖 Predicting personality (ML)...")
    # X has a single row, so the first prediction is the only one.
    pred_encoded = ml_model.predict(X)[0]
    label = label_encoder.inverse_transform([pred_encoded])[0]

    record = {
        "reel_url": reel_url,
        "hashtags": ",".join(hashtags),
        "predicted_personality_label": label,
    }
    store_result(record)
    return record

if __name__ == "__main__":
    # Demo run: analyse one hard-coded reel and show the stored record.
    user_id = 1
    reel_url = "https://www.instagram.com/reel/DOcwI9mCLjv/?igsh=MTk5ZnM3NXI3cDlrcQ=="

    output = instaper_pipeline(reel_url=reel_url, user_id=user_id)
    print("\nFINAL OUTPUT:\n", output)


🔍 Extracting hashtags...
Hashtags: []
📊 Creating ML features...
Feature vector:
    cat_Pets & Animals  cat_Fitness & Health  cat_Other  sent_Neutral  \
0                   0                     0          0             0   

   cat_Technology & Programming  cat_Relationships  cat_Art & Creativity  \
0                             0                  0                     0   

   cat_Home & Interior  cat_Fashion  cat_Education  ...  cat_Entertainment  \
0                    0            0              0  ...                  0   

   cat_News & Politics  cat_Sports  cat_Gaming  sent_Negative  openness  \
0                    0           0           0              0       0.0   

   extraversion  agreeableness  conscientiousness  neuroticism  
0           0.0            0.0                0.0          0.0  

[1 rows x 27 columns]
🤖 Predicting personality (ML)...

FINAL OUTPUT:
 {'reel_url': 'https://www.instagram.com/reel/DOcwI9mCLjv/?igsh=MTk5ZnM3NXI3cDlrcQ==', 'hashtags': '', 