
# Social Media Trends — Step-by-Step Notebook

This notebook walks through:
1. Loading & cleaning your dataset  
2. Feature engineering (time features, engagement rate)  
3. Hashtag **time-series** analysis  
4. Hashtag **co-occurrence graph**  
5. **Region × Platform** heatmap (by engagement)  
6. Baseline **ML** model to predict `Engagement_Level`  

> **Assumed columns:**  
`['Post_ID','Post_Date','Platform','Hashtag','Content_Type','Region','Views','Likes','Shares','Comments','Engagement_Level']`


## 0. Setup

In [None]:

# If networkx or scikit-learn is not installed, uncomment the next line
# (%pip installs into the environment of the running kernel):
# %pip install networkx scikit-learn

import os, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# networkx is an optional dependency: start from "not available" and only
# overwrite the placeholder when the import succeeds. Later cells check
# `nx is None` to decide whether the co-occurrence graph can be drawn.
nx = None
try:
    import networkx as nx
except Exception:
    print("[Warning] networkx not available; co-occurrence graph will be skipped.")


## 1. Load your data

In [None]:

# ⬇️ Point this at your own CSV file before running
CSV_PATH = "path/to/your.csv"  # e.g., "data/social_trends.csv"
DATE_COL = "Post_Date"

# --- Read the raw table ---
df = pd.read_csv(CSV_PATH)

# Parse the date column in two steps: first as UTC timestamps (entries that
# cannot be parsed become NaT), then strip the timezone so the column is tz-naive.
if DATE_COL in df.columns:
    parsed_utc = pd.to_datetime(df[DATE_COL], errors="coerce", utc=True)
    df[DATE_COL] = parsed_utc.dt.tz_localize(None)

print("Rows:", len(df))
df.head()


## 2. Basic cleaning & preview

In [None]:

# Drop rows whose date is missing or could not be parsed (NaT)
if DATE_COL in df.columns:
    df = df.dropna(subset=[DATE_COL])

# Ensure numeric types for metrics; values that cannot be parsed become NaN
for c in ["Views","Likes","Shares","Comments"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

print(df.dtypes)
# Fixed seed so the preview shows the same rows on every Restart & Run All
# (42 matches the seed used by the graph layout and the model further down).
df.sample(min(5, len(df)), random_state=42)


## 3. Feature engineering

In [None]:

# Calendar features derived from the post timestamp
if DATE_COL in df.columns:
    stamp = df[DATE_COL].dt
    df["Year"] = stamp.year
    df["Month"] = stamp.month
    df["Week"] = stamp.isocalendar().week.astype(int)  # ISO week number
    df["Weekday"] = stamp.dayofweek
    df["Hour"] = stamp.hour

# Make sure every metric column exists; absent ones are created as all-zero
for metric in ("Views","Likes","Shares","Comments"):
    if metric not in df.columns:
        df[metric] = 0

# Engagement rate = (likes + shares + comments) / views.
# Missing interaction counts are treated as 0; zero views are turned into NaN
# so the division yields NaN for them, which is then reported as a rate of 0.0.
interactions = df["Likes"].fillna(0) + df["Shares"].fillna(0) + df["Comments"].fillna(0)
views_nonzero = df["Views"].replace(0, np.nan)
df["Engagement_Rate"] = (interactions / views_nonzero).fillna(0.0)

df.head()


## 4. Hashtag time-series (top-N)

In [None]:

# Parameters
RESAMPLE = "W"     # 'D','W','M'
TOP_HASHTAGS = 6   # change as needed
METRIC = "Engagement_Rate" if "Engagement_Rate" in df.columns else "Likes"

# Plot the mean of METRIC per resampling period for the most frequent hashtags.
# NOTE(review): each cell value of "Hashtag" is treated as ONE tag here, while the
# co-occurrence section splits the same column on commas — confirm which format the data uses.
if "Hashtag" in df.columns and DATE_COL in df.columns:
    dft = df.dropna(subset=["Hashtag"]).copy()
    dft["Hashtag"] = dft["Hashtag"].astype(str).str.strip().str.lower()

    # Most frequent tags first; this order also drives the legend order
    top_tags = dft["Hashtag"].value_counts().head(TOP_HASHTAGS).index.tolist()

    if not top_tags:
        # Nothing to draw: avoid an empty figure with an empty legend
        print("No hashtags found — skipping time-series.")
    else:
        fig, ax = plt.subplots(figsize=(10,6))
        for tag in top_tags:
            temp = dft[dft["Hashtag"] == tag].set_index(DATE_COL).sort_index()
            ts = temp[METRIC].resample(RESAMPLE).mean()
            ax.plot(ts.index, ts.values, label=tag)

        ax.set_title(f"{METRIC} over time — Top {len(top_tags)} hashtags (resample={RESAMPLE})")
        ax.set_xlabel("Date")
        ax.set_ylabel(METRIC)
        ax.legend()
        plt.show()
else:
    # Report the configured date column name rather than a hard-coded one
    print(f"Hashtag or {DATE_COL} column missing — skipping time-series.")


## 5. Hashtag co-occurrence graph

In [None]:

if "Hashtag" not in df.columns:
    print("No 'Hashtag' column — skipping co-occurrence graph.")
elif nx is None:
    print("networkx not installed — skipping co-occurrence graph.")
else:
    from collections import Counter
    from itertools import combinations

    # Assume comma-separated hashtags per post. Single hashtags are okay (edges require ≥2 per post).
    # Rows without a hashtag are dropped first so they are not turned into the literal tag "nan".
    tags_series = (
        df["Hashtag"].dropna().astype(str)
        .str.lower()
        .str.replace(" ", "", regex=False)
        .str.split(",")
    )

    # Count, for every unordered pair of distinct tags, the number of posts containing both.
    # Sorting the de-duplicated tags makes each pair appear as (smaller, larger), so
    # (a, b) and (b, a) are never counted separately.
    edge_counter = Counter()
    for tags in tags_series:
        unique = sorted({t for t in tags if t})  # drop empty fragments such as a trailing comma
        edge_counter.update(combinations(unique, 2))

    # Build graph with a simple threshold for readability
    MIN_EDGE_WEIGHT = 2
    G = nx.Graph()
    for (a, b), w in edge_counter.items():
        if w >= MIN_EDGE_WEIGHT:
            G.add_edge(a, b, weight=w)

    if len(G.nodes) == 0:
        print("No co-occurring hashtag pairs ≥ threshold; adjust MIN_EDGE_WEIGHT.")
    else:
        deg = dict(G.degree())
        # Node size grows with the number of neighbours; edge width grows with the pair count
        node_sizes = [50 + 20*deg[n] for n in G.nodes()]
        pos = nx.spring_layout(G, seed=42)  # fixed seed keeps the layout reproducible

        plt.figure(figsize=(12,8))
        nx.draw_networkx_nodes(G, pos, node_size=node_sizes)
        nx.draw_networkx_edges(G, pos, width=[0.5 + 0.3*G[u][v]['weight'] for u, v in G.edges()])
        # Only label nodes with at least two neighbours to keep the plot legible
        labels = {n: n if deg[n] >= 2 else "" for n in G.nodes()}
        nx.draw_networkx_labels(G, pos, labels=labels, font_size=8)
        plt.title("Hashtag Co-occurrence Graph (labels shown for degree ≥ 2)")
        plt.axis("off")
        plt.show()


## 6. Region × Platform heatmap (mean engagement)

In [None]:

if not (("Region" in df.columns) and ("Platform" in df.columns)):
    print("Region or Platform missing — skipping heatmap.")
else:
    # Prefer the engineered rate; fall back to raw likes; None when neither column exists
    agg = next((name for name in ("Engagement_Rate", "Likes") if name in df.columns), None)
    if agg is not None:
        # Mean of the chosen metric per (Region, Platform); combinations with no rows show as 0.0
        pivot = (
            df.pivot_table(index="Region", columns="Platform", values=agg, aggfunc="mean")
            .fillna(0.0)
        )
        n_regions, n_platforms = pivot.shape

        fig, ax = plt.subplots(figsize=(10,6))
        im = ax.imshow(pivot.values, aspect="auto")

        # One tick per platform (x) and per region (y), labelled with the pivot's own labels
        ax.set_xticks(range(n_platforms))
        ax.set_xticklabels(pivot.columns, rotation=45, ha="right")
        ax.set_yticks(range(n_regions))
        ax.set_yticklabels(pivot.index)
        ax.set_title(f"{agg} — Region × Platform (mean)")

        cbar = fig.colorbar(im, ax=ax, shrink=0.8)
        cbar.set_label(agg)
        plt.tight_layout()
        plt.show()
    else:
        print("No numeric engagement column found; skipping heatmap.")


## 7. ML: Predict `Engagement_Level` (baseline Random Forest)

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Baseline classifier: one-hot encoded categoricals + scaled numerics -> random forest,
# evaluated on a 20% hold-out split with a classification report and a confusion matrix.
if "Engagement_Level" not in df.columns:
    print("No 'Engagement_Level' column — skipping ML.")
else:
    cat_cols = [c for c in ["Platform","Content_Type","Region","Hashtag"] if c in df.columns]
    num_cols = [c for c in ["Views","Likes","Shares","Comments","Engagement_Rate","Hour","Weekday","Month"] if c in df.columns]

    # Rows without a label cannot be used for supervised learning; without this filter
    # astype(str) would turn missing labels into a spurious "nan" class.
    labelled = df[df["Engagement_Level"].notna()]
    X = labelled[cat_cols + num_cols].copy()
    y = labelled["Engagement_Level"].astype(str)

    # NOTE(review): the raw metrics and Engagement_Rate are used as features. If
    # Engagement_Level was derived from these same metrics, the scores below will be
    # optimistic (target leakage) — confirm how the label was produced.

    # Reduce hashtag cardinality: keep the 50 most frequent values, bucket the rest
    # (including missing hashtags) under "_other_".
    if "Hashtag" in X.columns:
        freq = X["Hashtag"].value_counts()  # NaN is excluded, so it cannot take a top-50 slot
        top = set(freq.head(50).index)
        X["Hashtag"] = X["Hashtag"].where(X["Hashtag"].isin(top), other="_other_")

    # Metric columns were coerced with errors="coerce" and may therefore contain NaN;
    # impute with the training median inside the pipeline so the forest never sees NaN.
    num_pipe = Pipeline([
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ])

    pre = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", num_pipe, num_cols)
    ])

    clf = Pipeline([
        ("pre", pre),
        ("rf", RandomForestClassifier(n_estimators=300, random_state=42, class_weight="balanced"))
    ])

    # Stratified splitting needs at least two rows per class; fall back to a plain
    # random split instead of raising when a class is too rare.
    stratify_on = y if y.value_counts().min() >= 2 else None
    if stratify_on is None:
        print("[Note] At least one Engagement_Level class has fewer than 2 rows; splitting without stratification.")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=stratify_on
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("[Classification report]")
    print(classification_report(y_test, y_pred))

    # Fix the label order so rows/columns of the matrix line up with the tick labels
    labels = sorted(y.unique())
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    fig, ax = plt.subplots(figsize=(8,6))
    im = ax.imshow(cm, aspect="auto")
    ax.set_xticks(range(len(labels))); ax.set_xticklabels(labels, rotation=45, ha="right")
    ax.set_yticks(range(len(labels))); ax.set_yticklabels(labels)
    ax.set_xlabel("Predicted"); ax.set_ylabel("True")
    ax.set_title("Confusion Matrix — Engagement_Level")
    fig.colorbar(im, ax=ax, shrink=0.8)
    plt.tight_layout()
    plt.show()
