In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Mount Google Drive into the Colab filesystem so the dataset folder can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
import pandas as pd
import os

# Folder on the mounted Drive that holds the dataset CSV files.
data_path = "/content/drive/MyDrive/project dataset/"

# Keep only the directory entries whose name ends in ".csv"
# (in whatever order os.listdir returns them).
files = list(filter(lambda name: name.endswith(".csv"), os.listdir(data_path)))
print("CSV files found:", files)

CSV files found: ['pakistan_today.csv', 'daily_times.csv', 'tribune.csv', 'combined_dataset.csv', 'dawn.csv', 'business_recorder.csv']


In [5]:
# Collect every calendar year that appears in the 'date' column of any CSV.
# Files are streamed in chunks and only the 'date' column is parsed.
years_found = set()
for file in files:
    file_path = os.path.join(data_path, file)
    print(f"Scanning {file}...")
    reader = pd.read_csv(file_path, chunksize=50000, encoding="latin-1", usecols=['date'])
    for chunk in reader:
        # Unparseable dates become NaT and are dropped before taking the year.
        parsed_dates = pd.to_datetime(chunk['date'], errors='coerce')
        years_found.update(parsed_dates.dt.year.dropna().unique())

years_found = sorted(int(y) for y in years_found if pd.notna(y))
print("\nYears found in dataset:", years_found)

Scanning pakistan_today.csv...
Scanning daily_times.csv...
Scanning tribune.csv...
Scanning combined_dataset.csv...
Scanning dawn.csv...
Scanning business_recorder.csv...

Years found in dataset: [2020, 2021, 2022, 2023]


In [6]:
# Preview the combined dataset without loading the whole file into memory.
file_name = 'combined_dataset.csv'
combined_file_path = os.path.join(data_path, file_name)

chunk_size = 50000

# Only the first chunk is needed here. Using the chunked reader as a context
# manager closes the underlying file handle once that chunk has been pulled,
# instead of leaving a half-consumed reader open for the rest of the session.
with pd.read_csv(combined_file_path, chunksize=chunk_size, encoding='latin-1') as csv_chunks:
    first_chunk = next(csv_chunks)

print(f"Displaying the head of the first chunk from '{file_name}':")

display(first_chunk.head())

Displaying the head of the first chunk from 'combined_dataset.csv':


Unnamed: 0,headline,date,link,source,categories,description,mapped_categories
0,CJP to hear Karak temple attack case on Tuesda...,12/31/2020 14:36,https://www.pakistantoday.com.pk/2020/12/31/14...,Pakistan Today,national,ISLAMABAD: Chief Justice of Pakistan (CJP...,National
1,NAB secures 14-day remand of Asif,12/31/2020 15:30,https://www.pakistantoday.com.pk/2020/12/31/na...,Pakistan Today,national,LAHORE: An accountability in Lahore on Thurs...,National
2,PMC response sought in license case,12/31/2020 16:37,https://www.pakistantoday.com.pk/2020/12/31/pm...,Pakistan Today,national,ISLAMABAD: The Islamabad High Court (IHC) has...,National
3,Saudi-led coalition strikes at Yemen capital a...,12/31/2020 16:47,https://www.pakistantoday.com.pk/2020/12/31/sa...,Pakistan Today,world,ADEN: Saudi-led coalition warplanes struck tar...,World
4,"Asif, another PML-N leader asked to quit party...",12/31/2020 17:11,https://www.pakistantoday.com.pk/2020/12/31/as...,Pakistan Today,national,LAHORE: Former National Assembly (NA) speaker ...,National


In [7]:
! pip install streamlit

Collecting streamlit
  Downloading streamlit-1.52.1-py3-none-any.whl.metadata (9.8 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.52.1-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m78.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m112.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.52.1


In [14]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import re
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import os

# ------------------------------------------------------
# PAGE CONFIG
# ------------------------------------------------------
# Browser-tab title and wide page layout for the dashboard.
page_settings = {"page_title": "News Topic Evolution", "layout": "wide"}
st.set_page_config(**page_settings)

# Page heading and one-line description shown at the top of the app.
st.title("News Topic Evolution and Forecasting Dashboard")
st.markdown("Topic modeling with temporal forecasting using XGBoost regression")

# ------------------------------------------------------
# MOUNTED DRIVE PATH
# ------------------------------------------------------
# Make sure you have already mounted your drive in Colab
# from google.colab import drive
# drive.mount('/content/drive')

# Location of the combined CSV on the mounted Google Drive.
file_name = "combined_dataset.csv"
data_path = "/content/drive/MyDrive/project dataset/"
combined_file_path = os.path.join(data_path, file_name)

# Number of CSV rows read per chunk when loading the data.
chunk_size = 50_000

# ------------------------------------------------------
# LOAD DATA FUNCTION
# ------------------------------------------------------
@st.cache_data
def load_data(file_path, chunk_size=50000):
    """Load the news CSV in chunks and return a cleaned DataFrame.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read (decoded as latin-1).
    chunk_size : int, default 50000
        Number of rows parsed per chunk.

    Returns
    -------
    pandas.DataFrame
        Columns 'headline', 'description', 'date' (parsed to datetime),
        'mapped_categories' (whitespace-stripped) and 'text'
        (headline + " " + description). Rows with a missing value in any of
        the four source columns, or with an unparseable date, are dropped.
        An empty frame with those columns is returned when no rows are read.
    """
    required_cols = ['headline', 'description', 'date', 'mapped_categories']
    cleaned_chunks = []
    # usecols skips parsing the columns that would be discarded anyway; the
    # context manager closes the file handle even if a chunk fails to parse.
    with pd.read_csv(
        file_path,
        chunksize=chunk_size,
        encoding="latin-1",
        usecols=required_cols,
    ) as reader:
        for chunk in reader:
            # .assign builds a new frame, so no column is written into a slice
            # of the chunk (which is what raised SettingWithCopyWarning before).
            chunk = chunk[required_cols].assign(
                date=pd.to_datetime(chunk['date'], errors='coerce')
            )
            cleaned_chunks.append(chunk.dropna(subset=required_cols))

    if not cleaned_chunks:
        # pd.concat raises on an empty list; hand back an empty, well-formed frame.
        return pd.DataFrame(columns=required_cols + ['text'])

    df = pd.concat(cleaned_chunks, ignore_index=True)
    df['mapped_categories'] = df['mapped_categories'].str.strip()
    df['text'] = df['headline'] + " " + df['description']
    return df

# Load (or fetch from Streamlit's cache) the cleaned article table.
df = load_data(combined_file_path, chunk_size)

# ------------------------------------------------------
# SIDEBAR CONFIGURATION
# ------------------------------------------------------
sidebar = st.sidebar

# Text-processing and topic-model switches.
sidebar.header("Text & Topic Settings")
USE_PREPROCESSING = sidebar.checkbox("Enable Text Preprocessing", value=True)
USE_NMF = sidebar.checkbox("Enable Topic Modeling (NMF)", value=True)
NUM_TOPICS = sidebar.slider("Number of Topics", min_value=2, max_value=8, value=3)

# Which feature groups the regression model is given.
sidebar.header("Time Series Features")
USE_TIME_FEATURES = sidebar.checkbox("Use Calendar Features", value=True)
USE_LAGS = sidebar.checkbox("Use Lag Features", value=True)
USE_ROLLING = sidebar.checkbox("Use Rolling Mean", value=True)

# Train/test split fraction and number of future days to forecast.
TRAIN_SPLIT = sidebar.slider("Train Split Ratio", min_value=0.6, max_value=0.9, value=0.8)
FORECAST_DAYS = sidebar.slider("Forecast Horizon (Days)", min_value=3, max_value=14, value=7)

# ------------------------------------------------------
# TEXT PREPROCESSING
# ------------------------------------------------------
def clean_text(text):
    """Normalise a piece of text for vectorisation.

    The value is converted to a lowercase string, then, in order: URL-like
    tokens (starting with "http" or "www") are removed, every character that
    is not a-z or whitespace is removed, and whitespace runs are collapsed to
    a single space. Leading and trailing whitespace is stripped.
    """
    # (pattern, replacement) pairs, applied in this exact order.
    substitutions = (
        (r"http\S+|www\S+", ""),
        (r"[^a-z\s]", ""),
        (r"\s+", " "),
    )
    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Optionally normalise every article's text before vectorisation.
if USE_PREPROCESSING:
    df['text'] = df['text'].map(clean_text)

# ------------------------------------------------------
# TF-IDF + NMF
# ------------------------------------------------------
if USE_NMF:
    # The TF-IDF matrix is only consumed by NMF (and by the topic display,
    # which is also gated on USE_NMF), so it is fitted only when topic
    # modeling is enabled instead of on every rerun of the script.
    vectorizer = TfidfVectorizer(max_features=1500, stop_words="english")
    tfidf = vectorizer.fit_transform(df['text'])

    # Factorise into NUM_TOPICS components; each article is assigned the
    # component with the largest weight.
    nmf = NMF(n_components=NUM_TOPICS, random_state=42)
    topic_matrix = nmf.fit_transform(tfidf)
    df['topic'] = topic_matrix.argmax(axis=1)
else:
    # Topic modeling disabled: every article goes into a single topic 0.
    df['topic'] = 0

# ------------------------------------------------------
# SHOW TOPICS
# ------------------------------------------------------
# One column per topic, each listing that topic's ten highest-weighted terms.
if USE_NMF:
    st.header("Discovered Topics")
    feature_names = vectorizer.get_feature_names_out()
    cols = st.columns(NUM_TOPICS)
    for i, (col, weights) in enumerate(zip(cols, nmf.components_)):
        # Indices of the largest weights, in descending order of weight.
        top_idx = weights.argsort()[::-1][:10]
        top_words = ", ".join(feature_names[j] for j in top_idx)
        col.markdown(f"**Topic {i}**")
        col.write(top_words)

# ------------------------------------------------------
# DAILY AGGREGATION
# ------------------------------------------------------
# Count articles per (day, category, topic), then restore 'day' to datetime
# and order the rows by day.
df['day'] = df['date'].dt.date
ts_df = (
    df.groupby(['day', 'mapped_categories', 'topic'])
    .size()
    .reset_index(name='count')
    .assign(day=lambda frame: pd.to_datetime(frame['day']))
    .sort_values('day')
)

# ------------------------------------------------------
# SIDEBAR FILTERS
# ------------------------------------------------------
st.sidebar.header("Data Filters")
category = st.sidebar.selectbox("Select Category", sorted(ts_df['mapped_categories'].unique()))
topic_id = st.sidebar.selectbox("Select Topic", sorted(ts_df['topic'].unique()))

# Daily counts for the selected category/topic pair.
filtered_ts = ts_df[
    (ts_df['mapped_categories'] == category) &
    (ts_df['topic'] == topic_id)
]

# The aggregation only emits rows for days that had at least one article, so
# the series has gaps. Downstream lag/rolling features shift by *rows* and the
# forecast steps forward one calendar day at a time; both are only meaningful
# on a gap-free daily series. Fill the missing days with a count of 0.
if not filtered_ts.empty:
    all_days = pd.date_range(filtered_ts['day'].min(), filtered_ts['day'].max(), freq='D')
    filtered_ts = (
        filtered_ts.set_index('day')
        .reindex(all_days)
        .rename_axis('day')
        .reset_index()
    )
    filtered_ts['count'] = filtered_ts['count'].fillna(0).astype(int)
    # Re-stamp the key columns on the inserted (previously missing) days.
    filtered_ts['mapped_categories'] = category
    filtered_ts['topic'] = topic_id

# ------------------------------------------------------
# FEATURE ENGINEERING
# ------------------------------------------------------
def create_features(df, use_time_features=None, use_lags=None, use_rolling=None):
    """Add the model's feature columns to a daily count series.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime 'day' column and a 'count' column. Rows are
        assumed to be ordered oldest-first (lags and windows are row-based).
    use_time_features, use_lags, use_rolling : bool or None
        Switches for the calendar, lag and rolling-mean feature groups. When
        left as None the corresponding module-level sidebar flag
        (USE_TIME_FEATURES, USE_LAGS, USE_ROLLING) is used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected feature columns appended; rows that
        contain any NaN (the warm-up rows of the lags/window) are dropped.
        The input frame is not modified.
    """
    if use_time_features is None:
        use_time_features = USE_TIME_FEATURES
    if use_lags is None:
        use_lags = USE_LAGS
    if use_rolling is None:
        use_rolling = USE_ROLLING

    df = df.copy()
    if use_time_features:
        df['dayofweek'] = df['day'].dt.dayofweek
        df['day_num'] = df['day'].dt.day
        df['month'] = df['day'].dt.month
    if use_lags:
        df['lag_1'] = df['count'].shift(1)
        df['lag_7'] = df['count'].shift(7)
    if use_rolling:
        # Shift by one row before averaging so the window covers the 7 rows
        # *preceding* the current one. Without the shift the current row's
        # count (the regression target) is part of its own feature, which
        # leaks the answer during training and does not match how the
        # forecast loop computes this feature from past counts only.
        df['rolling_mean_7'] = df['count'].shift(1).rolling(7).mean()
    return df.dropna()

# ------------------------------------------------------
# TRAIN XGBOOST
# ------------------------------------------------------
def train_xgb(df):
    """Fit an XGBoost regressor on the engineered features of ``df``.

    The first TRAIN_SPLIT fraction of the feature rows is used for training
    and the remaining rows are held out; rows are not shuffled.

    Returns a tuple ``(model, df_feat, y_test, preds)``: the fitted model, the
    full feature frame, the held-out target values and the model's
    predictions for the held-out rows.
    """
    df_feat = create_features(df)

    # Everything except the identifying columns and the target is a feature.
    non_feature_cols = ['day', 'mapped_categories', 'topic', 'count']
    features = df_feat.drop(columns=non_feature_cols)
    target = df_feat['count']

    cutoff = int(len(df_feat) * TRAIN_SPLIT)
    X_train, X_test = features.iloc[:cutoff], features.iloc[cutoff:]
    y_train, y_test = target.iloc[:cutoff], target.iloc[cutoff:]

    model = XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=5, random_state=42)
    model.fit(X_train, y_train)
    return model, df_feat, y_test, model.predict(X_test)

# ------------------------------------------------------
# MODEL PERFORMANCE
# ------------------------------------------------------
st.header("Model Performance")

# Bail out early when the selected series has fewer than 30 rows.
if filtered_ts.shape[0] < 30:
    st.warning("Not enough data for this topic-category combination.")
    st.stop()

model, df_model, y_test, preds = train_xgb(filtered_ts)

# Hold-out error metrics, shown side by side.
mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))

col1, col2 = st.columns(2)
for column, (label, value) in zip((col1, col2), (("MAE", mae), ("RMSE", rmse))):
    column.metric(label, f"{value:.2f}")

# ------------------------------------------------------
# ACTUAL VS PREDICTED
# ------------------------------------------------------
st.subheader("Actual vs Predicted")

# The held-out rows are the last len(y_test) rows of the feature frame.
test_dates = df_model['day'].iloc[-len(y_test):]

axis_labels = {'x':'Date','y':'Topic Count'}
fig_pred = px.line(x=test_dates, y=y_test, labels=axis_labels)
fig_pred.add_scatter(x=test_dates, y=preds, mode='lines', name='Predicted')
st.plotly_chart(fig_pred, use_container_width=True)

# ------------------------------------------------------
# FORECASTING
# ------------------------------------------------------
def forecast_future(model, df, days):
    """Roll the model forward ``days`` steps, one day at a time.

    Each step copies the last row of the running history, advances its 'day'
    by one day, recomputes whichever feature groups are enabled by the
    module-level USE_LAGS / USE_ROLLING / USE_TIME_FEATURES flags from the
    history so far, predicts 'count' for that row and appends it, so later
    steps see earlier predictions.

    Returns the last ``days`` rows of the extended history.
    """
    non_feature_cols = ['day','mapped_categories','topic','count']
    history = df.copy()

    for _step in range(days):
        row = history.iloc[-1:].copy()
        upcoming = row['day'].values[0] + np.timedelta64(1,'D')
        row['day'] = upcoming

        if USE_LAGS:
            # Counts 1 and 7 rows back from the row being predicted.
            row['lag_1'] = history['count'].iloc[-1]
            row['lag_7'] = history['count'].iloc[-7]

        if USE_ROLLING:
            row['rolling_mean_7'] = history['count'].tail(7).mean()

        if USE_TIME_FEATURES:
            stamp = pd.to_datetime(upcoming)
            row['dayofweek'] = stamp.dayofweek
            row['day_num'] = stamp.day
            row['month'] = stamp.month

        model_input = row.drop(columns=non_feature_cols)
        row['count'] = model.predict(model_input)[0]
        history = pd.concat([history, row])

    return history.tail(days)

# Predicted rows for the next FORECAST_DAYS days.
forecast = forecast_future(model, df_model, FORECAST_DAYS)

# ------------------------------------------------------
# FORECAST PLOT
# ------------------------------------------------------
st.subheader("Topic Forecast")

# Historical counts as the base line, with the forecast overlaid as a second trace.
forecast_axis_labels = {'x':'Date','y':'Topic Count'}
fig_forecast = px.line(
    x=df_model['day'],
    y=df_model['count'],
    labels=forecast_axis_labels,
)
fig_forecast.add_scatter(
    x=forecast['day'],
    y=forecast['count'],
    mode='lines+markers',
    name='Forecast',
)
st.plotly_chart(fig_forecast, use_container_width=True)

# ------------------------------------------------------
# DATA TABLE
# ------------------------------------------------------
# Collapsible section showing the daily series for the current selection.
aggregated_view = st.expander("View Aggregated Data")
aggregated_view.dataframe(filtered_ts)


Overwriting app.py


In [10]:
! pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.5.0-py3-none-any.whl (24 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.5.0


In [16]:
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: the ngrok auth token must never be stored in the notebook. A token
# was previously hardcoded on this line; treat it as leaked and revoke it in
# the ngrok dashboard. The token is now taken from the NGROK_AUTH_TOKEN
# environment variable, falling back to an interactive (non-echoing) prompt.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_auth_token)

# Kill any previous tunnels
ngrok.kill()

# Start the Streamlit app in the background, then expose port 8501 through a
# public ngrok tunnel.
os.system("streamlit run app.py &")

public_url = ngrok.connect(8501, "http")
print("Public URL:", public_url)


Public URL: NgrokTunnel: "https://3e5eaf4928f8.ngrok-free.app" -> "http://localhost:8501"
