In [1]:
# Run this once in a new Colab runtime
!pip install -q scikit-learn pandas scipy


In [2]:
# Option A: Mount Google Drive (recommended if file is big)
from google.colab import drive
drive.mount('/content/drive')
# Now set input_path to your file on Drive, e.g.:
# input_path = '/content/drive/MyDrive/your_folder/bd_eng_news_daily.csv'

# Option B: Upload from local (small files)
from google.colab import files
# uploaded = files.upload()  # Uncomment to upload interactively
# After uploading, get filename via: list(uploaded.keys())[0]


Mounted at /content/drive


In [3]:

input_path = 'bd_eng_news_daily.csv'


In [4]:
import os
import json
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans

# Helper: attempt a few encodings
def read_csv_try_encodings(path):
    for enc in ("utf-8", "latin1", "cp1252"):
        try:
            return pd.read_csv(path, encoding=enc)
        except Exception:
            pass
    return pd.read_csv(path)  # last attempt (may raise)

# Detect best text column heuristically
def detect_text_column(df):
    text_cols = df.select_dtypes(include=['object']).columns.tolist()
    if not text_cols:
        raise ValueError("No object/text columns found.")
    preferred_names = ['text','content','article','body','headline','title','news','summary']
    for name in preferred_names:
        for c in text_cols:
            if name in c.lower():
                return c
    # fallback: choose column with largest average length
    avg_len = {c: df[c].dropna().astype(str).map(len).mean() for c in text_cols}
    return max(avg_len, key=avg_len.get)

def choose_clustering_algo(n_samples):
    return MiniBatchKMeans if n_samples > 5000 else KMeans

def get_top_k_indices_per_cluster(X, clusters, centroids, k):
    # X: sparse matrix (n_samples x n_features)
    # centroids: dense (n_clusters, n_features)
    centroid_norms = np.linalg.norm(centroids, axis=1)
    top_indices = {}
    for c in range(centroids.shape[0]):
        idx = np.where(clusters == c)[0]
        if idx.size == 0:
            top_indices[c] = []
            continue
        Xc = X[idx]
        centroid = centroids[c]
        denom = centroid_norms[c] if centroid_norms[c] > 0 else 1.0
        sims = Xc.dot(centroid) / denom
        sims = np.asarray(sims).ravel()
        top_k = min(k, sims.shape[0])
        order = np.argsort(-sims)[:top_k]
        top_indices[c] = idx[order].tolist()
    return top_indices

def cluster_and_sample(
    input_path,
    text_col=None,
    n_clusters=5,
    samples_per_cluster=3,
    max_features=10000,
    output_prefix='cluster_run'
):
    print("Loading CSV:", input_path)
    df = read_csv_try_encodings(input_path)
    if text_col is None:
        text_col = detect_text_column(df)
        print("Auto-detected text column:", text_col)
    else:
        print("Using provided text column:", text_col)
    texts = df[text_col].fillna('').astype(str)
    nonempty_mask = texts.str.strip().astype(bool)
    if nonempty_mask.sum() < max(2, n_clusters):
        raise ValueError("Not enough non-empty texts to cluster into the requested number of clusters.")
    texts_nonempty = texts[nonempty_mask]
    orig_indices = texts_nonempty.index.to_numpy()

    print("TF-IDF vectorization (max_features =", max_features, ") ...")
    tf = TfidfVectorizer(max_features=max_features, stop_words='english', ngram_range=(1,2))
    X = tf.fit_transform(texts_nonempty)

    Cls = choose_clustering_algo(X.shape[0])
    print("Clustering with", Cls.__name__, "into", n_clusters, "clusters ...")
    if Cls is MiniBatchKMeans:
        model = Cls(n_clusters=n_clusters, random_state=42, batch_size=10000, n_init=5)
    else:
        model = Cls(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = model.fit_predict(X)

    try:
        centroids = model.cluster_centers_
    except Exception:
        centroids = model.cluster_centers_
    print("Selecting representative samples ...")
    top_idx_map = get_top_k_indices_per_cluster(X, clusters, centroids, samples_per_cluster)

    # Build sample dataframe
    samples = []
    for c in sorted(top_idx_map.keys()):
        for idx_in_X in top_idx_map[c]:
            orig_idx = orig_indices[idx_in_X]
            row = df.loc[orig_idx].copy()
            preview_text = str(row[text_col])
            preview = preview_text if len(preview_text) <= 400 else preview_text[:400] + '...'
            row_data = {
                "_cluster": c,
                "original_row_index": orig_idx,
                text_col: row[text_col],
                "_preview": preview
            }
            other_cols = [cname for cname in df.columns if cname != text_col][:2]
            for oc in other_cols:
                row_data[oc] = row.get(oc, "")
            samples.append(row_data)
    samples_df = pd.DataFrame(samples)

    sample_out = f"{output_prefix}_cluster_samples.csv"
    samples_df.to_csv(sample_out, index=False)
    print("Saved sample file:", sample_out)

    # full dataset with assignments
    df_with_cluster = df.copy()
    df_with_cluster["_cluster_full"] = -1
    df_with_cluster.loc[orig_indices, "_cluster_full"] = clusters
    cluster_out = f"{output_prefix}_with_clusters.csv"
    df_with_cluster.to_csv(cluster_out, index=False)
    print("Saved full dataset with clusters:", cluster_out)

    counts = Counter(clusters)
    print("Cluster counts (non-empty rows):")
    for c in range(n_clusters):
        print(f"  Cluster {c}: {counts.get(c,0)} rows")

    # return paths and small objects for interactive follow-up
    return {
        "samples_path": os.path.abspath(sample_out),
        "clusters_path": os.path.abspath(cluster_out),
        "samples_df": samples_df,
        "df_with_cluster": df_with_cluster,
        "detected_text_col": text_col
    }


In [6]:
# Robust CSV loader + automatic repair for "EOF inside string" problems,
# then call cluster_and_sample(...) defined earlier in the notebook.
import os, csv
import pandas as pd

input_path = '/content/bd_eng_news_daily.csv'   # change if needed
clean_path = '/content/bd_eng_news_daily.cleaned.csv'

def try_read_variants(path):
    """Try a few tolerant pandas read_csv variants. Returns (df, method_name) or (None, None)."""
    variants = [
        {"engine":"python", "sep":",", "quoting":csv.QUOTE_NONE, "on_bad_lines":"skip", "escapechar":"\\"},
        {"engine":"python", "sep":",", "quoting":csv.QUOTE_MINIMAL, "on_bad_lines":"skip", "escapechar":"\\"},
        {"engine":"python", "sep":",", "quoting":csv.QUOTE_ALL, "on_bad_lines":"skip", "escapechar":"\\"},
        {"engine":"python", "sep":",", "on_bad_lines":"skip"},
    ]
    for opts in variants:
        try:
            print("Trying pandas.read_csv with options:", opts)
            df = pd.read_csv(path, **opts)
            print("Success with variant:", opts)
            return df, f"pandas_variant_{opts.get('quoting','default')}"
        except Exception as e:
            print("Failed variant:", opts, "->", repr(e))
    return None, None

def repair_unbalanced_quotes(inpath, outpath, quotechar='"'):
    """
    Simple repair: reads input line-by-line and accumulates lines until the number of
    quote characters in the buffer is even (balanced). Then writes buffer as one line
    to the output. This merges broken multi-line fields into single lines.
    NOTE: This is a heuristic — may not be perfect but works for many CSVs with
    stray newlines inside quoted fields.
    """
    print("Attempting manual repair of CSV by balancing quote counts...")
    with open(inpath, 'r', encoding='utf-8', errors='replace') as fin, \
         open(outpath, 'w', encoding='utf-8') as fout:
        buffer = ""
        total_in = 0
        total_out = 0
        for line in fin:
            total_in += 1
            buffer += line
            # count quotechar occurrences in buffer
            if buffer.count(quotechar) % 2 == 0:
                fout.write(buffer)
                total_out += 1
                buffer = ""
        # if leftover buffer exists, write it anyway
        if buffer:
            fout.write(buffer)
            total_out += 1
    print(f"Repair done. Lines in: {total_in}, lines out (after merge): {total_out}")
    return outpath

# 1) Try tolerant pandas reads first
df, method = try_read_variants(input_path)
if df is not None:
    print("Loaded CSV successfully with method:", method)
    used_path = input_path
else:
    # 2) Repair CSV and try reading repaired file
    repaired = repair_unbalanced_quotes(input_path, clean_path)
    try:
        print("Trying to read repaired CSV...")
        df = pd.read_csv(repaired, engine='python', on_bad_lines='skip')
        print("Loaded repaired CSV successfully.")
        used_path = repaired
    except Exception as e:
        print("Failed to read repaired CSV:", repr(e))
        raise RuntimeError("Unable to robustly parse CSV with automatic repair. "
                           "If this fails, consider opening the file in a text editor or Excel to inspect row ~12606 noted in the original error.")

# quick diagnostics
print("Dataframe shape:", df.shape)
print("Columns:", list(df.columns)[:30])
print("Showing first 3 rows:")
display(df.head(3))

# Save cleaned copy (so later cells can reuse it)
if used_path != clean_path:
    # if original read worked, still save a cleaned copy to avoid re-parsing problems later
    try:
        df.to_csv(clean_path, index=False)
        print("Saved a cleaned copy to:", clean_path)
        used_path = clean_path
    except Exception as e:
        print("Could not save cleaned copy:", e)

# Now call the clustering pipeline (cluster_and_sample) on the cleaned file.
# Make sure cluster_and_sample(...) is already defined in the notebook (from previous cell).
try:
    print("Calling cluster_and_sample on:", used_path)
    res = cluster_and_sample(
        input_path=used_path,
        text_col=None,   # auto-detect; set a string if you want to force a column name
        n_clusters=5,
        samples_per_cluster=3,
        max_features=10000,
        output_prefix='/content/cluster_run'   # writes outputs to /content/
    )
    print("cluster_and_sample finished. Sample and cluster files saved.")
    display(res['samples_df'])
except NameError as ne:
    print("cluster_and_sample not defined in this notebook. Please run the earlier cell that defines cluster_and_sample(), then re-run this cell.")
except Exception as e:
    print("cluster_and_sample failed:", repr(e))
    raise


Trying pandas.read_csv with options: {'engine': 'python', 'sep': ',', 'quoting': 3, 'on_bad_lines': 'skip', 'escapechar': '\\'}
Success with variant: {'engine': 'python', 'sep': ',', 'quoting': 3, 'on_bad_lines': 'skip', 'escapechar': '\\'}
Loaded CSV successfully with method: pandas_variant_3
Dataframe shape: (168520, 7)
Columns: ['Unnamed: 0', 'title', 'text', 'publish_date', 'urls', 'news_collection_time', 'publisher']
Showing first 3 rows:


Unnamed: 0.1,Unnamed: 0,title,text,publish_date,urls,news_collection_time,publisher
0,4,Govt may let its employees trade in shares,"""Says draft amendment to government servants r...",,,,
1,The public administration ministry is likely t...,a reversal of a decades-old rule.,,,,,
2,The ministry sent a draft amendment to the Gov...,1979,to the law ministry,which vetted and returned it,sources said.,,


Saved a cleaned copy to: /content/bd_eng_news_daily.cleaned.csv
Calling cluster_and_sample on: /content/bd_eng_news_daily.cleaned.csv
Loading CSV: /content/bd_eng_news_daily.cleaned.csv
Auto-detected text column: text
TF-IDF vectorization (max_features = 10000 ) ...
Clustering with MiniBatchKMeans into 5 clusters ...
Selecting representative samples ...
Saved sample file: /content/cluster_run_cluster_samples.csv
Saved full dataset with clusters: /content/cluster_run_with_clusters.csv
Cluster counts (non-empty rows):
  Cluster 0: 2364 rows
  Cluster 1: 89522 rows
  Cluster 2: 1210 rows
  Cluster 3: 979 rows
  Cluster 4: 1839 rows
cluster_and_sample finished. Sample and cluster files saved.


Unnamed: 0.1,_cluster,original_row_index,text,_preview,Unnamed: 0,title
0,0,106277,""""" he said.",""""" he said.","""""When Pabna sugar mill was running",I was very much interested about sugarcane cu...
1,0,105844,said,said,Atiqul Karim Apel,chairman of Bhandarbari Union Parishad (UP) i...
2,0,105845,said,said,Humayun Kabir,sub-divisional engineer of the Water Developm...
3,1,91599,https://www.thedailystar.net/country/news/ferd...,https://www.thedailystar.net/country/news/ferd...,Ferdousi is not just working to educate childr...,2020-03-12 00:00:00
4,1,123498,https://www.thedailystar.net/country/news/ferd...,https://www.thedailystar.net/country/news/ferd...,Ferdousi is not just working to educate childr...,2020-03-12 00:00:00
5,1,30077,https://www.thedailystar.net/country/news/ferd...,https://www.thedailystar.net/country/news/ferd...,Ferdousi is not just working to educate childr...,2020-03-11 18:00:00
6,2,94993,New Jersey,New Jersey,The Yonkers,New York-born Such was a veteran figure in th...
7,2,17740,such as interviewing new hires,such as interviewing new hires,Third,the emergence of ChatGPT and other generative...
8,2,45709,"""""There's nothing new","""""There's nothing new",As journalists drew his attention to Jamaat ca...,he said
9,3,76171,it's not your day to shine.,it's not your day to shine.,This syndrome often stems from the mistaken be...,pulling off mind-numbingly outrageous dance n...


In [7]:
# Edit these parameters if you want:
input_path = input_path  # from previous cell
text_col = None          # set to 'article' or 'content' etc if you want to force a column
n_clusters = 5
samples_per_cluster = 3
max_features = 10000
output_prefix = 'cluster_run'  # files will be saved as cluster_run_*.csv

res = cluster_and_sample(
    input_path=input_path,
    text_col=text_col,
    n_clusters=n_clusters,
    samples_per_cluster=samples_per_cluster,
    max_features=max_features,
    output_prefix=output_prefix
)

# show the sample dataframe (3 examples per cluster)
res['samples_df']


Loading CSV: /content/bd_eng_news_daily.csv
Auto-detected text column: text
TF-IDF vectorization (max_features = 10000 ) ...
Clustering with MiniBatchKMeans into 5 clusters ...
Selecting representative samples ...
Saved sample file: cluster_run_cluster_samples.csv
Saved full dataset with clusters: cluster_run_with_clusters.csv
Cluster counts (non-empty rows):
  Cluster 0: 2889 rows
  Cluster 1: 5458 rows
  Cluster 2: 2942 rows
  Cluster 3: 418 rows
  Cluster 4: 1967 rows


Unnamed: 0.1,_cluster,original_row_index,text,_preview,Unnamed: 0,title
0,0,8178,BNP senior leader Khandakar Mosharraf Hossain ...,BNP senior leader Khandakar Mosharraf Hossain ...,4416,PM fretting over Fakhrul’s UN visit: BNP
1,0,10832,BNP secretary general Mirza Fakhrul Islam Alam...,BNP secretary general Mirza Fakhrul Islam Alam...,3006,"Upcoming election a big challenge for AL, says..."
2,0,2974,Awami League (AL) General Secretary and Road T...,Awami League (AL) General Secretary and Road T...,1496,BNP is daydreaming of creating another one-ele...
3,1,12537,"Shasha Denims, a garment exporter, had plans t...","Shasha Denims, a garment exporter, had plans t...",1074,Businesses stall investment amid high interest...
4,1,6248,An elderly man showing a token that mentions h...,An elderly man showing a token that mentions h...,304,More and more people now flocking to OMS trucks
5,1,9596,"Like most other girls, Ayesha Akter also wante...","Like most other girls, Ayesha Akter also wante...",1058,Fighting inflation: Apparel worker's struggle ...
6,2,2360,The Daily Star in association with the Strengt...,The Daily Star in association with the Strengt...,553,"Achieving SDGs 5, 8 & 13 through whole of soci..."
7,2,7315,'Barir Naam Shahana' is director Leesa Gazi's ...,'Barir Naam Shahana' is director Leesa Gazi's ...,276,A HOUSE CALLED SHAHANA
8,2,6494,Present Perspectives and Future Outlook\n\nThe...,Present Perspectives and Future Outlook\n\nThe...,551,SDGs and Youth in Bangladesh
9,3,9404,When the printing press was invented in the 15...,When the printing press was invented in the 15...,813,The rise of AI and leveraging it for employabi...


In [8]:
# To download the produced CSV files to your local machine:
from google.colab import files
files.download(res['samples_path'])   # samples csv
files.download(res['clusters_path'])  # full dataset with _cluster_full


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
# After you examine the sample CSV, prepare a mapping from cluster number -> label name.
# Example mapping (edit to your labels):
mapping = {
    0: "Politics",
    1: "Business",
    2: "Others",
    3: "Revolution",
    4: "Others"
}

# OR load mapping from JSON file if you have one:
# with open('/content/drive/MyDrive/mapping.json','r') as f:
#     mapping = json.load(f)
# Ensure mapping keys are ints if loaded from JSON
mapping = {int(k): v for k,v in mapping.items()}

# Apply mapping to the full dataset saved earlier (or use the df returned)
df_with_cluster = res['df_with_cluster'].copy()
def map_label(val):
    if val == -1:
        return ""
    return mapping.get(int(val), "")

df_with_cluster['labels'] = df_with_cluster['_cluster_full'].apply(map_label)

labels_out = f"{output_prefix}_with_labels.csv"
df_with_cluster.to_csv(labels_out, index=False)
print("Saved labeled dataset:", labels_out)

# Download if you want:
from google.colab import files
files.download(labels_out)


Saved labeled dataset: cluster_run_with_labels.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>