In [1]:
import pandas as pd
df = pd.read_csv("news_clean.csv")
df.head()

Unnamed: 0,Article,Date,Heading,NewsType,AH,year,month,day,sentiment,emotion,Location,article_clean,heading_clean,location_reported,location_extracted
0,KARACHI: The Sindh government has decided to b...,2015-01-01,sindh govt decides to cut public transport far...,business,,2015,1,1,,,,karachi the sindh government has decided to br...,sindh govt decides to cut public transport far...,UNKNOWN,UNKNOWN
1,HONG KONG: Asian markets started 2015 on an up...,2015-01-02,asia stocks up in new year trad,business,,2015,1,2,,,,hong kong asian markets started on an upswing ...,asia stocks up in new year trad,UNKNOWN,asia
2,HONG KONG: Hong Kong shares opened 0.66 perce...,2015-01-05,hong kong stocks open 0.66 percent lower,business,,2015,1,5,,,,hong kong hong kong shares opened percent lowe...,hong kong stocks open percent lower,UNKNOWN,hong kong
3,HONG KONG: Asian markets tumbled Tuesday follo...,2015-01-06,asian stocks sink euro near nine year,business,,2015,1,6,,,,hong kong asian markets tumbled tuesday follow...,asian stocks sink euro near nine year,UNKNOWN,UNKNOWN
4,NEW YORK: US oil prices Monday slipped below $...,2015-01-06,us oil prices slip below 50 a barr,business,,2015,1,6,,,,new york us oil prices monday slipped below a ...,us oil prices slip below a barr,UNKNOWN,us


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

text_corpus = (
    df["heading_clean"].fillna("") + " " +
    df["article_clean"].fillna("")
)

tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(text_corpus)

X_tfidf.shape

(2692, 3000)

In [3]:
from sklearn.ensemble import IsolationForest

iso = IsolationForest(
    n_estimators=200,
    contamination=0.05,
    random_state=42
)

iso.fit(X_tfidf)

df["anomaly_label"] = iso.predict(X_tfidf)
df["anomaly_score"] = -iso.decision_function(X_tfidf)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

text_corpus = (
    df["heading_clean"].fillna("") + " " +
    df["article_clean"].fillna("")
)

tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(text_corpus)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np

# Boolean mask as NumPy array
mask = df["location_extracted"].ne("UNKNOWN").values

# Filter X and y using the same mask
X_loc = X_tfidf[mask]
y_loc = df.loc[mask, "location_extracted"]

X_train, X_test, y_train, y_test = train_test_split(
    X_loc, y_loc,
    test_size=0.2,
    random_state=42,
    stratify=y_loc
)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np

mask = df["location_extracted"].ne("UNKNOWN").values

X_loc = X_tfidf[mask]
y_loc = df.loc[mask, "location_extracted"]

X_train, X_test, y_train, y_test = train_test_split(
    X_loc, y_loc,
    test_size=0.2,
    random_state=42   # ðŸ‘ˆ NO stratify here
)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

                  precision    recall  f1-score   support

     Afghanistan       0.00      0.00      0.00         1
            Afri       0.00      0.00      0.00         1
            Apri       0.00      0.00      0.00         1
Arcelik Pakistan       0.00      0.00      0.00         1
       Argentina       0.67      1.00      0.80         2
            Asia       0.57      0.67      0.62         6
         Asia Cu       0.00      0.00      0.00         1
       Australia       0.64      1.00      0.78         9
      Bangladesh       1.00      0.50      0.67         8
         Beijing       0.00      0.00      0.00         1
         Belgium       0.00      0.00      0.00         2
        Brisbane       0.00      0.00      0.00         1
         Britain       0.00      0.00      0.00         3
           Chile       0.00      0.00      0.00         1
           China       0.67      1.00      0.80         6
        Colombia       0.00      0.00      0.00         1
            C

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
location_pred = clf.predict(X_tfidf)
df["location_predicted"] = location_pred

df["location_mismatch"] = df["location_predicted"] != df["Location"]
df["is_suspicious"] = (df["anomaly_label"] == -1) | (df["location_mismatch"])

In [11]:
df[[
    "Heading", "location_extracted", "location_predicted",
    "anomaly_score", "is_suspicious"
]].head(10)

Unnamed: 0,Heading,location_extracted,location_predicted,anomaly_score,is_suspicious
0,sindh govt decides to cut public transport far...,UNKNOWN,Pakistan,-0.033961,True
1,asia stocks up in new year trad,asia,asia,-0.007987,True
2,hong kong stocks open 0.66 percent lower,hong kong,Pakistan,-0.040761,True
3,asian stocks sink euro near nine year,UNKNOWN,US,-0.005093,True
4,us oil prices slip below 50 a barr,us,US,0.002804,True
5,oil hits new 5.5 year lows as saudis defend,UNKNOWN,US,-0.009039,True
6,bullish kse jumps over 33000 psychological bar...,UNKNOWN,Pakistan,-0.023379,True
7,oil falls further in asian trad,UNKNOWN,US,-0.009435,True
8,sugar prices drop to rs 49.80 in sind,UNKNOWN,Pakistan,-0.033315,True
9,oil extends losses as world bank cuts growth for,UNKNOWN,US,-0.002587,True


In [12]:
import joblib

joblib.dump(tfidf, "vectorizer_tfidf.pkl")
joblib.dump(iso, "model_isolation_forest.pkl")
joblib.dump(clf, "model_location_classifier.pkl")

df.to_csv("news_with_flags.csv", index=False)