In [1]:

import pandas as pd
import os
import glob

# Set the folder path
folder_path = r"C:\Users\ilama\Documents\topic_modeling_BBC"

# Get all CSV files in the folder
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

# Read and combine all CSV files
df_list = [pd.read_csv(file) for file in csv_files]
combined_df = pd.concat(df_list, ignore_index=True)

# Preview the result
combined_df.head()


Unnamed: 0,title,link,published,description,category,text,article,date,panda_date,headline,summary,url,pubDate,guid,labels,percentage
0,Gas prices soar as Russia cuts German supply,https://www.bbc.co.uk/news/business-62318376?a...,"Wed, 27 Jul 2022 13:21:41 GMT",The Nord Stream 1 pipeline is now operating at...,business,,,,,,,,,,,
1,McDonald's puts up price of cheeseburger for f...,https://www.bbc.co.uk/news/business-62317453?a...,"Wed, 27 Jul 2022 13:07:59 GMT",The fast food firm increases the price for the...,business,,,,,,,,,,,
2,Train strikes: 'I'm missing the football becau...,https://www.bbc.co.uk/news/business-62309668?a...,"Wed, 27 Jul 2022 10:53:22 GMT",Tens of thousands of rail workers are walking ...,business,,,,,,,,,,,
3,Vow to tackle long waits to make a complaint,https://www.bbc.co.uk/news/business-62308181?a...,"Tue, 26 Jul 2022 23:04:12 GMT",The financial regulator says new rules should ...,business,,,,,,,,,,,
4,MPs call for debt repayments holiday for benef...,https://www.bbc.co.uk/news/business-62312313?a...,"Wed, 27 Jul 2022 03:16:49 GMT",Households struggling through the cost-of-livi...,business,,,,,,,,,,,


In [3]:
import pandas as pd
import os
import glob
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK data (only once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Setup
folder_path = r"C:\Users\ilama\Documents\topic_modeling_BBC"
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

# Load all CSVs into one DataFrame
df_list = [pd.read_csv(file) for file in csv_files]
df = pd.concat(df_list, ignore_index=True)

# Print shape and columns
print(f"Loaded {df.shape[0]} rows from {len(csv_files)} files.")
print("Columns available:", df.columns)

# ---- STEP 1: Identify the text column ----
# Try to find a column with article text (adjust as needed)
possible_text_cols = [col for col in df.columns if "text" in col.lower() or "article" in col.lower()]
print("Possible text columns:", possible_text_cols)

# Select the first matching column (change if needed)
text_column = possible_text_cols[0] if possible_text_cols else "text"

# ---- STEP 2: NLP Text Cleaning ----
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www.\S+", "", text)  # remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)      # remove punctuation/numbers
    text = nltk.word_tokenize(text)
    text = [word for word in text if word not in stop_words and len(word) > 2]  # remove stopwords and short words
    text = [lemmatizer.lemmatize(word) for word in text]
    return " ".join(text)

# Apply cleaning
df['clean_text'] = df[text_column].apply(clean_text)

# Preview cleaned data
print(df[['clean_text']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ilama\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ilama\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ilama\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loaded 41486 rows from 5 files.
Columns available: Index(['title', 'link', 'published', 'description', 'category', 'text',
       'article', 'date', 'panda_date', 'headline', 'summary', 'url',
       'pubDate', 'guid', 'labels', 'percentage'],
      dtype='object')
Possible text columns: ['text', 'article']
  clean_text
0           
1           
2           
3           
4           


In [5]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Use cleaned text column
corpus = df['clean_text'].values.astype('U')  # make sure all text is unicode

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # You can change max_features as needed
X = vectorizer.fit_transform(corpus)

# Convert to DataFrame for inspection (optional)
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Show shape and preview
print("TF-IDF shape:", tfidf_df.shape)
print(tfidf_df.head())


TF-IDF shape: (41486, 5000)
   aaa  abandoned  abbas  abc  ability  able  abn  abortion  abroad  absence  \
0  0.0        0.0    0.0  0.0      0.0   0.0  0.0       0.0     0.0      0.0   
1  0.0        0.0    0.0  0.0      0.0   0.0  0.0       0.0     0.0      0.0   
2  0.0        0.0    0.0  0.0      0.0   0.0  0.0       0.0     0.0      0.0   
3  0.0        0.0    0.0  0.0      0.0   0.0  0.0       0.0     0.0      0.0   
4  0.0        0.0    0.0  0.0      0.0   0.0  0.0       0.0     0.0      0.0   

   ...  yuan  yugansk  yuganskneftegas  yukos  yushchenko  zealand  zero  \
0  ...   0.0      0.0              0.0    0.0         0.0      0.0   0.0   
1  ...   0.0      0.0              0.0    0.0         0.0      0.0   0.0   
2  ...   0.0      0.0              0.0    0.0         0.0      0.0   0.0   
3  ...   0.0      0.0              0.0    0.0         0.0      0.0   0.0   
4  ...   0.0      0.0              0.0    0.0         0.0      0.0   0.0   

   zombie  zone  zurich  
0     0.

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Step 1: Convert text to CountVectorizer (LDA prefers count data)
count_vectorizer = CountVectorizer(max_features=5000, stop_words='english')
X_counts = count_vectorizer.fit_transform(df['clean_text'])

# Step 2: Fit LDA Model
num_topics = 5  # Change this to how many topics you want
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(X_counts)

# Step 3: Show top words for each topic
def print_topics(model, vectorizer, top_n=10):
    terms = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print(f"\n🟦 Topic #{idx+1}:")
        print(" ".join([terms[i] for i in topic.argsort()[-top_n:][::-1]]))

print_topics(lda, count_vectorizer)



🟦 Topic #1:
said game player win england year time world play club

🟦 Topic #2:
said year company market government firm economy bank sale price

🟦 Topic #3:
said labour party government election people blair minister say tory

🟦 Topic #4:
film year said game best award music new star time

🟦 Topic #5:
said people technology service phone firm user company new computer


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Step 1: Convert text to CountVectorizer (LDA prefers count data)
count_vectorizer = CountVectorizer(max_features=5000, stop_words='english')
X_counts = count_vectorizer.fit_transform(df['clean_text'])

# Step 2: Fit LDA Model
num_topics = 5  # Change this to how many topics you want
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(X_counts)

# Step 3: Show top words for each topic
def print_topics(model, vectorizer, top_n=10):
    terms = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print(f"\n🟦 Topic #{idx+1}:")
        print(" ".join([terms[i] for i in topic.argsort()[-top_n:][::-1]]))

# Show topics found
print_topics(lda, count_vectorizer)




🟦 Topic #1:
said game player win england year time world play club

🟦 Topic #2:
said year company market government firm economy bank sale price

🟦 Topic #3:
said labour party government election people blair minister say tory

🟦 Topic #4:
film year said game best award music new star time

🟦 Topic #5:
said people technology service phone firm user company new computer


In [11]:
import os
print(os.getcwd())



C:\Users\ilama


In [13]:
import os

# Create the folder if it doesn't exist
folder_path = r"C:\Users\ilama\Documents\topic_modeling_BBC"
os.makedirs(folder_path, exist_ok=True)



In [15]:
import joblib

# Save the LDA model and CountVectorizer in the specified folder
lda_model_path = r"C:\Users\ilama\Documents\topic_modeling_BBC\lda_model.pkl"
count_vectorizer_path = r"C:\Users\ilama\Documents\topic_modeling_BBC\count_vectorizer.pkl"

# Save models
joblib.dump(lda, lda_model_path)
joblib.dump(count_vectorizer, count_vectorizer_path)

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [17]:
# Load the LDA model and CountVectorizer from the folder
lda = joblib.load(r"C:\Users\ilama\Documents\topic_modeling_BBC\lda_model.pkl")
count_vectorizer = joblib.load(r"C:\Users\ilama\Documents\topic_modeling_BBC\count_vectorizer.pkl")

print("Model and vectorizer loaded successfully!")


Model and vectorizer loaded successfully!


In [19]:
code = '''import streamlit as st
import pickle
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load saved models
lda_model = pickle.load(open("lda_model.pkl", "rb"))
vectorizer = pickle.load(open("vectorizer.pkl", "rb"))

def preprocess_text(text):
    text = re.sub(r'\\W+', ' ', text.lower())
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

st.title("BBC News Topic Modeling")

user_input = st.text_area("Enter BBC News Article Here:")

if st.button("Predict Topic"):
    if user_input.strip() == "":
        st.warning("Please enter some text.")
    else:
        clean = preprocess_text(user_input)
        vec = vectorizer.transform([clean])
        topic_number = lda_model.transform(vec).argmax()
        st.success(f"Predicted Topic: Topic {topic_number + 1}")
'''

with open('app.py', 'w') as f:
    f.write(code)



In [21]:

import joblib
from sklearn.feature_extraction.text import CountVectorizer

# Create and fit the vectorizer
vectorizer = CountVectorizer(stop_words='english')
# Fit the vectorizer to your text data (replace `corpus` with your actual data)
corpus = ["example text data", "another article", "etc..."]
vectorizer.fit(corpus)

# Save the vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')


['vectorizer.pkl']

In [23]:

joblib.dump(lda_model, 'lda_model.pkl')



NameError: name 'lda_model' is not defined

In [25]:
from sklearn.decomposition import LatentDirichletAllocation

# Transform the corpus using the vectorizer
doc_term_matrix = vectorizer.transform(corpus)

# Train the LDA model
lda_model = LatentDirichletAllocation(n_components=3, random_state=42)
lda_model.fit(doc_term_matrix)


In [27]:
joblib.dump(lda_model, 'lda_model.pkl')


['lda_model.pkl']

In [29]:
joblib.dump(vectorizer, 'vectorizer.pkl')


['vectorizer.pkl']

In [31]:
lda_model = joblib.load('lda_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')




In [33]:
import joblib
from sklearn.feature_extraction.text import CountVectorizer  # Or TfidfVectorizer

# Create the vectorizer
vectorizer = CountVectorizer(stop_words='english')

# Example: Fit the vectorizer with some text data (replace `corpus` with your actual data)
corpus = ["This is a sample text", "Another news article here", "Text data to fit vectorizer"]
vectorizer.fit(corpus)

# Save the vectorizer to a file
joblib.dump(vectorizer, 'vectorizer.pkl')


['vectorizer.pkl']

In [35]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import joblib

# Load the dataset
df = pd.read_csv("bbc-text (1).csv")  # adjust if using another file

# Vectorize the text
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])

# Fit the LDA model
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(X)

# ✅ Save the vectorizer and model
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(lda_model, 'lda_model.pkl')

print("✅ Saved vectorizer.pkl and lda_model.pkl successfully.")


✅ Saved vectorizer.pkl and lda_model.pkl successfully.


In [37]:
joblib.dump(lda_model, 'lda_model.pkl')


['lda_model.pkl']

In [39]:
# Display top keywords per topic for manual labeling
feature_names = vectorizer.get_feature_names_out()

for idx, topic in enumerate(lda_model.components_):
    print(f"\n🔹 Topic {idx} Top Keywords:")
    print(", ".join([feature_names[i] for i in topic.argsort()[:-11:-1]]))



🔹 Topic 0 Top Keywords:
said, year, company, market, mr, new, growth, economy, sales, 2004

🔹 Topic 1 Top Keywords:
said, game, england, time, year, win, world, club, play, just

🔹 Topic 2 Top Keywords:
said, mr, people, government, labour, new, election, party, blair, told

🔹 Topic 3 Top Keywords:
music, band, said, album, song, best, years, london, rock, singer

🔹 Topic 4 Top Keywords:
film, said, best, year, music, mobile, people, tv, new, digital


In [41]:
# Fit your LDA model
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(X_train)  # or X_vectorized, etc.

# Display top keywords per topic
feature_names = vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda_model.components_):
    print(f"\n🔹 Topic {idx} Top Keywords:")
    print(", ".join([feature_names[i] for i in topic.argsort()[:-11:-1]]))

# Save the model and vectorizer
import joblib
joblib.dump(lda_model, 'lda_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')


NameError: name 'X_train' is not defined

In [43]:
# Step 1: Import Libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import joblib

# Step 2: Load Dataset
df = pd.read_csv("bbc-text (1).csv")  # Replace with your correct CSV file if needed
print("Dataset Loaded. Sample:")
print(df.head())

# Step 3: Vectorize Text (LDA requires CountVectorizer, not TF-IDF)
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(df['text'])

# Step 4: Fit LDA Model
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(X_train)

# Step 5: Print Top Keywords Per Topic (for manual labeling)
feature_names = vectorizer.get_feature_names_out()

for idx, topic in enumerate(lda_model.components_):
    print(f"\n🔹 Topic {idx} Top Keywords:")
    print(", ".join([feature_names[i] for i in topic.argsort()[:-11:-1]]))

# Step 6: Save LDA model and Vectorizer
joblib.dump(lda_model, 'lda_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

print("\n✅ LDA model and vectorizer saved successfully!")


Dataset Loaded. Sample:
        category                                               text
0           tech  tv future in the hands of viewers with home th...
1       business  worldcom boss  left books alone  former worldc...
2          sport  tigers wary of farrell  gamble  leicester say ...
3          sport  yeading face newcastle in fa cup premiership s...
4  entertainment  ocean s twelve raids box office ocean s twelve...

🔹 Topic 0 Top Keywords:
said, year, company, market, mr, new, growth, economy, sales, 2004

🔹 Topic 1 Top Keywords:
said, game, england, time, year, win, world, club, play, just

🔹 Topic 2 Top Keywords:
said, mr, people, government, labour, new, election, party, blair, told

🔹 Topic 3 Top Keywords:
music, band, said, album, song, best, years, london, rock, singer

🔹 Topic 4 Top Keywords:
film, said, best, year, music, mobile, people, tv, new, digital

✅ LDA model and vectorizer saved successfully!
