In [1]:
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


In [2]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove stop words
    text = " ".join([word for word in text.split() if word not in ENGLISH_STOP_WORDS])
    return text


In [3]:
# Load the dataset from JSON
df = pd.read_json('stacktest.json', lines=True)

# Preprocess the 'body' column
df["processed_text"] = df["body"].apply(preprocess_text)


In [4]:
df.head()

Unnamed: 0,id,title,body,accepted_answer_id,answer_count,comment_count,creation_date,last_activity_date,last_edit_date,last_editor_display_name,last_editor_user_id,owner_display_name,owner_user_id,post_type_id,score,tags,view_count,favorite_count,processed_text
0,3247246,Integrate War-Plugin for m2eclipse into Eclips...,<p>I set up a small web project with JSF and M...,3247526.0,2,0,2010-07-14 14:39:48.053 UTC,2010-07-14 16:02:19.683 UTC,2010-07-14 15:56:37.803 UTC,,70604.0,,389430.0,1,2,eclipse|maven-2|tomcat|m2eclipse,1653,,pi set small web project jsf maven want deploy...
1,40270764,phantomjs-node page.evaulate seems to hang,<p>I have an implementation of 'waitfor' with ...,,1,0,2016-10-26 19:35:00.537 UTC,2016-11-02 20:05:09.143 UTC,,,,,245076.0,1,0,node.js|phantomjs,35,,pi implementation waitfor phantomjsnode codesi...
2,27532383,Dynamic operations can only be performed in ho...,<p>I'm working with an API that requires:</p>\...,,1,0,2014-12-17 18:31:18.6 UTC,2014-12-17 19:57:43.443 UTC,,,,,3105880.0,1,1,c#|asp.net-mvc,4372,,pim working api requiresp precodelttrust level...
3,33511888,CSS with relative URL to background image?,<p>I have a file structure of:</p>\n\n<pre><co...,,2,2,2015-11-04 00:50:35.223 UTC,2015-11-04 01:51:03.037 UTC,2015-11-04 01:51:03.037 UTC,,5464492.0,,5464492.0,1,0,css|background-image,406,,pi file structure ofp precodehomehtml imgbgdam...
4,46160163,Share canvas image on android,<p>Hello so I write a small game where in the ...,46160246.0,1,0,2017-09-11 16:19:18.32 UTC,2017-09-11 16:24:12.69 UTC,,,,,8570512.0,1,0,android|canvas|bitmap|share,52,,phello write small game end share result resul...


In [5]:
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000, stop_words='english')

# Fit and transform the processed text
tfidf_matrix = tfidf_vectorizer.fit_transform(df["processed_text"])


In [6]:
def extract_top_keywords(feature_names, tfidf_vector, top_n=10):
    sorted_nzs = np.argsort(tfidf_vector.data)[:-(top_n + 1):-1]
    keywords = [(feature_names[tfidf_vector.indices[i]], tfidf_vector.data[i]) for i in sorted_nzs]
    return keywords

# Get feature names (words) from the TF-IDF vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

# Extract top keywords for each document
df["keywords"] = [extract_top_keywords(feature_names, tfidf_matrix[i], top_n=10) for i in range(tfidf_matrix.shape[0])]


In [8]:
def format_keywords(keywords):
    return ", ".join([f"{word} ({score:.2f})" for word, score in keywords])

df["formatted_keywords"] = df["keywords"].apply(format_keywords)


In [9]:

print(df[["body", "formatted_keywords"]])

                                                  body  \
0    <p>I set up a small web project with JSF and M...   
1    <p>I have an implementation of 'waitfor' with ...   
2    <p>I'm working with an API that requires:</p>\...   
3    <p>I have a file structure of:</p>\n\n<pre><co...   
4    <p>Hello so I write a small game where in the ...   
..                                                 ...   
495  <p>Is there any .NET string.format compatible ...   
496  <p>I am parsing a piece of XML returned from a...   
497  <p>I'm trying to have a cool little animation ...   
498  <p>I was wondering if there is a way to introd...   
499  <p>I am trying to set FB SDK to work in my loc...   

                                    formatted_keywords  
0    project (0.27), eclipsem2eclipsep (0.24), tomc...  
1    evaluate (0.32), return (0.27), phinstanceexit...  
2    vs (0.32), dynamic (0.32), requiresp (0.19), w...  
3    urlimgbgdamask1jpg (0.46), backgroundimage (0....  
4    uri (0.35), s