'''
This script is adapted from Kavita Ganesan's freeCodeCamp lesson.
https://www.freecodecamp.org/news/how-to-extract-keywords-from-text-with-tf-idf-and-pythons-scikit-learn-b2a0f3d7e667/
'''

In [4]:
import pandas as pd
import json

json_file = input("Enter the file path: ")
with open(json_file, "r") as file:
    json_data = json.load(file)

json_data = pd.DataFrame(json_data)

print("Schema:\n\n", json_data.dtypes)
print("Number of records, columns = ", json_data.shape)
print(json_data.head())

Schema:

 issue_date    object
fulltext      object
dtype: object
Number of records, columns =  (1263, 2)
   issue_date                                           fulltext
0  1891-01-01  arine review. * - v0u; 111, cleveland, ohio, t...
1  1891-01-08  ; t t i ; t i arine review. not... bide clevel...
2  1891-01-15  i e i & , vou 111, marine review. cleveland, o...
3  1891-01-22  3 r be pertti ‘where the island house, of tole...
4  1891-01-29  marine review. mot ul ceeveland, ohio, thursda...


In [5]:
import re
def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("</?.*?>"," <> ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    return text

json_data['fulltext'] = json_data['fulltext'].apply(lambda x:pre_process(x))

#show the first 'text'
json_data['fulltext'][2:10]

2    i e i vou marine review cleveland ohio thursda...
3     r be pertti where the island house of toledo ...
4    marine review mot ul ceeveland ohio thursday j...
5     f hodge co detroit you ul engines for the c p...
6    marine review little faith in census figures o...
7    marine review vou fil cleveland ohio thursday ...
8     von le government work will give some help to...
9    marine review wot cleveland ohio thursday marc...
Name: fulltext, dtype: object

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
import re

def get_stop_words(stop_file_path):
    """load stop words """
    
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)

#load a set of stop words
stopwords=get_stop_words("stopwords.txt")
#get the text column 
docs=json_data['fulltext'].tolist()

#create a vocabulary of words, 
#ignore words that appear in 85% of documents, 
#eliminate stop words
cv=CountVectorizer(max_df=0.85,stop_words=list(stopwords))
word_count_vector=cv.fit_transform(docs)



In [8]:
word_count_vector.shape
print(stopwords)

frozenset({'excepted', 'such', 'latterest', 'whosoever', 'aught', 'evenser', 'thyself', 'oftenest', 'among', 'six', 'up', 'canst', 'come-ons', 'sup', 'j', 'excepts', 'after', 'apart', 'appeared', 'never', 'neither', 'woulded', 'withal', 's', 'nowheres', 'whereinto', 'oh', 'hast', 'towards', 'downwards', 'oftener', 'q', 'said', 'beyond', 'someone', 'herein', 'used', 'andor', 'idemer', 'athwart', 'dost', 'go', 'appropriates', 'unlikest', 'appear', 'thence', 'therefore', 'aside', 'circa', 'our', 'usually', 'thus', 'summat', 'que', 'therest', 'allest', 'bist', 'because', 'information', 'downs', 'inwardest', 'ought', 'lotted', 'rathe', 'hitherer', 'apartest', 'of', 'co', 'nor', 'between', 'severalest', 'woulding', 'she', 'necessarier', 'mostly', 'umpteen', 'ours', 'although', 'severaler', 'rath', 'wherewithal', 'otherwise', 'thous', 'again', 'aest', 'astridest', 'seeminger', 'indicate', 'how', 'wheresoever', 'whencesoever', 'wert', 'gotta', 'orest', 'whomever', 'everyone', 'appropriated', '

In [9]:
cv=CountVectorizer(max_df=0.85,stop_words=list(stopwords),max_features=10000)
word_count_vector=cv.fit_transform(docs)
word_count_vector.shape

(1263, 10000)

In [10]:
list(cv.vocabulary_.keys())[:10]


['arine',
 'thursday',
 'january',
 'ino',
 'dockage',
 'upper',
 'figures',
 'show',
 'frontage',
 'furnishes']

In [12]:
list(cv.get_feature_names_out())[2000:2015]


['content',
 'contention',
 'contents',
 'contest',
 'contin',
 'continent',
 'continental',
 'continual',
 'continually',
 'continuance',
 'continuation',
 'continue',
 'continued',
 'continues',
 'continuing']

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)



In [14]:
tfidf_transformer.idf_

array([1.67589185, 2.92710082, 3.57768838, ..., 3.77258872, 2.85376954,
       2.8190266 ])

In [None]:
'''import pandas as pd
import json

json_file = input("Enter the file path: ")
with open(json_file, "r") as file:
    json_data = json.load(file)

json_data = pd.DataFrame(json_data)

# Display schema and data overview
print("Schema:\n\n", json_data.dtypes)
print("Number of records, columns = ", json_data.shape)
print(json_data.head())'''

In [15]:
###############################
# ##############################
# ############################## read test docs into a dataframe and concatenate title and body
json_file = input("Enter the file path: ")
with open(json_file, "r") as file:
    json_data = json.load(file)

json_data = pd.DataFrame(json_data)
json_data['fulltext'] =json_data['fulltext'].apply(lambda x:pre_process(x))

# get test docs into a list
docs_test=json_data['fulltext'].tolist()

In [16]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [17]:
# you only needs to do this once
feature_names=cv.get_feature_names_out()

# get the document that we want to extract keywords from
#docs_test = json_data['fulltext'].tolist()
docs_date = json_data['issue_date'].tolist()  
docs_fulltext = json_data['fulltext'].tolist()
doc=docs_test[0]

#generate tf-idf for the given document
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))

#sort the tf-idf vectors by descending order of scores
sorted_items=sort_coo(tf_idf_vector.tocoo())

#extract only the top n; n here is 10
keywords=extract_topn_from_vector(feature_names,sorted_items,10)

# now print the results
print("\n=====Date=====")
print(docs_date[0])
print("\n=====Text=====")
print(docs_fulltext[0])
print("\n===Keywords===")
for k in keywords:
    print(k,keywords[k])


=====Date=====
1891-01-01

=====Text=====
arine review v u cleveland ohio thursday january or ino dockage at upper lake ports footed up the above figures show a frontage of feet the iron trade review furnishes figures showing the ore or between eight and nine miles with an average width for the and coal handling facilities of upper lake ports the dockage coal docks of feet and for the ore docks of feet the total of leading ore shipping ports is as as follows amount of space occupied by these docks is square feet ss the total storage capacity is tons and the total daily fs be ei ees handling capacity is tons ee ee eh ls bu similar statistics for lake erie ports collected last summer bs oe s so pac es e showed a total frontage of ore docks of feet with a total eo as s e s square feet area of a daily handling capacity of go é o a a an a i tons and a storage capacity of tons ss ery coal rns easy pe in connection with this summary drawings of the leading arg parp boer es cccc ore shipping 

In [18]:
# put the common code into several methods
def get_keywords(idx):

    #generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(cv.transform([docs_test[idx]]))

    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)
    
    return keywords

def print_results(idx,keywords):
    # now print the results
    print("\n=====Date=====")
    print(docs_date[idx])
    print("\n=====Text=====")
    print(docs_fulltext[idx])
    print("\n===Keywords===")
    for k in keywords:
        print(k,keywords[k])

In [20]:
idx=500
keywords=get_keywords(idx)
print_results(idx,keywords)


=====Date=====
1900-08-23

=====Text=====
 marine review vou xx cleveland o august no all dealers _ macnolia metal used by all the leading governments best anti friction metal for all machinery bearings beware of tmitations for sale by all dealers if local dé lers have not magnolia metal in stock telegraph us at our expense magnolia metal co west street new york owners and sole manufacturers fisher bidg dearborn st chicago also london moentreal pittsburg boston san francisco the wm gramp sons ae and engine building co philadelphia sole manufacturers in america of parsons manganese bronze ano parsons white brass h co boston mass is edward w hyde president h h mecarty treas john s hyde vice pres and gen supt bath iron works ltd ship builders and engineers bath maine bertram s oil po w l brown president r l ireland vice president r c wetmore sec y and treas jas c wallace gen l manager the american ship building co office viaduct cleveland ohio steel ships marine and stationary engines bo

In [25]:
tf_idf_vector=tfidf_transformer.transform(cv.transform(docs_test))

results=[]
for i in range(tf_idf_vector.shape[0]):
    
    # get vector for a single document
    curr_vector=tf_idf_vector[i]
    
    #sort the tf-idf vector by descending order of scores
    sorted_items=sort_coo(curr_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)
    
    
    results.append(keywords)

keywords_df=pd.DataFrame(zip(docs_date,results),columns=['docs_date','keywords'])
keywords_df

output_file = input("Enter the name of the CSV output file: ")
if not output_file.endswith('.csv'):
    output_file += '.csv'

# Convert the 'keywords' dictionary to a string to make it CSV-friendly
keywords_df['keywords'] = keywords_df['keywords'].apply(lambda x: ", ".join([f"{key}: {value:.4f}" for key, value in x.items()]))

# Save to CSV
keywords_df.to_csv(output_file, index=False)
print(f"Saved to {output_file}")

Saved to marine_review_tfidf_keywords.csv


In [24]:
df = pd.DataFrame(zip(issue_dates, results), columns=['issue_date', 'keywords'])

output_file = input("Enter the name of the CSV output file: ")
if not output_file.endswith('.csv'):
    output_file += '.csv'

# Convert the 'keywords' column (a dictionary) to a string to make it CSV-friendly
df['keywords'] = df['keywords'].apply(lambda x: ", ".join([f"{key}: {value:.4f}" for key, value in x.items()]))

# Save to CSV
df.to_csv(output_file, index=False)
print(f"Data saved to {output_file}")

NameError: name 'issue_dates' is not defined