In [2]:
import pandas as pd

# Read the labelled news articles from disk into a DataFrame
news_csv_path = r"C:\Users\jappaka\Desktop\python\news-train-1.csv"
data = pd.read_csv(news_csv_path)

# Preview the first rows to check the columns that were loaded
data.head()

Unnamed: 0,ArticleId,Text,Category
0,893,rangers seal old firm win goals from gregory v...,sport
1,1164,bt program to beat dialler scams bt is introdu...,tech
2,1696,new yob targets to be unveiled fifty new are...,politics
3,396,holmes is hit by hamstring injury kelly holmes...,sport
4,1862,capriati out of australian open jennifer capri...,sport


In [3]:
# Imports and NLTK resources required by get_tokens
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import *



# Shared Porter stemmer instance (from nltk) used when tokenizing documents
stemmer = PorterStemmer()
# Translation table for str.translate: every punctuation code point maps to None,
# which deletes that character from the string
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}

def get_tokens(text):
    """Turn one raw document into its list of stemmed unigrams.

    Pipeline: lowercase -> delete punctuation (via ``remove_punctuation_map``)
    -> tokenize with ``nltk.word_tokenize`` -> drop English stop words
    -> stem each remaining token with ``stemmer``.

    Args:
        text: The document as a ``str``.

    Returns:
        A list of stemmed tokens in document order; repeated words are kept.
    """
    # Materialize the stop-word list once, as a set, so the per-token membership
    # check is O(1) instead of calling stopwords.words() again for every token.
    english_stopwords = set(stopwords.words('english'))

    # turn document into lowercase
    lowers = text.lower()
    # remove punctuations
    no_punctuation = lowers.translate(remove_punctuation_map)
    # tokenize document
    tokens = nltk.word_tokenize(no_punctuation)

    # NOTE(review): punctuation is removed before the stop-word check, so any
    # stop-word entries that contain an apostrophe can no longer match -
    # confirm whether that is intended.
    # remove stop words, then stem what is left (final unigrams)
    return [stemmer.stem(token) for token in tokens if token not in english_stopwords]

# Tokenize every article: one list of stemmed unigrams per row of 'Text'
unigrams_per_article = data["Text"].apply(get_tokens)
data["Unigrams"] = unigrams_per_article

# Preview the frame with the new "Unigrams" column
data.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jappaka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jappaka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,ArticleId,Text,Category,Unigrams
0,893,rangers seal old firm win goals from gregory v...,sport,"[ranger, seal, old, firm, win, goal, gregori, ..."
1,1164,bt program to beat dialler scams bt is introdu...,tech,"[bt, program, beat, dialler, scam, bt, introdu..."
2,1696,new yob targets to be unveiled fifty new are...,politics,"[new, yob, target, unveil, fifti, new, area, g..."
3,396,holmes is hit by hamstring injury kelly holmes...,sport,"[holm, hit, hamstr, injuri, kelli, holm, forc,..."
4,1862,capriati out of australian open jennifer capri...,sport,"[capriati, australian, open, jennif, capriati,..."


In [4]:
# Persist the DataFrame (including the "Unigrams" column) to "data_pickle.csv" in the
# current working directory. Despite the file name, the format written is CSV, not pickle.
# NOTE(review): no index=False is passed, so the row index is presumably written as an
# extra unnamed column (the input file already shows an "Unnamed: 0" column) - confirm
# this is wanted. The list-valued "Unigrams" column is presumably stored as text; verify
# before reading this file back.
data.to_csv(r"data_pickle.csv")

In [5]:
# Read the allowed vocabulary: each line of dictionary.txt becomes one set entry
with open(r"C:\Users\jappaka\Desktop\python\dictionary.txt", "r") as file:
    dictionary_words = set(file.read().splitlines())


def keep_dictionary_words(unigrams):
    """Return the tokens of `unigrams` that are in `dictionary_words`, in their original order."""
    return [word for word in unigrams if word in dictionary_words]


# Keep only the unigrams that appear in the dictionary
data["Filtered_Unigrams"] = data["Unigrams"].apply(keep_dictionary_words)

# Preview the frame with the new "Filtered_Unigrams" column
data.head()

Unnamed: 0,ArticleId,Text,Category,Unigrams,Filtered_Unigrams
0,893,rangers seal old firm win goals from gregory v...,sport,"[ranger, seal, old, firm, win, goal, gregori, ...","[old, firm, win, goal, gave, victori, park, mo..."
1,1164,bt program to beat dialler scams bt is introdu...,tech,"[bt, program, beat, dialler, scam, bt, introdu...","[bt, program, beat, bt, introduc, two, initi, ..."
2,1696,new yob targets to be unveiled fifty new are...,politics,"[new, yob, target, unveil, fifti, new, area, g...","[new, target, unveil, new, area, get, special,..."
3,396,holmes is hit by hamstring injury kelly holmes...,sport,"[holm, hit, hamstr, injuri, kelli, holm, forc,...","[hit, injuri, forc, weekend, european, athlet,..."
4,1862,capriati out of australian open jennifer capri...,sport,"[capriati, australian, open, jennif, capriati,...","[australian, open, becom, third, lead, austral..."


In [6]:
# Load the dictionary content from the 'dictionary.txt' file
with open('C:\\Users\\jappaka\\Desktop\\python\\dictionary.txt', 'r') as file:
    dictionary_content = file.readlines()

import numpy as np
from collections import Counter
from math import log

# Builds the TF-IDF matrix of the corpus:
#   rows    = documents, in the row order of `data`
#   columns = dictionary words, in the line order of dictionary.txt
#   TF      = count of the word in the document / count of the document's most frequent word
#   IDF     = natural log of (number of documents / number of documents containing the word)
# Requires `data` to already carry the 'Filtered_Unigrams' column (a list of tokens per row).

# Number of documents
n = len(data)

# List of words from dictionary.txt (surrounding whitespace and newlines stripped)
dictionary_list = [word.strip() for word in dictionary_content]

# Document frequency in a single pass over the corpus: set() makes each word
# count at most once per document, however often it repeats inside that document.
document_frequency = Counter()
for unigrams in data["Filtered_Unigrams"]:
    document_frequency.update(set(unigrams))

# Pre-computing the IDF for each word across all documents
idf_values = {}
for word in dictionary_list:
    # Count of documents where the word appears (a Counter returns 0 for unseen words)
    mj = document_frequency[word]
    # A dictionary entry that occurs in no document (e.g. a blank line in the file)
    # would divide by zero; its TF is 0 everywhere, so an IDF of 0.0 leaves the
    # matrix unchanged while keeping the computation well-defined.
    idf_values[word] = log(n / mj) if mj else 0.0

# Computing the TFIDF matrix iteratively
tfidf_matrix_optimized = np.zeros((n, len(dictionary_list)))

for i, unigrams in enumerate(data["Filtered_Unigrams"]):
    word_counts = Counter(unigrams)
    if not word_counts:
        # Document without any dictionary word: its row stays all zeros
        continue
    # Normaliser for TF; identical for every word of this document, so computed once
    max_count = max(word_counts.values())
    for j, word in enumerate(dictionary_list):
        count = word_counts.get(word, 0)
        if count:
            # TFIDF calculation: (TF) * (IDF); cells for absent words keep their initial 0
            tfidf_matrix_optimized[i, j] = (count / max_count) * idf_values[word]

# Print the TFIDF matrix in Jupyter notebook
print(tfidf_matrix_optimized)


[[0.         0.         0.36427118 ... 0.         0.         0.        ]
 [0.36385541 0.32813062 0.         ... 0.         0.         0.        ]
 [0.24257027 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [7]:
# For every category (in alphabetical order), find its 3 most frequent filtered unigrams
top_words_frequency = {}

for category in sorted(data["Category"].unique()):
    # Rows that belong to the current category
    in_category = data["Category"] == category

    # Tally every word occurrence across the category's documents, in document order
    word_frequencies = Counter()
    for unigrams in data[in_category]["Filtered_Unigrams"]:
        word_frequencies.update(unigrams)

    # Store the three highest counts as {word: count}
    top_words_frequency[category] = dict(word_frequencies.most_common(3))

top_words_frequency


{'business': {'said': 773, 'us': 393, 'year': 388},
 'entertainment': {'film': 519, 'said': 423, 'year': 268},
 'politics': {'said': 1059, 'mr': 782, 'would': 531},
 'sport': {'said': 452, 'game': 368, 'win': 303},
 'tech': {'said': 767, 'use': 476, 'peopl': 431}}

In [8]:

# Top 3 words per category by average TF-IDF.
# Row r of `tfidf_matrix_optimized` is paired with the r-th row of `data` by POSITION.
# Index labels (as yielded by data.iterrows()) equal positions only for a default
# 0..n-1 index, so positions are used explicitly here.
assert len(data) == tfidf_matrix_optimized.shape[0], \
    "tfidf_matrix_optimized must have exactly one row per row of data"

# Initialize a dictionary to store the sum of TFIDF values for each word by category
tfidf_sums_by_category = {category: {word: 0 for word in dictionary_list} for category in data["Category"].unique()}

# Sum up the TFIDF values for each word by category
for row_position, category in enumerate(data["Category"]):
    category_sums = tfidf_sums_by_category[category]
    # Columns of the matrix follow the order of dictionary_list
    for word, tfidf_value in zip(dictionary_list, tfidf_matrix_optimized[row_position]):
        category_sums[word] += tfidf_value

# Compute the average TFIDF for each word by category
average_tfidf_by_category = {}
for category, word_sums in tfidf_sums_by_category.items():
    num_docs_in_category = len(data[data["Category"] == category])
    average_tfidf_by_category[category] = {word: sum_val / num_docs_in_category for word, sum_val in word_sums.items()}

# Extract the top 3 words with the highest average TFIDF for each category
top_3_avg_tfidf_by_category = {}
for category, word_averages in average_tfidf_by_category.items():
    # sorted() is stable, so words with equal averages keep their dictionary order
    top_3_avg_tfidf_by_category[category] = dict(sorted(word_averages.items(), key=lambda item: item[1], reverse=True)[:3])

# Sort the dictionary based on its keys
sorted_top_3_avg_tfidf_by_category = dict(sorted(top_3_avg_tfidf_by_category.items()))

sorted_top_3_avg_tfidf_by_category



{'business': {'firm': 0.3008261955203651,
  'bank': 0.26751818490824486,
  'compani': 0.26191001038677303},
 'entertainment': {'film': 0.7194241701450167,
  'star': 0.39396079323022454,
  'award': 0.39317143948747996},
 'politics': {'labour': 0.4567223824204719,
  'elect': 0.43448728678902737,
  'mr': 0.42492065049653804},
 'sport': {'game': 0.3541562034993347,
  'england': 0.3133166953690412,
  'win': 0.30306092923403677},
 'tech': {'mobil': 0.3494922062358214,
  'phone': 0.32999603143748346,
  'softwar': 0.3190357174296957}}