In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load the dataset and select relevant columns
df = pd.read_json('News_Category_Dataset_v3.json', lines=True)
selected_columns = ['headline', 'category', 'short_description']
df = df[selected_columns]

# Step 2: Feature Extraction and Encoding
# Convert text data to numerical representation using TF-IDF vectorization
vectorizer = TfidfVectorizer()
text_features = vectorizer.fit_transform(df['headline'] + ' ' + df['short_description'])

# Encode categorical variable 'category' using one-hot encoding
encoded_category = pd.get_dummies(df['category'], prefix='category')

# Combine the encoded category with the text features
encoded_features = pd.concat([encoded_category.reset_index(drop=True), pd.DataFrame(text_features.toarray())],
                             axis=1)

# Step 3: Define the input data
input_data = {
    'headline': 'Your input headline',
    'short_description': 'Your input description',
    'category': 'Your input category'
}

# Convert the input data to numerical representation
input_vector = vectorizer.transform([input_data['headline'] + ' ' + input_data['short_description']])
input_encoded_category = pd.get_dummies([input_data['category']], prefix='category')
input_features = pd.concat([input_encoded_category, pd.DataFrame(input_vector.toarray())], axis=1)

# Step 4: Compute Similarity Scores
similarity_scores = cosine_similarity(input_features, encoded_features)

# Step 5: Find Most Similar Data
top_k = 5  # Set the number of most similar data points to retrieve
most_similar_indices = similarity_scores.argsort()[0][-top_k:][::-1]
most_similar_data = df.iloc[most_similar_indices]

print("Most Similar Data:")
print(most_similar_data)


MemoryError: Unable to allocate 138. GiB for an array with shape (209527, 88507) and data type float64