# In [None]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split



# Read the news articles from disk into a DataFrame
news_df = pd.read_csv("news.csv")

# Missing article bodies become empty strings instead of NaN
news_df = news_df.assign(content=news_df["content"].fillna(""))

# Define a function to preprocess the text
# Loaded spaCy pipelines keyed by model name, so each model is loaded only once
# per process instead of once per call.
_SPACY_PIPELINES = {}


def _get_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for *model_name*, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def preprocess_text(text):
    """
    Lemmatize the text, drop stop words and punctuation, and lowercase the result.

    Args:
        text: The raw text to clean.

    Returns:
        The lemmas of the remaining tokens joined by single spaces, in
        lowercase. An empty input yields an empty string.
    """
    # Nothing to tokenize; this also avoids loading the model for blank rows.
    if text == "":
        return ""

    doc = _get_pipeline()(text)
    kept_lemmas = (
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct)
    )
    return " ".join(kept_lemmas).lower()

# Clean every article body in place by running it through preprocess_text.
# NOTE(review): preprocess_text filters out punctuation tokens, while
# rank_sentences later splits this column on "." -- confirm that sentence
# boundaries are still present in the cleaned text.
news_df["content"] = news_df["content"].map(preprocess_text)

# Define a function to rank the sentences in a document based on their importance
def rank_sentences(text, top_n=5):
    """
    Extract the top n most important sentences from a document based on their TF-IDF scores.

    The text is split into sentences on ".", a single TF-IDF model is fitted
    over all of the sentences (so every sentence shares one vocabulary), and
    sentences are ranked by their mean TF-IDF weight.

    Args:
        text: The document to summarise.
        top_n: Maximum number of sentences to select. Fewer are returned when
            the document has fewer sentences.

    Returns:
        A tuple ``(new_content, removed_lines, summary)`` where

        * ``new_content`` is the selected sentences, best first, joined
          with ". ";
        * ``removed_lines`` is the sentences that were not selected, in
          document order, one per line;
        * ``summary`` maps each selected sentence to a normalised score: the
          sum of its cosine similarities to the selected sentences, scaled so
          the scores add up to 1. Identical sentences share one key.

        When nothing can be ranked (no sentences, ``top_n <= 0``, or no
        usable words), ``new_content`` is empty, ``summary`` is ``{}`` and
        every sentence is reported in ``removed_lines``.
    """
    # Split once and keep only stripped, non-empty sentences so that every
    # index computed below refers to this same list.
    sentences = [part.strip() for part in text.split(".")]
    sentences = [sentence for sentence in sentences if sentence]

    if not sentences or top_n <= 0:
        return "", "\n".join(sentences), {}

    # Fit one model over all sentences: fitting per sentence would give each
    # sentence its own vocabulary and make the vectors incomparable.
    try:
        tfidf = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # Raised when the vocabulary is empty, i.e. no sentence contains a
        # token the vectorizer accepts. Nothing can be ranked in that case.
        return "", "\n".join(sentences), {}

    # Mean TF-IDF weight per sentence, flattened to a 1-D array.
    avg_scores = np.asarray(tfidf.mean(axis=1)).ravel()

    # Highest score first; the stable sort keeps document order among ties.
    ranked = np.argsort(-avg_scores, kind="stable")[:top_n]
    top_idx = [int(i) for i in ranked]
    top_sentences = [sentences[i] for i in top_idx]

    # Everything that was not selected, in original order.
    selected = set(top_idx)
    removed_lines = "\n".join(
        sentence for i, sentence in enumerate(sentences) if i not in selected
    )

    # Sentences were stripped above, so restore the space after each period.
    new_content = ". ".join(top_sentences)

    # Score each selected sentence by its total similarity to the selection.
    sim_matrix = cosine_similarity(tfidf[top_idx])
    scores = sim_matrix.sum(axis=1)

    # Normalise so the scores sum to 1; skip if every vector is all zeros.
    total = scores.sum()
    if total > 0:
        scores = scores / total

    summary = {
        sentence: float(score)
        for sentence, score in zip(top_sentences, scores)
    }

    return new_content, removed_lines, summary

# Split the dataset into train and test sets
train_set, test_set = train_test_split(news_df, test_size=0.1, random_state=42)

# Rank the sentences of every test article. Rows are collected in a list and
# the DataFrame is built once at the end: DataFrame.append was removed in
# pandas 2.0, and appending inside a loop copies the whole frame each time.
result_columns = ["Original Content", "New Content", "Removed Lines", "Further Metrics"]
result_rows = []
for original_content in test_set["content"]:
    new_content, removed_lines, summary = rank_sentences(original_content)
    result_rows.append(
        {
            "Original Content": original_content,
            "New Content": new_content,
            "Removed Lines": removed_lines,
            "Further Metrics": summary,
        }
    )

# Passing columns explicitly keeps the header even when the test set is empty.
results = pd.DataFrame(result_rows, columns=result_columns)

# Save the results to a CSV file
results.to_csv("test_results.csv", index=False)

