# In [None]:
import numpy as np
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import sent_tokenize

# Download the NLTK sentence-tokenizer data needed by sent_tokenize.
# Older NLTK releases load the 'punkt' resource, while recent releases look
# for 'punkt_tab' instead, so request both; an unknown resource name is simply
# reported as a failed download by nltk.download rather than raising.
# quiet=True keeps the downloader's progress log out of the output.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# Sample long text document used as the summarizer's input: an introductory
# passage about artificial intelligence, one paragraph per line. The newlines
# right after the opening and before the closing triple quotes are part of
# the string value.
document = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.
Colloquially, the term "artificial intelligence" is often used to describe machines (or computers) that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem-solving". As machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition of AI, a phenomenon known as the AI effect.
For instance, optical character recognition is frequently excluded from things considered to be AI, having become a routine technology. Modern machine capabilities generally classified as AI include successfully understanding human speech, competing at the highest level in strategic game systems (such as chess and Go), autonomously operating cars, and intelligent routing in content delivery networks and military simulations.
AI is a multidisciplinary field with multiple approaches, but advancements in machine learning and deep learning are creating a paradigm shift in virtually every sector of the tech industry.
"""

# Preprocessing: clean and tokenize the sentences
def preprocess_text(text):
    """Normalize whitespace in *text* and split it into sentences.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space and the result is stripped, so no leading or trailing blank is
    handed to the tokenizer.

    Args:
        text: Raw document text.

    Returns:
        A list of sentence strings as produced by NLTK's ``sent_tokenize``.
        Empty or whitespace-only input yields an empty list without calling
        the tokenizer.
    """
    normalized = re.sub(r'\s+', ' ', text).strip()
    if not normalized:
        # Nothing to tokenize; avoid loading tokenizer data for blank input.
        return []
    return sent_tokenize(normalized)

# Extract sentences from the document
sentences = preprocess_text(document)

# Vectorization using TF-IDF: one row per sentence, English stop words removed
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(sentences)

# Apply K-means clustering to group sentences into clusters.
# Heuristic: one cluster per two sentences (tune as needed). Floor division
# keeps the count an int, and max(1, ...) prevents n_clusters == 0 for a
# single-sentence document, which KMeans rejects.
n_clusters = max(1, len(sentences) // 2)
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X)

# Rank sentences based on their proximity to the cluster centroid
def rank_sentences(sentences, model, X):
    """Pick one representative sentence per cluster, closest match first.

    For every centroid in ``model.cluster_centers_`` the row of ``X`` with the
    smallest Euclidean distance to that centroid is selected. The selected
    sentences are returned ordered by that distance, ascending.

    Args:
        sentences: Sequence of sentences; ``sentences[i]`` corresponds to
            row ``i`` of ``X``.
        model: Fitted clustering model exposing ``cluster_centers_``.
        X: Feature matrix with one row per sentence. Either a sparse matrix
            providing ``toarray()`` or anything ``np.asarray`` can convert.

    Returns:
        A list of sentences. A sentence that is the nearest row to several
        centroids appears only once, ranked by its smallest distance. An
        empty ``X`` yields an empty list.
    """
    # Densify once up front; the conversion does not depend on the centroid.
    features = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
    if features.shape[0] == 0:
        return []

    # Map sentence index -> best (smallest) distance to any centroid, so the
    # same sentence is never emitted twice.
    best_distance = {}
    for center in model.cluster_centers_:
        distances = np.linalg.norm(features - center, axis=1)
        nearest = int(np.argmin(distances))
        distance = float(distances[nearest])
        if nearest not in best_distance or distance < best_distance[nearest]:
            best_distance[nearest] = distance

    ranked = sorted(best_distance.items(), key=lambda item: item[1])  # Sort by distance
    return [sentences[idx] for idx, _ in ranked]

# Build the summary: take the representative sentence of each cluster
# (closest to its centroid first) and join them with single spaces.
summary_sentences = rank_sentences(sentences, kmeans, X)
summary = ' '.join(summary_sentences)

# Print the source text followed by its summary, each under its own heading.
for heading, body in (
    ("Original Document:\n", document),
    ("\nSummarized Text:\n", summary),
):
    print(heading, body)