In [2]:
import re

def clean_text(text):
    """
    Normalises the whitespace of the input text.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is collapsed
    into one single space, and whitespace at the very start and end is dropped.

    Parameters:
    - text (str): The input text to be cleaned.

    Returns:
    - str: The text on a single line, with words separated by exactly one space.
    """
    whitespace_run = re.compile(r'\s+')
    # Collapse first, trim second: after collapsing, at most one space can remain
    # at either end of the string.
    collapsed = whitespace_run.sub(' ', text)
    return collapsed.strip()

def split_into_segments(sentence, limit=200, backtrack_limit=100):
    """
    Splits a long text into consecutive smaller segments of roughly `limit` words each,
    ending segments on complete sentences where possible.

    The text is first normalised with clean_text, then repeatedly truncated with
    limit_to_approx_words until no text is left.

    Parameters:
    - sentence (str): The long text to split.
    - limit (int): The approximate limit for the number of words in each segment. Must be at least 1.
    - backtrack_limit (int): The maximum number of words to backtrack in an effort to end a segment
      with a complete sentence.

    Returns:
    - list: A list of text segments, in their original order.

    Raises:
    - ValueError: If limit is smaller than 1.
    """
    # With a limit below 1 the truncation step can yield an empty segment; the loop
    # below would then never consume any text and would run forever.
    if limit < 1:
        raise ValueError("limit must be at least 1, got {!r}".format(limit))

    segments = []
    # Clean the text to remove excessive whitespace and newline characters.
    remaining_sentence = clean_text(sentence)

    while remaining_sentence:
        # Generate a segment that respects the limit and attempts to end with a complete sentence.
        segment = limit_to_approx_words(remaining_sentence, limit, backtrack_limit)
        segments.append(segment)
        # The segment is a prefix of the cleaned text, so its length tells how much to cut off;
        # lstrip removes the single space that separated it from the next word.
        remaining_sentence = remaining_sentence[len(segment):].lstrip()
    return segments

def limit_to_approx_words(sentence, limit=200, backtrack_limit=100):
    """
    Truncates a sentence to a specified limit of words, attempting to end with a full stop, question mark,
    or exclamation point within a backtrack limit if possible.

    Parameters:
    - sentence (str): The sentence to truncate.
    - limit (int, optional): The maximum number of words desired in the truncated sentence. Defaults to 200.
    - backtrack_limit (int, optional): The maximum number of words to backtrack from the last allowed word
      in search of sentence-ending punctuation. Negative values are treated as 0. Defaults to 100.

    Returns:
    - str: The input unchanged when it has at most `limit` words. Otherwise the leading words joined by
      single spaces, cut after the closest sentence-ending word inside the backtrack window, or after
      exactly `limit` words when the window contains no such word.

    Raises:
    - ValueError: If limit is negative.
    """
    # A negative limit would make the slices below silently drop words from the end
    # of the text instead of capping the length.
    if limit < 0:
        raise ValueError("limit must be non-negative, got {!r}".format(limit))

    words = sentence.split()
    if len(words) <= limit:
        return sentence

    sentence_endings = (".", "!", "?")
    # Index of the last word that still fits into the limit.
    last_index = limit - 1
    # Earliest index that may be inspected: exactly `backtrack_limit` words before the
    # last allowed word, but never before the first word of the text.
    earliest_index = max(0, last_index - max(0, backtrack_limit))

    # Walk backwards so that the longest segment ending on punctuation wins.
    for i in range(last_index, earliest_index - 1, -1):
        if words[i].endswith(sentence_endings):
            # Return the text up to and including the punctuation.
            return " ".join(words[:i + 1])

    # If no suitable punctuation is found, return the text up to the word limit.
    return " ".join(words[:limit])

In [3]:
# Sample multi-paragraph text (about art and culture in Lucknow) used to exercise
# split_into_segments. It contains newlines and lines without closing punctuation.
long_sentence_example = """When Asaf-ud-Daula, the fourth Nawab of Awadh (state rulers of the area now known as Uttar Pradesh), shifted the state capital from Faizabad to Lucknow in 1775, it led to a cultural renaissance. An exodus of architects and craftsmen landed there from Delhi, and many other poets, artists and learned men from all around made Lucknow their home. It is said that artisans from as far as the Uzbekistan capital Tashkent, and masons from Isfahan in Iran, were brought to Lucknow by the Persian Nawabs.
One of the important crafts of Uttar Pradesh is Chikankari, which entails delicate and traditional hand embroidery. This form of handicrafts is mainly practiced in Lucknow. It is done on fabrics like chiffon, muslin, organza, organdie and silk. Chikan saris and Kurtas which are the perfect summer wear.
State Lalit Kala Akademi, located in Lal Baradari Bhawan, was founded in 1962 by the Uttar Pradesh Department of Culture. As per the official website of the Akademi, outstanding works of art have been collected from various exhibitions, shows and camps, since the existence of the Akademi. Specialised exhibitions are organised in the Modern Art Gallery of the Akademi throughout the year, eelecting from a vast reserve collection of almost 1800 works consisting of paintings, sculptures, drawings, and graphics. The Akademi conducts a variety of events with the goal of promoting and giving a platform to renowned as well as emerging artists to showcase their talent.
Lucknow is one of the few cities in the country where there is not one, but two Lalit Kala Akademis! Apart from the State Lalit Kala Akademi, the regional centre in Aliganj, Lucknow is also a buzzing centre for art and culture. The Regional Centers of the Akademi are located in Delhi, Chennai, Bhubaneswar and Kolkata apart from Lucknow for the development of art. Apart from hosting art and photography exhibitions from time to time, the Regional Lalit Kala Akademi in Lucknow also has a work place for the artists namely community workshops for all the disciplines of visual art.
The Picture Gallery, built in 1838 by Nawab Mohammed Ali Shah, exhibits full-length portraits of Awadh’s nawabs and pictures of the administrators and officials of the British era. These life-size portraits were painted by visiting European painters including Dawling, Gravet, and Harrison, as well as an Indian artist named DS Singh. As you go by them, you may notice that their eyes, heads, and accessories, such as shoes, appear to be pointed towards you, leaving you spellbound. As per the legend, these paintings are said to have been done on elephant skin, and the colour used to fill them were created with the help of diamonds and gems, and are painted in such a manner that the different parts seem to move with the angle of view
Kalasrot is a private art gallery committed to promoting contemporary art practices in all visual art disciplines. It has art works ranging from graphics to sculptures and paintings on display. While its online gallery provides a glimpse into their current collections, it also allows art enthusiasts everywhere easy access to Kalasrot’s displayed artworks.
Timings: 11 am to 9 pm
Location: A1/9B,Sector-B,Near Nehru bal vatika, Aliganj, Lucknow
The Saraca Art Gallery is known for hosting group art shows by contemporary artists such as Dheeraj Yadav and Bhupender Asthana, as well as solo art exhibitions curated by art connoisseur Vandana Sehgal. It is located at hotel Lebua, a heritage boutique hotel in a traditional 19th-century bungalow. Please check if an art show is on at the gallery before landing there.
Location: 19, Sarva Palli, Saraca Estate, The Mall Avenue, Lucknow
Timings: 11 am to 7 pm
Opened in 2014, Cosmos Art Gallery by Dr. Aron hosts contemporary art exhibitions, both solo, and group, ranging from a collection of sculptures to paintings and photography shows by up-and-coming artist. Apart from hosting artists to exhibit their works, they also conduct art promotion workshops and classes.
Location: 41, Prag Narayan Rd, Butler Colony, Lucknow
Timings: 11 am to 7 pm, Thursday Closed
A melting pot of art, culture, cuisine and architecture, the bustling city of Lucknow, the capital of Uttar Pradesh, preserves its essence of grandeur in its kaleidoscopic experiences. From a vibrant culinary scene and exquisite historical monuments to its rich art and culture and vestiges of colonial charm, the city of nawabs, as it is popularly called, is as welcoming as is the warmth of its people.
Chikankari
A delicate hand embroidery technique done on fabrics like chiffon, muslin, organza, organdie and silk, chikankari is one of the most important crafts in Lucknow. The word 'chikan' means embroidery, and this form of art incorporates 36 different stitching techniques. In the beginning, only white yarn or muslin cloth was used. Stitching is done on the back of the cloth whereas the design is made on the front by tiny, running stitches. There are three main types of stitches – flat, embossed, and jaali (net effect). Due to a Persian influence, flowers have a permanent place in this art form. Typical chikankari motifs include creepers and vines. However, the types and styles in which these flowers are made keep varying with fashion trends. Today, there are hundreds of retailers of chikankari fabric across the country. In Lucknow, the lanes of markets are dotted with shops selling chikan work in various forms. You can pick up shirts, kurtas, bedsheets, table cloth, pillow covers and many other items adorned in chikankari.
"""


# Split the sample using the default limits of split_into_segments
# (limit=200 words, backtrack_limit=100 words).
split_sentences = split_into_segments(long_sentence_example)


In [4]:
# Show every segment followed by a block of blank lines so that the segment
# boundaries are easy to spot in the output.
for short_sentence in split_sentences:
    # One call instead of two: the "\n" separator and the default line end
    # produce the same characters as printing both pieces separately.
    print(short_sentence, "\n\n\n", sep="\n")

When Asaf-ud-Daula, the fourth Nawab of Awadh (state rulers of the area now known as Uttar Pradesh), shifted the state capital from Faizabad to Lucknow in 1775, it led to a cultural renaissance. An exodus of architects and craftsmen landed there from Delhi, and many other poets, artists and learned men from all around made Lucknow their home. It is said that artisans from as far as the Uzbekistan capital Tashkent, and masons from Isfahan in Iran, were brought to Lucknow by the Persian Nawabs. One of the important crafts of Uttar Pradesh is Chikankari, which entails delicate and traditional hand embroidery. This form of handicrafts is mainly practiced in Lucknow. It is done on fabrics like chiffon, muslin, organza, organdie and silk. Chikan saris and Kurtas which are the perfect summer wear. State Lalit Kala Akademi, located in Lal Baradari Bhawan, was founded in 1962 by the Uttar Pradesh Department of Culture. As per the official website of the Akademi, outstanding works of art have be