In [1]:
from __future__ import print_function
from __future__ import unicode_literals
import collections
import copy
import io
import nltk
import re
from nltk.tokenize import sent_tokenize
# Download the 'punkt' tokenizer data via NLTK (presumably what sent_tokenize relies on — confirm).
nltk.download('punkt')
# Shared module-level state used by the functions defined in later cells.
# stopwords: filled by readStopWords(), read by removeStopWords().
stopwords = set()
# sentences: rebound by tokenize_text() to the output of sent_tokenize.
sentences = []
# sentences_processing: rebound by tokenize_text() to a working copy of `sentences`.
sentences_processing = []
# sentence_dictionary: tokenize_text() stores a list of stemmed tokens per sentence index.
# NOTE(review): the default factory is `dict`, but the values stored are lists.
sentence_dictionary = collections.defaultdict(dict)
# stemWords: filled by readStemWords() as word -> {"stem": ..., "related": [...]}.
stemWords = {}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hardik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [33]:
def readStemWords():
    '''
    Populates the global ``stemWords`` dict from "word_list_marathi.txt" (UTF-8).

    Each non-blank line is sliced positionally (format inferred from the
    slicing below - TODO confirm against the data file):
      * word    - characters from index 2 up to the first ">"
      * base    - text after the word up to the first "]" (first character
                  skipped); the part before its first "-", if any, is the stem
      * valid   - text between "(" and ")"; a value of "0" skips the line
      * related - comma-separated entries between "{" and "}", each cut at
                  its first "|"

    When the base yields no stem, the first related entry is used instead.
    Lines that end up without a stem are not stored.
    '''
    global stemWords
    with io.open("word_list_marathi.txt", encoding='utf-8') as textFile:
        for line in textFile:
            line = line.strip()
            if not line:
                continue

            wordEndIndex = line.find(">")
            word = line[2:wordEndIndex]
            line = line[wordEndIndex + 1:]

            baseEndIndex = line.find("]")
            base = line[1:baseEndIndex].strip()
            line = line[baseEndIndex + 1:]

            # A stem is only taken when "-" occurs after at least one character.
            stem = None
            stemEndIndex = base.find('-')
            if stemEndIndex > 0:
                stem = base[:stemEndIndex]

            valid = line[line.find("(") + 1: line.find(")")].strip()
            if valid == "0":
                continue

            line = line[line.find("{") + 1: line.find("}")].strip()
            related = []
            if line:
                for entry in line.split(","):
                    # partition keeps the whole entry when "|" is absent;
                    # slicing with find() == -1 would drop its last character.
                    related.append(entry.partition("|")[0])

            if stem is None and related:
                stem = related[0]
            if stem is not None:
                stemWords[word] = {"stem": stem, "related": related}


In [38]:
def readStopWords():
    '''
    Reads the stopwords from "stopwords.txt" (UTF-8) into the global ``stopwords`` set

    Every line is lower-cased and stripped of surrounding whitespace before
    being added. Blank lines are skipped so the empty string never becomes a
    stopword. The ``with`` block closes the file, so no explicit close is needed.
    '''
    with io.open("stopwords.txt", encoding='utf-8') as textFile:
        for line in textFile:
            word = line.lower().strip()
            if word:
                stopwords.add(word)


def removeStopWords(wordlist):
    '''
    Filters stopwords out of a list of tokens
    :param wordlist: tokens to filter
    :return: new list holding the tokens that are not in the global ``stopwords`` set, in their original order
    '''
    return [token for token in wordlist if token not in stopwords]

In [42]:


def tokenize_text(text):
    '''
    Splits text into sentences and stores the stemmed tokens of each sentence
    :param text: text to be tokenized

    Rebinds the globals ``sentences`` (output of sent_tokenize) and
    ``sentences_processing`` (a working copy of it), and refills
    ``sentence_dictionary`` with {sentence index: list of stemmed tokens}.
    '''
    global sentences, sentences_processing, sentence_dictionary
    import unicodedata  # local import: only used for the punctuation check below

    sentences = sent_tokenize(text)
    # Strings are immutable, so a shallow copy is already independent.
    sentences_processing = list(sentences)
    # Drop entries left over from a previous call on a longer text.
    sentence_dictionary.clear()

    for counter, sentence in enumerate(sentences_processing):
        # Remove the final character only when it is punctuation; a sentence
        # without a terminator must not lose its last letter.
        if sentence and unicodedata.category(sentence[-1]).startswith('P'):
            sentence = sentence[:-1]
        # Replace commas, periods, hyphens and parentheses with spaces.
        sentence = re.sub(r'[,.\-()]', ' ', sentence)
        tokens = sentence.split()
        actualTokens = removeStopWords(tokens)
        sentence_dictionary[counter] = stemmerMarathi(actualTokens)

# Example usage: run tokenize_text on a two-sentence Marathi sample and print
# the resulting sentence_dictionary.
# NOTE(review): tokenize_text calls stemmerMarathi, which is defined in a cell
# further down, so that cell has to be executed before this one.
text = "वर्तमान गटामध्ये आपलं स्वप्न पूर्ण होणार आहे. आपलं असा परिपूर्ण समुदाय तयार करण्यासाठी, आपण सर्वांचं सहयोग कडेच कायम करावं लागेल."
tokenize_text(text)
print(sentence_dictionary)

defaultdict(<class 'dict'>, {0: ['वर्तमान', 'गटामध्', 'आपलं', 'स्वप्न', 'पूर्ण', 'होणार', 'आहे'], 1: ['आपलं', 'असा', 'परिपूर्ण', 'समुदाय', 'तयार', 'करण्यासाठी', 'आपण', 'सर्वांचं', 'सहयोग', 'कडेच', 'काय', 'करावं', 'लागे']})


In [None]:
वर्तमान गटामध्ये आपलं स्वप्न पूर्ण आपलं असा परिपूर्ण समुदाय तयार करण्यासाठी आपण सर्वांचं सहयोग कडेच कायम करावं लागेल

In [6]:
# Suffixes stripped by removeCase, grouped by the value that len(word) - 1 must
# exceed before the group is tried. Groups and suffixes are tested in the order
# written and the first match wins.
_CASE_SUFFIX_TIERS = (
    (5, ("शया",)),
    (4, ("शे", "शी", "चा", "ची", "चे", "हून")),
    (3, ("नो", "तो", "ने", "नी", "ही", "ते", "या", "ला", "ना", "ऊण")),
    # NOTE(review): three literals below start with a space before the vowel
    # sign, so they cannot match a token produced by str.split(). Kept as-is
    # to preserve current output - confirm whether the bare vowel sign was meant.
    (2, (" े", " ी", "स", "ल", " ा", "त", "म")),
)


def removeCase(word):
    '''
    Strips at most one suffix from the word, chosen from a length-tiered list
    :param word: word to be reduced to its stem
    :return: the word without the first matching suffix, or the word unchanged
             when nothing matches or the word is too short for any tier
    '''
    word_length = len(word) - 1
    for threshold, suffixes in _CASE_SUFFIX_TIERS:
        if word_length > threshold:
            for suffix in suffixes:
                if word.endswith(suffix):
                    return word[:-len(suffix)]
    # Reached for short words as well, so the function never returns None.
    return word



असा


In [7]:
# Suffixes stripped by removeNoGender, grouped by the value that len(word) - 1
# must exceed before the group is tried. Groups and suffixes are tested in the
# order written and the first match wins.
# NOTE(review): several literals contain a space next to the vowel sign, so
# they cannot match a token produced by str.split(). Kept as-is to preserve
# current output - confirm whether the bare vowel sign was meant.
_GENDER_SUFFIX_TIERS = (
    (5, (" ुरडा",)),
    (4, ("ढा",)),
    (3, ("रु", "डे", "ती", " ान", " ीण", "डा", "डी", "गा", "ला", "ळा", "या", "वा", "ये", "वे")),
    (2, ("अ", " े", "ि ", " ु", " ौ", " ै", " ा", " ी", " ू", "त")),
)


def removeNoGender(word):
    '''
    Looks the word up in the global ``stemWords``; otherwise strips at most one suffix
    :param word: word to be reduced to its stem
    :return: the stored stem when the word is a key of ``stemWords``; else the
             word without the first matching suffix, or the word unchanged
    '''
    if word in stemWords:
        return stemWords[word]["stem"]

    word_length = len(word) - 1
    for threshold, suffixes in _GENDER_SUFFIX_TIERS:
        if word_length > threshold:
            for suffix in suffixes:
                if word.endswith(suffix):
                    return word[:-len(suffix)]
    return word

In [17]:
def stemmerMarathi(words):
    '''
    Stems every token of a sequence
    :param words: iterable of tokens
    :return: list with removeCase followed by removeNoGender applied to each token, in order
    '''
    stemmed = []
    for token in words:
        stemmed.append(removeNoGender(removeCase(token)))
    return stemmed

In [34]:
# NOTE(review): no active cell shown here assigns `tokens` at module level (it
# is a local inside tokenize_text and appears in a commented-out cell), so this
# line presumably relies on leftover kernel state - confirm before running on a
# fresh kernel.
w=stemmerMarathi(tokens)

In [35]:
w

['वर्तमान',
 'गटामध्',
 'आपलं',
 'स्वप्न',
 'पूर्ण',
 'आपलं',
 'असा',
 'परिपूर्ण',
 'समुदाय',
 'तयार',
 'करण्यासाठी',
 'आपण',
 'सर्वांचं',
 'सहयोग',
 'कडेच',
 'काय',
 'करावं',
 'लागे']

In [8]:
import io
import copy
import re
from nltk.tokenize import sent_tokenize

# Re-initialise the module-level containers that tokenize_text writes to.
# NOTE(review): this repeats the setup done in the first cell, and
# sentence_dictionary becomes a plain dict here instead of
# collections.defaultdict(dict) - confirm which one is intended.
sentences = []
sentences_processing = []
sentence_dictionary = {}

In [16]:
# def tokenize_sentence(sentence):
#     # Initialize an empty list to store tokens
#     tokens = []
#     # Initialize a variable to keep track of the start index of the current token
#     start = 0
    
#     # Iterate through each character in the sentence
#     for i, char in enumerate(sentence):
#         # Check if the current character is a space or punctuation
#         if char.isspace() or char in [',', '.', '!', '?', ';', ':']:
#             # If the current token is not empty, add it to the list of tokens
#             if i > start:
#                 tokens.append(sentence[start:i])
#             # Update the start index for the next token
#             start = i + 1
    
#     # Handle the last token (if any)
#     if start < len(sentence):
#         tokens.append(sentence[start:])
    
#     return tokens

# # Example usage
# sentence = "वर्तमान गटामध्ये आपलं स्वप्न पूर्ण आपलं असा परिपूर्ण समुदाय तयार करण्यासाठी आपण सर्वांचं सहयोग कडेच कायम करावं लागेल"
# tokens = tokenize_sentence(sentence)
# print(tokens)

['वर्तमान', 'गटामध्ये', 'आपलं', 'स्वप्न', 'पूर्ण', 'आपलं', 'असा', 'परिपूर्ण', 'समुदाय', 'तयार', 'करण्यासाठी', 'आपण', 'सर्वांचं', 'सहयोग', 'कडेच', 'कायम', 'करावं', 'लागेल']
