Given a set of Twitter hashtags, split each hashtag into its constituent words. For example:

wearethepeople is split into we are the people
mentionyourfaves is split into mention your faves
Input Format

The first line contains an integer, N, denoting the number of hashtags.
Each of the N subsequent lines contains a single hashtag.

Dictionaries and Corpuses of Text

We don't strictly prescribe a particular dictionary or corpus or a set of features. To get started, you may find it useful to embed this list of 5000 common words as a dictionary in your program. For more effective segmentation models, you are encouraged to use your own word list, or corpus, or features extracted from a corpus, as required by whatever model you choose. Project Gutenberg is a good starting point, but keep in mind that language and its usage has evolved and transformed over time.

You may use serialization to build and compress your model offline and to decompress and use it from your program. If you end up with a corpus or model that is too large, you may compress and serialize it, then deserialize it from within your code using zlib (that is in Python) or another tool. This means that your code will contain a compressed string representing the dictionary which will then be de-compressed and used. You can take a look at this code submitted during CodeSprint5 here. For Java users, you might want to look up java.util.zip.GZIPInputStream for this purpose.

Constraints

5 ≤ N ≤ 50
The hashtags will not contain named entities, other than the names of countries and their abbreviations (e.g.: US, UK, UAE, etc.).
The hashtags may occasionally contain slang phrases, such as "faves" (a slang abbreviation for "favorites").

In [1]:
# #Corpus of English Words
import nltk
from nltk.corpus import words
from nltk.corpus import wordnet

In [2]:
len(words.words())

236736

In [3]:
def FindWord(token, wordlist):
    """Return the longest prefix of ``token`` that occurs in ``wordlist``.

    Prefixes are tried from the full token down to its first character, so
    the first match is the longest one. Returns None when no prefix (not
    even the single leading character) is in ``wordlist``.
    """
    for end in range(len(token), 0, -1):
        prefix = token[:end]
        if prefix in wordlist:
            return prefix
    return None

In [4]:
FindWord("helloworld", words.words())

'hello'

In [5]:
"".join("helloworld".rsplit("hello"))

'world'

In [6]:
FindWord('world', words.words())

'world'

In [7]:
"".join("world".rsplit("world"))

''

In [8]:
# Two-step greedy demo: take the longest dictionary prefix of the hashtag,
# then look up the longest dictionary prefix of whatever is left.
in_word = "helloworld"
word_list = words.words()  # fetch the corpus once instead of once per lookup

out_word = FindWord(in_word, word_list)
print(out_word)

# Strip only the matched prefix. Splitting on the word and re-joining would
# delete every occurrence of it, including ones later in the hashtag.
temp_word = in_word[len(out_word):] if out_word else in_word
out_word = FindWord(temp_word, word_list)
print(out_word)



hello
world


In [9]:
# Greedy segmentation attempt: keep asking FindWord for the longest dictionary
# prefix of temp_word, record it, and strip it from temp_word. The loop ends
# when the word just found equals what is left of temp_word, or when FindWord
# returns None.
temp_word = "wearethepeople"
out_word = ""

arr_output = []
while out_word != temp_word:
    out_word = FindWord(temp_word, words.words())
    print out_word
    arr_output.append(out_word)
    # NOTE(review): rsplit() without maxsplit splits on every occurrence of
    # out_word, so the join removes it everywhere in temp_word, not only at
    # the front (e.g. stripping 'e' also deletes the 'e' inside 'the').
    temp_word = "".join(temp_word.rsplit(out_word))
    # NOTE(review): this check runs after None has already been appended to
    # arr_output and passed to rsplit() above.
    if out_word == None:
        break



wear
e
th
pop
l
None


In [10]:
arr_output[:-1]

['wear', 'e', 'th', 'pop', 'l']

In [11]:
[i for i in words.words()]

[u'A',
 u'a',
 u'aa',
 u'aal',
 u'aalii',
 u'aam',
 u'Aani',
 u'aardvark',
 u'aardwolf',
 u'Aaron',
 u'Aaronic',
 u'Aaronical',
 u'Aaronite',
 u'Aaronitic',
 u'Aaru',
 u'Ab',
 u'aba',
 u'Ababdeh',
 u'Ababua',
 u'abac',
 u'abaca',
 u'abacate',
 u'abacay',
 u'abacinate',
 u'abacination',
 u'abaciscus',
 u'abacist',
 u'aback',
 u'abactinal',
 u'abactinally',
 u'abaction',
 u'abactor',
 u'abaculus',
 u'abacus',
 u'Abadite',
 u'abaff',
 u'abaft',
 u'abaisance',
 u'abaiser',
 u'abaissed',
 u'abalienate',
 u'abalienation',
 u'abalone',
 u'Abama',
 u'abampere',
 u'abandon',
 u'abandonable',
 u'abandoned',
 u'abandonedly',
 u'abandonee',
 u'abandoner',
 u'abandonment',
 u'Abanic',
 u'Abantes',
 u'abaptiston',
 u'Abarambo',
 u'Abaris',
 u'abarthrosis',
 u'abarticular',
 u'abarticulation',
 u'abas',
 u'abase',
 u'abased',
 u'abasedly',
 u'abasedness',
 u'abasement',
 u'abaser',
 u'Abasgi',
 u'abash',
 u'abashed',
 u'abashedly',
 u'abashedness',
 u'abashless',
 u'abashlessly',
 u'abashment',
 u'ab

In [12]:
arr_combos = []
def FindWordModified(token, wordlist):
    """Return every prefix of ``token`` (two or more characters) in ``wordlist``.

    Prefixes are reported longest first. The module-level ``arr_combos``
    list is kept for backward compatibility and always mirrors the result
    of the most recent call.

    Args:
        token: the string whose prefixes are tested.
        wordlist: container of known words (anything supporting ``in``).

    Returns:
        A new list of matching prefixes, longest first; empty if none match.
    """
    # Reset the shared list so matches from earlier calls do not leak into
    # this call's result.
    del arr_combos[:]
    # Stop at length 2: single-character prefixes are deliberately ignored.
    for end in range(len(token), 1, -1):
        prefix = token[:end]
        if prefix in wordlist:
            arr_combos.append(prefix)
    # Return a copy so a later call's reset cannot alter this result.
    return list(arr_combos)

In [13]:
FindWordModified("hello", words.words())

['hello', 'hell', 'he']

In [14]:
input_word = "wearethepeople"

# Build the dictionary lookup once. Testing `in words.words()` inside the
# loop would fetch and linearly scan the whole word list for every prefix.
word_set = set(words.words())

# Print every prefix of at least two letters that is a dictionary word.
for i in range(0,len(input_word)+1):
    temp_word = input_word[:i]
    if len(temp_word) > 1 and temp_word in word_set:
        print(temp_word)

we
wear


In [15]:
# #Generates all valid combinations of words - according to the words.words() list.
# #Assumption - A word is atleast of length two or more.

input_word = "wearethepeople"

# Build the dictionary lookup once; the membership test below runs for every
# substring and would otherwise rescan the whole word list each time.
word_set = set(words.words())

# One printed list per substring length N. N stops at len(input_word) - 1, so
# the whole hashtag is never tested as a single word here.
for N in range(2, len(input_word)):
    print([input_word[i:i+N] for i in range(len(input_word)-N+1) if input_word[i:i+N] in word_set])

['we', 'ea', 'ar', 're', 'th', 'he']
['ear', 'are', 'ret', 'the', 'hep']
['wear']
['rethe']
['people']
[]
[]
[]
[]
[]
[]
[]


In [16]:
# Collect every substring of input_word (length 2 up to len - 1) that is a
# dictionary word, ordered by length and then by start position.
# The set is built once so each membership test is O(1) instead of a scan of
# the whole word list.
word_set = set(words.words())
arr_valid_words = [
    input_word[i:i+N]
    for N in range(2, len(input_word))
    for i in range(len(input_word)-N+1)
    if input_word[i:i+N] in word_set
]

arr_valid_words

['we',
 'ea',
 'ar',
 're',
 'th',
 'he',
 'ear',
 'are',
 'ret',
 'the',
 'hep',
 'wear',
 'rethe',
 'people']

In [17]:
input_word[:1]

'w'

In [18]:
[i for i in arr_valid_words if i[0] == input_word[:1]]

['we', 'wear']

In [19]:
input_word.replace("we", "")

'arethepeople'

In [20]:
# #TESTED LOGIC
def my_func(var_word, var_arr):
    """Print the candidate words for var_word, then recurse on what is left.

    Candidates are the entries of var_arr whose first character equals the
    first character of var_word. Each candidate is removed from var_word and
    the function recurses on the result. Nothing is returned; the function
    only prints.
    """
    # Candidates only share a first letter with var_word; they are not
    # required to be an actual prefix of it.
    print [i for i in var_arr if i[0] == var_word[:1]]
    for i in [i for i in var_arr if i[0] == var_word[:1]]:
        # NOTE(review): replace() deletes every occurrence of i in var_word,
        # not only the leading one.
        print var_word.replace(i, "")
        if var_word.replace(i, "") != var_word:
            # The recursion passes the module-level arr_valid_words rather
            # than the var_arr argument.
            my_func(var_word.replace(i, ""), arr_valid_words)
        else:
            # The candidate does not occur in var_word: stop trying the
            # remaining candidates as well.
            break
        
    print "\n\n"
     
my_func("wearethepeople", arr_valid_words)

['we', 'wear']
arethepeople
['ar', 'are']
ethepeople
['ea', 'ear']
ethepeople



thepeople
['th', 'the']
epeople
['ea', 'ear']
epeople



people
['people']

[]












ethepeople
['ea', 'ear']
ethepeople








In [21]:
def my_func(var_word, var_arr):
    """Print the candidate words for var_word and recurse on each remainder.

    Candidates are the entries of var_arr that start with the same letter as
    var_word. Every occurrence of a candidate is removed from var_word and
    the remainder is explored against the module-level arr_valid_words.
    Exploration of the current level stops at the first candidate that does
    not occur in var_word.
    """
    candidates = [word for word in var_arr if word[0] == var_word[:1]]
    print(candidates)
    for candidate in candidates:
        remainder = var_word.replace(candidate, "")
        if remainder == var_word:
            break
        my_func(remainder, arr_valid_words)

my_func("wearethepeople", arr_valid_words)

['we', 'wear']
['ar', 'are']
['ea', 'ear']
['th', 'the']
['ea', 'ear']
['people']
[]
['ea', 'ear']


In [22]:
input_word = "wearethepeople"

# Build the dictionary lookup once; testing `in words.words()` for every
# substring would rescan the whole word list each time.
word_set = set(words.words())

# Every substring of length 2 .. len-1 that is a dictionary word.
arr_valid_words = []
for N in range(2, len(input_word)):
    for i in range(len(input_word)-N+1):
        if input_word[i:i+N] in word_set:
            arr_valid_words.append(input_word[i:i+N])

arr_valid_sub_words = []
def my_func(var_word, var_arr):
    """Record the candidate words met while stripping words from var_word.

    Candidates are the entries of var_arr that start with the same letter as
    var_word. Each non-empty candidate list is appended to the module-level
    arr_valid_sub_words, then every candidate is removed from var_word and
    the remainder is explored recursively.
    """
    candidates = [word for word in var_arr if word[0] == var_word[:1]]
    if candidates:
        arr_valid_sub_words.append(candidates)
    for candidate in candidates:
        # NOTE(review): replace() removes every occurrence of the candidate,
        # not only the leading one.
        remainder = var_word.replace(candidate, "")
        if remainder == var_word:
            # Candidate does not occur in var_word: give up on this level.
            break
        my_func(remainder, var_arr)

my_func(input_word, arr_valid_words)

print(arr_valid_sub_words)

[['we', 'wear'], ['ar', 'are'], ['ea', 'ear'], ['th', 'the'], ['ea', 'ear'], ['people'], ['ea', 'ear']]


In [38]:
input_word = "wearethepeople"

def TagSegmenter(input_word, wordlist=None):
    """Print ``input_word`` split into dictionary words, separated by spaces.

    The hashtag is segmented with dynamic programming over its prefixes, so
    any number of words is supported and the whole hashtag may itself be a
    single word. When several segmentations exist, one with the fewest words
    is printed. Words shorter than two characters are never used.

    Args:
        input_word: the hashtag text (without '#') to segment.
        wordlist: optional iterable of known words; defaults to the NLTK
            ``words.words()`` corpus.

    Returns:
        None. The segmentation, or "No Output" when the hashtag cannot be
        fully covered by dictionary words, is printed to stdout.
    """
    if wordlist is None:
        wordlist = words.words()
    # A set makes each of the O(len^2) substring lookups constant time.
    vocabulary = set(wordlist)

    min_word_len = 2  # single letters are not accepted as words
    total = len(input_word)

    # best[k] is the shortest word list spelling input_word[:k], or None when
    # that prefix cannot be segmented.
    best = [None] * (total + 1)
    best[0] = []
    for end in range(min_word_len, total + 1):
        for start in range(0, end - min_word_len + 1):
            head = best[start]
            if head is None:
                continue
            piece = input_word[start:end]
            if piece not in vocabulary:
                continue
            if best[end] is None or len(head) + 1 < len(best[end]):
                best[end] = head + [piece]

    if best[total]:
        print(" ".join(best[total]))
    else:
        print("No Output")
        
TagSegmenter(input_word)

we are the people


In [199]:
import zlib
import binascii
# a = ["Hello World", "Kartik"]
# Read the raw corpus text; the context manager closes the file even if the
# read fails.
with open("corpus - Copy.txt", "r") as f:
    a = f.read()

In [200]:
# Compress the corpus text with zlib, then hex-encode the compressed bytes so
# the result is plain ASCII text.
compressed = zlib.compress(str(a))
# NOTE(review): hexlify emits two characters per compressed byte.
b = str(binascii.hexlify(compressed))

In [201]:
import base64
# Save the hex-encoded compressed corpus; the context manager guarantees the
# file is flushed and closed even if the write raises.
with open("compressed.txt", "w") as f_compress:
    f_compress.write(b)