In [2]:
# Task 1
import re
import numpy as np

# Load every document of the corpus (19,997 of them) from the text file.
corpus = sc.textFile(
    "s3://chrisjermainebucket/comp330_A6/20_news_same_line.txt")

# Keep only the lines the parser below can handle: it needs the `id="`,
# `" url=` and `">` markers.  A looser test such as `'id' in x` would also
# admit any line that merely contains the letters "id", on which `str.index`
# raises ValueError inside the map step.
validLines = corpus.filter(
    lambda x: 'id="' in x and '" url=' in x and '">' in x)

# Transform each line into a (docID, text) pair: the docID sits between
# `id="` and `" url=`, the text is everything after the first `">`.
keyAndText = validLines.map(
    lambda x: (x[x.index('id="') + 4:x.index('" url=')], x[x.index('">') + 2:]))

# Split the text of each (docID, text) pair into a list of lower-case words,
# giving (docID, ["word1", "word2", "word3", ...]).  Every non-letter
# character is replaced by a space first, so digits and punctuation act as
# word separators and never end up inside a word.
regex = re.compile('[^a-zA-Z]')
keyAndListOfWords = keyAndText.map(
    lambda x: (str(x[0]), regex.sub(' ', x[1]).lower().split()))

# To find the top 20,000 words, first change (docID, ["word1", "word2", ...])
# into one ("word", 1) pair per word occurrence ...
allWords = keyAndListOfWords.flatMap(lambda x: ((j, 1) for j in x[1]))
# ... then sum the ones per word, giving ("word1", 1433), ("word2", 3423423), etc.
allCounts = allWords.reduceByKey(lambda a, b: a + b)

# Bring the 20,000 most frequent words to the driver as a local list of
# ("word", count) pairs, ordered from most to least frequent.
topWords = allCounts.top(20000, lambda x: x[1])

# Build an RDD of (word, dictNum) pairs.
# Start with an RDD holding the dictionary positions 0, 1, 2, ...  The range
# is sized from topWords (20,000 entries for this corpus) rather than a literal
# 20000, so that `topWords[x]` below stays in range even if the corpus yields
# fewer than 20,000 distinct words.
twentyK = sc.parallelize(range(len(topWords)))

# Transform (0), (1), (2), ... into ("mostcommonword", 0), ("nextmostcommon", 1), ...
# The number is the (0-based) slot of the word in the dictionary.
# A bunch of (word, posInDictionary) pairs
# dictionary.top(3) => [('zz', 6505), ('zyxel', 13838), ('zyeh', 18666)]
dictionary = twentyK.map(lambda x: (topWords[x][0], x))
# A bunch of (word, docID) pairs, one per word occurrence in each document
# wordDictPair.top(3) => [('zzzzzzt', '20_newsgroups/rec.sport.baseball/104569'), ('zzzzzz', '20_newsgroups/rec.sport.hockey/53841'), ('zzzzzz', '20_newsgroups/rec.sport.baseball/105004')]
wordDictPair = keyAndListOfWords.flatMap(lambda x: ((j, x[0]) for j in x[1]))

# Join the two RDDs on the word, giving (word, (docID, posInDictionary)) pairs.
# Occurrences of words that are not in the dictionary have no join partner
# and are dropped here.
# wordPair.top(3) => [('zz', ('20_newsgroups/talk.politics.guns/54380', 6505)), ('zz', ('20_newsgroups/talk.politics.guns/54380', 6505)), ('zz', ('20_newsgroups/talk.politics.guns/54380', 6505))]
wordPair = wordDictPair.join(dictionary)

# Drop the word and regroup by document to get (docID, listOfAllDictionaryPos)
# pairs; a position appears in the list once per occurrence of its word.
# docIDIdxPair.top(3) => [('20_newsgroups/talk.religion.misc/84570', 19590), ('20_newsgroups/talk.religion.misc/84570', 16881), ('20_newsgroups/talk.religion.misc/84570', 16475)]
# docIDAllIdxPair.top(2) => [('20_newsgroups/talk.religion.misc/84570', <pyspark.resultiterable.ResultIterable object at 0x7fe93fa07190>), ('20_newsgroups/talk.religion.misc/84569', <pyspark.resultiterable.ResultIterable object at 0x7fe93fa07d50>)]
docIDIdxPair = wordPair.map(lambda x: (x[1][0], x[1][1]))
docIDAllIdxPair = docIDIdxPair.groupByKey()
# Materialise each group as a plain list so it can be displayed and indexed.
docIDAllIdxList = docIDAllIdxPair.map(lambda x: (x[0], list(x[1])))
docIDAllIdxList.top(1)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('20_newsgroups/talk.religion.misc/84570', [1, 12, 12, 12, 12, 12, 12, 12, 12, 14, 201, 317, 1305, 12209, 4409, 15, 33, 78, 4002, 38, 38, 46, 46, 53, 123, 357, 812, 812, 2498, 4195, 9336, 92, 143, 143, 950, 930, 7271, 4722, 13, 26, 1817, 1958, 6, 6, 23, 34, 17, 45, 45, 2293, 4402, 6020, 9265, 164, 176, 1660, 2, 200, 36, 360, 210, 2153, 1654, 31, 367, 1262, 19904, 9552, 16847, 0, 0, 18, 18, 28, 28, 28, 28, 1237, 6102, 16467, 2403, 20, 20, 20, 25, 25, 49, 154, 415, 1573, 5794, 2269, 3, 4, 4, 4, 4, 32, 59, 59, 59, 59, 59, 184, 4989, 2003, 2218, 5, 5, 57, 87, 98, 117, 117, 148, 473, 6273, 6274, 24, 37, 47, 54, 61, 133, 1096, 1458, 1574, 4786, 1950])]

In [3]:
# Finally, write a map() that takes that RDD and converts each listOfAllDictionaryPos value into a NumPy array of word counts.
def transformNP(idxList, size=20000):
    """Turn a list of dictionary positions into a vector of occurrence counts.

    Args:
        idxList: iterable of integer dictionary positions; a position that
            appears k times contributes k to its slot.
        size: length of the returned vector, i.e. the number of dictionary
            slots.  Defaults to 20000, the dictionary size used in this
            notebook.

    Returns:
        A float NumPy array of length ``size`` whose i-th entry is the number
        of times ``i`` occurs in ``idxList``.

    Raises:
        IndexError: if a position lies outside the vector.
    """
    counts = np.zeros(size)
    positions = np.fromiter(idxList, dtype=np.intp)
    # np.add.at accumulates repeated positions; a plain
    # `counts[positions] += 1` would count each distinct position only once.
    np.add.at(counts, positions, 1)
    return counts


# Map every (docID, listOfAllDictionaryPos) pair to (docID, count vector).
# The RDD is cached because the later cells run many actions on it (lookups,
# a count, the document-frequency reduction and one distance pass per query);
# without cache() each of those actions recomputes it from its lineage.
result = docIDAllIdxList.map(lambda x: (x[0], transformNP(x[1]))).cache()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Word counts of 20_newsgroups/comp.graphics/37261.  lookup() returns a list
# of matching values, so the array built from it has a leading axis.
arr37261 = np.array(result.lookup("20_newsgroups/comp.graphics/37261"))
# Display only the non-zero counts.
arr37261[np.nonzero(arr37261)]

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

array([ 8.,  2.,  6.,  3., 12.,  4.,  3.,  6.,  2.,  1.,  1.,  5.,  2.,
        2.,  2.,  3.,  1.,  1.,  1.,  1.,  3.,  1.,  1.,  2.,  3.,  4.,
        1.,  1.,  1.,  1.,  1.,  3.,  1.,  1.,  1.,  2.,  1.,  1.,  1.,
        2.,  1.,  1.,  2.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  2.,
        1.,  1.,  2.,  2.,  1.,  2.,  1.,  1.,  1.,  3.,  4.,  1.,  1.,
        1.,  1.,  2.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  2.,  1.,
        1.,  1.,  1.,  5.,  2.,  2.,  1.,  1.,  5.,  1.,  4.,  1.,  1.,
        1.,  2.,  1.,  2.,  1., 11.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,
        5.,  1.,  2.,  1.,  1.,  1.,  1.,  2.,  1.,  2.,  2.,  4.,  1.,
        1.,  1.,  5.,  1.,  1.,  1.,  1.,  2.,  4.,  1.,  1.,  1.,  3.,
        1.,  1.,  1.,  1.,  3.,  2.,  2.,  1.,  1.,  6.,  1.,  6.,  1.,
        1.,  3.,  1.,  1.,  2.,  1.,  1.,  1.,  1.,  2.,  7.,  1.,  1.,
        1.,  1.,  1.])

In [5]:
# Word counts of 20_newsgroups/talk.politics.mideast/75944.  lookup() returns
# a list of matching values, so the array built from it has a leading axis.
arr75944 = np.array(result.lookup("20_newsgroups/talk.politics.mideast/75944"))
# Display only the non-zero counts.
arr75944[np.nonzero(arr75944)]

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

array([135.,  37.,  71.,  28.,  49.,  19.,  46.,  16.,  13.,  22.,   9.,
        22.,  11.,   7.,   7.,   6.,   4.,   6.,  12.,  11.,  10.,   3.,
        10.,   4.,   2.,  21.,   5.,   4.,   2.,   2.,   1.,   1.,   1.,
         5.,   1.,  23.,   5.,   2.,   1.,   6.,   8.,   4.,   7.,   3.,
         3.,   2.,   1.,   1.,   6.,   4.,   4.,   7.,   1.,   8.,   7.,
        13.,   4.,   4.,  10.,   3.,   3.,   2.,   2.,   3.,   7.,   4.,
         1.,   2.,   4.,   8.,   4.,   7.,   2.,   1.,   1.,   2.,   1.,
         2.,   2.,   1.,   5.,   3.,   3.,   3.,   1.,   1.,   1.,   2.,
         1.,   4.,   3.,   1.,   3.,   3.,   4.,   7.,   1.,   2.,   1.,
         3.,   2.,   1.,   4.,   6.,   3.,  11.,   1.,   6.,   3.,   1.,
         3.,   1.,   2.,   1.,   1.,   1.,   3.,   3.,   2.,   5.,   2.,
         2.,   2.,   2.,   1.,   1.,   1.,   1.,   1.,   3.,   1.,   1.,
         1.,   1.,   3.,   3.,   4.,   1.,   1.,   5.,   1.,   1.,   2.,
         6.,   2.,   2.,   1.,   1.,   1.,   1.,   

In [6]:
# Word counts of 20_newsgroups/sci.med/58763.  lookup() returns a list of
# matching values, so the array built from it has a leading axis.
arr58763 = np.array(result.lookup("20_newsgroups/sci.med/58763"))
# Display only the non-zero counts.
arr58763[np.nonzero(arr58763)]

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

array([4., 4., 3., 2., 1., 1., 4., 3., 1., 2., 1., 5., 1., 2., 1., 1., 1.,
       2., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 2., 1., 1., 1., 1., 2., 1., 1., 1., 2., 1., 5., 1.,
       1., 1., 1., 1., 1., 1., 2., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 3., 1., 1.])

In [7]:
# Task 2
# i => word_i, d => documents_d
import math


def findWord(wordList, size=20000):
    """Mark which dictionary words occur at least once in a document.

    Args:
        wordList: sequence of per-word counts, indexed by dictionary position.
        size: length of the returned vector.  Defaults to 20000, the
            dictionary size used in this notebook.

    Returns:
        A float NumPy array of length ``size`` holding 1.0 at every position
        whose count is greater than zero and 0.0 everywhere else.

    Raises:
        IndexError: if a positive count sits at a position >= ``size``.
    """
    counts = np.asarray(wordList)
    presence = np.zeros(size)
    # Vectorised replacement for a per-element Python loop over the counts.
    presence[np.flatnonzero(counts > 0)] = 1
    return presence

def IDF(result):
    """Count, for every dictionary word, how many documents contain it.

    Despite its name this returns raw document frequencies; the caller turns
    them into inverse document frequencies.

    Args:
        result: RDD of (docID, count vector) pairs.

    Returns:
        An RDD in which all documents have been reduced under the single key
        "key"; its value is the element-wise sum of the documents' presence
        vectors (see findWord).
    """
    # Every document is emitted under the same key so that reduceByKey adds
    # all presence vectors together.
    presence = result.map(lambda x: ("key", findWord(x[1])))
    return presence.reduceByKey(lambda a, b: a + b)


# Inverse document frequency of every dictionary word:
# log(number of documents / number of documents containing the word).
size = result.count()
IDFList = IDF(result)
IDFArr = np.log(size / np.array(IDFList.lookup("key")))

# TF-IDF of 20_newsgroups/comp.graphics/37261: term frequency (count divided
# by the document's total count) times IDF; print only the non-zero entries.
arrCnt = np.array(result.lookup("20_newsgroups/comp.graphics/37261"))
TFArr = arrCnt / arrCnt.sum()
finalResult = TFArr * IDFArr
print(finalResult[np.nonzero(finalResult)])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[1.92555059e-03 8.50478606e-04 3.65947527e-03 1.27515309e-03
 7.26881462e-03 2.19975741e-03 2.66216132e-03 6.35871363e-03
 3.43014584e-03 1.65693809e-03 9.44999233e-03 3.95217425e-03
 4.19787791e-03 5.18781198e-03 6.54906258e-03 9.61804463e-06
 9.42766276e-03 2.87951994e-03 5.38035007e-04 6.14710796e-03
 9.49332595e-03 1.50258572e-02 3.37418661e-03 4.94317579e-03
 5.98488082e-03 4.54842076e-03 4.99141867e-03 1.65903978e-02
 4.91237317e-03 7.28885166e-03 6.36745621e-03 1.33917572e-02
 6.28356577e-03 7.13964429e-03 7.13821425e-03 1.59751652e-02
 6.41882998e-03 7.64721754e-03 1.62776158e-02 8.57654687e-03
 7.64721754e-03 8.98232287e-03 8.13687431e-03 7.89645803e-03
 8.90098531e-03 8.51964347e-03 8.08508990e-03 1.72643992e-02
 8.86948389e-03 9.22023607e-03 2.01086375e-02 2.13909906e-02
 1.05521284e-02 1.99249503e-02 1.03349188e-02 9.82126974e-03
 1.04846708e-02 3.22766724e-02 4.18291549e-02 1.12750179e-02
 1.09072460e-02 1.15831427e-02 1.14176787e-02 2.29934203e-02
 1.12501771e-02 1.208446

In [8]:
# TF-IDF of 20_newsgroups/talk.politics.mideast/75944: term frequency (count
# divided by the document's total count) times IDF; print the non-zero entries.
arrCnt = np.array(result.lookup("20_newsgroups/talk.politics.mideast/75944"))
TFArr = arrCnt / arrCnt.sum()
finalResult = TFArr * IDFArr
print(finalResult[np.nonzero(finalResult)])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[5.54725111e-03 2.68605086e-03 7.39273306e-03 2.03178717e-03
 5.06707738e-03 1.94545485e-03 4.31868710e-03 2.42388609e-03
 2.46472840e-03 4.14154280e-03 1.62831955e-03 6.62550483e-03
 3.22073049e-03 2.43657927e-03 1.69721414e-03 1.15698386e-03
 1.93593964e-03 4.04824180e-03 3.94159141e-03 2.94504525e-03
 9.88048494e-04 4.42826235e-03 1.49072312e-03 5.28678488e-04
 1.04214371e-02 1.86094552e-03 1.49772974e-03 1.04087154e-03
 1.64197257e-06 2.22622265e-03 4.73573904e-05 1.23392775e-02
 1.00104738e-03 9.83171309e-04 9.65207547e-04 3.03599781e-03
 3.98260227e-03 2.13198368e-03 3.57653570e-03 1.57413222e-03
 2.17955958e-03 1.07444211e-03 5.40226952e-04 5.59517954e-04
 3.84777461e-03 1.44634481e-03 2.21205679e-03 6.24049705e-03
 5.63104209e-04 4.86079553e-03 4.90455048e-03 7.84729739e-03
 1.77626038e-03 2.30413648e-03 7.10498595e-03 1.96850758e-03
 2.16703483e-03 1.31035475e-03 1.36934217e-03 1.94164808e-03
 5.81927334e-03 2.86998439e-03 1.36267792e-03 1.45303972e-03
 3.47796655e-03 6.751109

In [9]:
# TF-IDF of 20_newsgroups/sci.med/58763: term frequency (count divided by the
# document's total count) times IDF; print only the non-zero entries.
arrCnt = np.array(result.lookup("20_newsgroups/sci.med/58763"))
TFArr = arrCnt / arrCnt.sum()
finalResult = TFArr * IDFArr
print(finalResult[np.nonzero(finalResult)])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[2.34482371e-03 4.14265386e-03 4.42577019e-03 2.92148526e-03
 1.33936842e-03 2.16121698e-03 1.08191180e-02 8.05687963e-03
 2.58109075e-03 8.59275516e-03 4.17703243e-03 4.12642026e-03
 9.62545664e-03 4.20143956e-03 6.31741620e-03 5.31671210e-03
 1.48492077e-02 2.34245926e-05 6.75606643e-04 2.85621422e-03
 1.39301109e-02 7.98215533e-03 5.15843543e-03 7.88937191e-03
 8.61158132e-03 6.33508993e-03 9.41006605e-03 1.04688183e-02
 1.10776054e-02 1.06254158e-02 1.15837546e-02 1.47099141e-02
 1.39793137e-02 1.47149138e-02 1.53791960e-02 1.36221220e-02
 1.58114516e-02 3.46935867e-02 3.42222836e-02 1.70775536e-02
 1.82594869e-02 1.83374808e-02 2.12790916e-02 1.99693017e-02
 2.16014850e-02 2.17616535e-02 2.77570110e-02 2.30871061e-02
 2.31437988e-02 2.52717384e-02 2.50165686e-02 5.76677542e-02
 2.58076567e-02 2.72681028e-02 3.02984539e-02 2.82909350e-02
 6.64934520e-02 3.03504273e-02 3.12699320e-02 3.30500207e-02
 6.64437477e-02 4.11898951e-02 2.22132596e-01 3.43093815e-02
 3.48071684e-02 3.878699

In [27]:
# Task 3
import re
import numpy as np


# Local word -> dictionary position mapping, collected to the driver so that
# query text can be vectorised without a Spark job.
dictTuple = dictionary.collectAsMap()

def TFIDF(arrCnt):
    """Convert a word-count vector into a TF-IDF vector.

    Args:
        arrCnt: NumPy array of per-word counts, indexed by dictionary position.

    Returns:
        The term frequencies (counts divided by their sum) multiplied
        element-wise by the global ``IDFArr``.  The result has the shape
        produced by broadcasting ``arrCnt`` against ``IDFArr``.
    """
    total = arrCnt.sum()
    if total == 0:
        # No dictionary word occurs at all: use an all-zero term frequency
        # instead of dividing 0 by 0, which would turn every entry into NaN.
        TFArr = np.zeros_like(arrCnt, dtype=float)
    else:
        TFArr = arrCnt / total
    return np.multiply(TFArr, IDFArr)

def predictLabel(cnt, inputStr):
    """Predict the newsgroup of a text with a k-nearest-neighbour vote.

    The text is turned into a TF-IDF vector over the notebook's dictionary,
    compared with the TF-IDF vector of every corpus document by Euclidean
    distance, and the label occurring most often among the closest documents
    is returned.

    Args:
        cnt: number of nearest documents (k) that take part in the vote.
            If it exceeds the number of corpus documents, all of them vote.
        inputStr: the query text.

    Returns:
        The winning label as "20_newsgroups/<newsgroup>".

    Raises:
        ValueError: if ``cnt`` is smaller than 1 or there is no corpus
            document to vote.
    """
    if cnt < 1:
        raise ValueError("cnt must be at least 1")

    # Tokenise the query exactly like the corpus: non-letters become spaces,
    # everything is lower-cased, then split on whitespace.
    regex = re.compile('[^a-zA-Z]')
    words = regex.sub(' ', inputStr).lower().split()

    # Count the query's words at their dictionary positions; words that are
    # not in the dictionary are ignored.
    wordArr = np.zeros(20000)
    for word in words:
        position = dictTuple.get(word)
        if position is not None:
            wordArr[position] += 1
    inputTFIDF = TFIDF(wordArr)

    # np.linalg.norm() squares each element of the difference, sums them up
    # and takes the square root, i.e. the Euclidean distance.
    knnDisRdd = result.map(lambda x: (x[0], np.linalg.norm(inputTFIDF - TFIDF(x[1]))))

    # Fetch only the cnt closest documents instead of collecting and sorting
    # every distance on the driver.
    nearest = knnDisRdd.takeOrdered(cnt, key=lambda x: x[1])

    # Tally the newsgroup (second path component of the docID) of each neighbour.
    categoryDict = dict()
    for docID, _distance in nearest:
        category = docID.split('/')[1]
        categoryDict[category] = categoryDict.get(category, 0) + 1
    if not categoryDict:
        raise ValueError("no corpus documents to compare against")

    # On a tie, max() keeps the label that was encountered first, i.e. the one
    # whose nearest document is closest to the query.
    mostCategory = max(categoryDict, key=categoryDict.get)

    return "20_newsgroups/" + mostCategory

# Query: an encyclopedia-style passage about computer graphics, classified with k = 10.
predictLabel(10, 'Graphics are pictures and movies created using computers – usually referring to image data created by a computer specifically with help from specialized graphical hardware and software. It is a vast and recent area in computer science. The phrase was coined by computer graphics researchers Verne Hudson and William Fetter of Boeing in 1960. It is often abbreviated as CG, though sometimes erroneously referred to as CGI. Important topics in computer graphics include user interface design, sprite graphics, vector graphics, 3D modeling, shaders, GPU design, implicit surface visualization with ray tracing, and computer vision, among others. The overall methodology depends heavily on the underlying sciences of geometry, optics, and physics. Computer graphics is responsible for displaying art and image data effectively and meaningfully to the user, and processing image data received from the physical world. The interaction and understanding of computers and interpretation of data has been made easier because of computer graphics. Computer graphic development has had a significant impact on many types of media and has revolutionized animation, movies, advertising, video games, and graphic design generally.')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'20_newsgroups/comp.graphics'

In [28]:
# Query: an encyclopedia-style passage about deities and religions, classified with k = 10.
predictLabel(10, 'A deity is a concept conceived in diverse ways in various cultures, typically as a natural or supernatural being considered divine or sacred. Monotheistic religions accept only one Deity (predominantly referred to as God), polytheistic religions accept and worship multiple deities, henotheistic religions accept one supreme deity without denying other deities considering them as equivalent aspects of the same divine principle, while several non-theistic religions deny any supreme eternal creator deity but accept a pantheon of deities which live, die and are reborn just like any other being. A male deity is a god, while a female deity is a goddess. The Oxford reference defines deity as a god or goddess (in a polytheistic religion), or anything revered as divine. C. Scott Littleton defines a deity as a being with powers greater than those of ordinary humans, but who interacts with humans, positively or negatively, in ways that carry humans to new levels of consciousness beyond the grounded preoccupations of ordinary life.')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'20_newsgroups/talk.religion.misc'

In [29]:
# Query: an encyclopedia-style passage about Egypt, classified with k = 10.
# NOTE(review): the output recorded in this notebook is alt.atheism, which
# looks unrelated to the passage — confirm whether that label is expected.
predictLabel(10, 'Egypt, officially the Arab Republic of Egypt, is a transcontinental country spanning the northeast corner of Africa and southwest corner of Asia by a land bridge formed by the Sinai Peninsula. Egypt is a Mediterranean country bordered by the Gaza Strip and Israel to the northeast, the Gulf of Aqaba to the east, the Red Sea to the east and south, Sudan to the south, and Libya to the west. Across the Gulf of Aqaba lies Jordan, and across from the Sinai Peninsula lies Saudi Arabia, although Jordan and Saudi Arabia do not share a land border with Egypt. It is the worlds only contiguous Eurafrasian nation. Egypt has among the longest histories of any modern country, emerging as one of the worlds first nation states in the tenth millennium BC. Considered a cradle of civilisation, Ancient Egypt experienced some of the earliest developments of writing, agriculture, urbanisation, organised religion and central government. Iconic monuments such as the Giza Necropolis and its Great Sphinx, as well the ruins of Memphis, Thebes, Karnak, and the Valley of the Kings, reflect this legacy and remain a significant focus of archaeological study and popular interest worldwide. Egypts rich cultural heritage is an integral part of its national identity, which has endured, and at times assimilated, various foreign influences, including Greek, Persian, Roman, Arab, Ottoman, and European. One of the earliest centers of Christianity, Egypt was Islamised in the seventh century and remains a predominantly Muslim country, albeit with a significant Christian minority.')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'20_newsgroups/alt.atheism'

In [30]:
# Query: an encyclopedia-style passage about atheism, classified with k = 10.
predictLabel(10, 'The term atheism originated from the Greek atheos, meaning without god(s), used as a pejorative term applied to those thought to reject the gods worshiped by the larger society. With the spread of freethought, skeptical inquiry, and subsequent increase in criticism of religion, application of the term narrowed in scope. The first individuals to identify themselves using the word atheist lived in the 18th century during the Age of Enlightenment. The French Revolution, noted for its unprecedented atheism, witnessed the first major political movement in history to advocate for the supremacy of human reason. Arguments for atheism range from the philosophical to social and historical approaches. Rationales for not believing in deities include arguments that there is a lack of empirical evidence; the problem of evil; the argument from inconsistent revelations; the rejection of concepts that cannot be falsified; and the argument from nonbelief. Although some atheists have adopted secular philosophies (eg. humanism and skepticism), there is no one ideology or set of behaviors to which all atheists adhere.')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'20_newsgroups/alt.atheism'

In [31]:
# Query: an encyclopedia-style passage about NASA and space exploration, classified with k = 10.
predictLabel(10, 'President Dwight D. Eisenhower established NASA in 1958 with a distinctly civilian (rather than military) orientation encouraging peaceful applications in space science. The National Aeronautics and Space Act was passed on July 29, 1958, disestablishing NASAs predecessor, the National Advisory Committee for Aeronautics (NACA). The new agency became operational on October 1, 1958. Since that time, most US space exploration efforts have been led by NASA, including the Apollo moon-landing missions, the Skylab space station, and later the Space Shuttle. Currently, NASA is supporting the International Space Station and is overseeing the development of the Orion Multi-Purpose Crew Vehicle, the Space Launch System and Commercial Crew vehicles. The agency is also responsible for the Launch Services Program (LSP) which provides oversight of launch operations and countdown management for unmanned NASA launches.')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'20_newsgroups/sci.space'

In [32]:
# Query: an encyclopedia-style passage about the transistor, classified with k = 10.
# NOTE(review): the output recorded in this notebook is talk.politics.misc,
# which looks unrelated to the passage — confirm whether that label is expected.
predictLabel(10, 'The transistor is the fundamental building block of modern electronic devices, and is ubiquitous in modern electronic systems. First conceived by Julius Lilienfeld in 1926 and practically implemented in 1947 by American physicists John Bardeen, Walter Brattain, and William Shockley, the transistor revolutionized the field of electronics, and paved the way for smaller and cheaper radios, calculators, and computers, among other things. The transistor is on the list of IEEE milestones in electronics, and Bardeen, Brattain, and Shockley shared the 1956 Nobel Prize in Physics for their achievement.')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'20_newsgroups/talk.politics.misc'

In [33]:
# Query: an encyclopedia-style passage about the Colt Single Action Army revolver, classified with k = 10.
predictLabel(10, 'The Colt Single Action Army which is also known as the Single Action Army, SAA, Model P, Peacemaker, M1873, and Colt .45 is a single-action revolver with a revolving cylinder holding six metallic cartridges. It was designed for the U.S. government service revolver trials of 1872 by Colts Patent Firearms Manufacturing Company – todays Colts Manufacturing Company – and was adopted as the standard military service revolver until 1892. The Colt SAA has been offered in over 30 different calibers and various barrel lengths. Its overall appearance has remained consistent since 1873. Colt has discontinued its production twice, but brought it back due to popular demand. The revolver was popular with ranchers, lawmen, and outlaws alike, but as of the early 21st century, models are mostly bought by collectors and re-enactors. Its design has influenced the production of numerous other models from other companies.')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'20_newsgroups/talk.politics.guns'

In [34]:
# Query: an encyclopedia-style passage about an NHL hockey career, classified with k = 10.
# NOTE(review): the output recorded in this notebook is talk.politics.mideast,
# which looks unrelated to the passage — confirm whether that label is expected.
predictLabel(10, 'Howe was recruited by the Red Wings and made his NHL debut in 1946. He led the league in scoring each year from 1950 to 1954, then again in 1957 and 1963. He ranked among the top ten in league scoring for 21 consecutive years and set a league record for points in a season (95) in 1953. He won the Stanley Cup with the Red Wings four times, won six Hart Trophies as the leagues most valuable player, and won six Art Ross Trophies as the leading scorer. Howe retired in 1971 and was inducted into the Hockey Hall of Fame the next year. However, he came back two years later to join his sons Mark and Marty on the Houston Aeros of the WHA. Although in his mid-40s, he scored over 100 points twice in six years. He made a brief return to the NHL in 1979–80, playing one season with the Hartford Whalers, then retired at the age of 52. His involvement with the WHA was central to their brief pre-NHL merger success and forced the NHL to expand their recruitment to European talent and to expand to new markets.')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'20_newsgroups/talk.politics.mideast'