In [3]:
%matplotlib inline
import os
from pymongo import MongoClient

If dataset/comments.txt doesn't exist, pull the comments from the database.

In [4]:
if not os.path.isfile('dataset/comments.txt'):
    client = MongoClient()
    scrape = client.scrape

    f = open("dataset/comments.txt", "w")

    # print the comments to a file, separated with \n, 
    # so we can use the croatian stemmer on them
    for comment in scrape.blic.find():
        text = comment['comment']
        text.replace('\n', ' ')
        text += '\n'
        f.write(text.encode('utf8'))

    f.close()

In [163]:
cyrillic = set(u"АаБбВвГгДдЂђЕеЖжЗзИиЈјКкЛлЉљМмНнЊњОоПпРрСсТтЋћУуФфХхЦцЧчЏџШш")
def remove_cyrillic_comments(comments, print_perc=True):
    cyrillic_count = 0.0; all_count = len(comments)
    clean_coms = []
    
    for comment in comments:
        if not bool(set(comment.decode('utf8')).intersection(cyrillic)):
            clean_coms.append(comment)
        else:
            cyrillic_count += 1
        
    if print_perc:
        print "Cyrillic comments make up %s percent" % (cyrillic_count / all_count * 100)
    return clean_coms

corpus = open('dataset/comments.txt', 'r').readlines()[:10**4]
corpus = remove_cyrillic_comments(corpus)

Cyrillic comments make up 1.43 percent


### Now that we have comments that don't include cyrillic, let's find all the tokens and create a vocabulary file _vocab.txt_

In [164]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

croatian_stop_words = set([u"a",u"ako",u"ali",u"bi",u"bih",u"bila",u"bili",u"bilo",u"bio",u"bismo",u"biste",u"biti",u"bumo",u"da",u"do",u"duž",u"ga",u"hoće",u"hoćemo",u"hoćete",u"hoćeš",u"hoću",u"i",u"iako",u"ih",u"ili",u"iz",u"ja",u"je",u"jedna",u"jedne",u"jedno",u"jer",u"jesam",u"jesi",u"jesmo",u"jest",u"jeste",u"jesu",u"jim",u"joj",u"još",u"ju",u"kada",u"kako",u"kao",u"koja",u"koje",u"koji",u"kojima",u"koju",u"kroz",u"li",u"me",u"mene",u"meni",u"mi",u"mimo",u"moj",u"moja",u"moje",u"mu",u"na",u"nad",u"nakon",u"nam",u"nama",u"nas",u"naš",u"naša",u"naše",u"našeg",u"ne",u"nego",u"neka",u"neki",u"nekog",u"neku",u"nema",u"netko",u"neće",u"nećemo",u"nećete",u"nećeš",u"neću",u"nešto",u"ni",u"nije",u"nikoga",u"nikoje",u"nikoju",u"nisam",u"nisi",u"nismo",u"niste",u"nisu",u"njega",u"njegov",u"njegova",u"njegovo",u"njemu",u"njezin",u"njezina",u"njezino",u"njih",u"njihov",u"njihova",u"njihovo",u"njim",u"njima",u"njoj",u"nju",u"no",u"o",u"od",u"odmah",u"on",u"ona",u"oni",u"ono",u"ova",u"pa",u"pak",u"po",u"pod",u"pored",u"prije",u"s",u"sa",u"sam",u"samo",u"se",u"sebe",u"sebi",u"si",u"smo",u"ste",u"su",u"sve",u"svi",u"svog",u"svoj",u"svoja",u"svoje",u"svom",u"ta",u"tada",u"taj",u"tako",u"te",u"tebe",u"tebi",u"ti",u"to",u"toj",u"tome",u"tu",u"tvoj",u"tvoja",u"tvoje",u"u",u"uz",u"vam",u"vama",u"vas",u"vaš",u"vaša",u"vaše",u"već",u"vi",u"vrlo",u"za",u"zar",u"će",u"ćemo",u"ćete",u"ćeš",u"ću",u"što"])

counter = CountVectorizer(
    min_df=5,
    strip_accents="unicode",
    lowercase=True,
    stop_words=croatian_stop_words
)

X = counter.fit_transform(corpus)
vocabulary = counter.vocabulary_.keys()

# The filename for saving the vocabulary
vocab_filename = "vocab.txt"

# print the vocab to a file vocab.txt
vocab_file = open(vocab_filename, 'w')
vocab_file.write("\n".join(vocabulary).encode('utf-8'))
vocab_file.close()

### Run the croatian stemmer on the vocab.txt file and read it back to a stemmed vocabulary.
### Create a dict mapping the original words to the stemmed words

In [165]:
import os

output_filename = "stemmed_vocab.txt"

command = "python cro_stemmer/Croatian_stemmer.py %s %s" % (vocab_filename, output_filename)
os.system(command)

stemmed_vocab = open(output_filename, 'r').readlines()
stemmed_vocab = [x.split()[1] for x in stemmed_vocab]

stem_map = {x:y for x,y in zip(vocabulary, stemmed_vocab)}

### Replace the tokens in the corpus with the stemmed tokens 

In [166]:
from scipy.sparse import csr_matrix, find

stemmed_corpus_array = [[] for x in corpus]  # create an empty array for every comment
coords = find(X)

locations  = zip(coords[0], coords[1], coords[2])

for l in locations:
    stemmed_corpus_array[l[0]].append([vocabulary[l[1]]]* l[2])

In [40]:
vectorizer = TfidfVectorizer(
    strip_accents="unicode",
    lowercase=True,
    ngram_range=(1,2),
    min_df=5,
    norm='l2',
    smooth_idf=True,
    use_idf=True,
    stop_words=croatian_stop_words)

X = vectorizer.fit_transform(corpus)

In [60]:
import numpy as np
popularity_ind = np.argsort(vectorizer.vocabulary_.values())

print vectorizer.vocabulary_.values()

[29136, 31249, 29297, 17022, 17440, 3320, 26058, 5826, 16639, 13477, 32151, 23014, 21274, 11476, 23010, 32603, 5006, 12577, 4204, 11188, 12122, 12105, 12102, 12127, 12230, 33507, 7328, 9177, 1704, 1703, 26522, 16402, 8261, 8268, 8272, 8671, 12516, 33538, 33537, 32290, 32291, 32292, 4230, 4229, 22421, 25228, 23033, 29758, 29755, 6540, 11912, 23705, 15690, 15691, 19358, 11500, 19999, 2643, 23306, 12712, 21253, 21256, 21255, 18756, 18755, 18757, 19709, 14481, 5400, 27811, 14948, 398, 24521, 17009, 6311, 8584, 8583, 14253, 12322, 12321, 10443, 22042, 33611, 12873, 25741, 1664, 1636, 19189, 18966, 18965, 25766, 25765, 18830, 10855, 8709, 6896, 14939, 14585, 8977, 8978, 26491, 100, 26650, 23549, 7276, 27097, 11929, 11933, 30296, 30297, 30298, 30299, 13438, 23851, 8202, 8203, 31308, 26373, 26371, 26374, 11024, 2624, 14596, 19403, 14262, 31178, 19472, 1628, 2657, 6169, 29388, 5476, 19758, 5473, 5474, 15184, 17836, 3254, 2065, 16826, 4346, 31095, 13521, 1196, 19663, 23286, 22188, 22189, 10268, 