### 1. Importowanie oraz pobieranie modułów

In [25]:
import nltk

In [26]:
# After the VADER lexicon has been downloaded this call should return True; if
# the download fails, try allowing access through your firewall.
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mgprivate/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [27]:
# Import the analyzer class after the lexicon download has been run.
# NOTE(review): the original note states the module can only be imported once
# the model has been downloaded; presumably the lexicon is actually required
# when the analyzer is instantiated rather than at import time — confirm.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

### 2. Wykorzystanie modelu VADER

In [28]:
# Create the sentiment analyzer instance.
sid = SentimentIntensityAnalyzer()
# Read the sentence to score interactively (the prompt text is Polish for
# "enter your sentence here").
sentence = input("wprowadź swoje zdanie tutaj")

# The overall polarity of the text is reported under the 'compound' key of the
# returned dict, next to the 'neg', 'neu' and 'pos' scores.
sid.polarity_scores(sentence)

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

### 3. Modelowanie tematyczne (topic modelling)

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd

# Load the labelled corpus: one sample per line, where the first
# whitespace-separated token is the label and the rest is the text.
# dataset https://gist.github.com/kunalj101/ad1d9c58d338e20d09ff26bcc06c4235
# The context manager closes the file handle (a bare open().read() leaves it
# open), and the explicit encoding keeps decoding independent of the platform
# default.
with open('corpus', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

labels, texts = [], []
for line in data.split("\n"):
    content = line.split()
    if not content:
        # Blank lines (e.g. the empty string left by a trailing newline) carry
        # no label; indexing content[0] on them would raise IndexError.
        continue
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

# Build the dataframe in one step from the parallel text / label lists.
trainDF = pd.DataFrame({'text': texts, 'label': labels})

In [30]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Split the dataset into training and validation subsets. A fixed random_state
# makes the split (and everything derived from it) reproducible across runs.
train_x, valid_x, train_y, valid_y = train_test_split(
    trainDF['text'], trainDF['label'], random_state=42
)

# Label-encode the target variable. The encoder is fitted once, on the full
# label column, and then only applied to each subset: calling fit_transform on
# the validation labels refits the encoder and could assign different integer
# codes if a class happened to be missing from that subset.
encoder = preprocessing.LabelEncoder()
encoder.fit(trainDF['label'])
train_y = encoder.transform(train_y)
valid_y = encoder.transform(valid_y)

In [31]:
# Build a word-level count vectorizer (a token is any run of one or more word
# characters) and learn its vocabulary from the whole text column. fit()
# returns the vectorizer itself, so construction and fitting are chained.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}').fit(trainDF['text'])

# Encode the training and validation texts with the fitted vectorizer.
xtrain_count, xvalid_count = (
    count_vect.transform(subset) for subset in (train_x, valid_x)
)
print(xtrain_count)

  (0, 710)	1
  (0, 1743)	1
  (0, 2752)	1
  (0, 3725)	1
  (0, 4138)	1
  (0, 4423)	4
  (0, 5175)	4
  (0, 5305)	2
  (0, 5306)	1
  (0, 8139)	1
  (0, 8143)	1
  (0, 8711)	1
  (0, 8755)	4
  (0, 9442)	1
  (0, 11064)	1
  (0, 11117)	1
  (0, 11390)	1
  (0, 13407)	4
  (0, 14104)	1
  (0, 15169)	2
  (0, 15569)	3
  (0, 16193)	1
  (0, 16953)	1
  (0, 19529)	1
  (0, 19630)	1
  :	:
  (7499, 17802)	2
  (7499, 18833)	1
  (7499, 18847)	1
  (7499, 19039)	1
  (7499, 19240)	2
  (7499, 19525)	1
  (7499, 19529)	1
  (7499, 19647)	1
  (7499, 19752)	2
  (7499, 21901)	1
  (7499, 23878)	1
  (7499, 24791)	1
  (7499, 25337)	1
  (7499, 26296)	1
  (7499, 28082)	7
  (7499, 28215)	1
  (7499, 28224)	1
  (7499, 28312)	1
  (7499, 28493)	3
  (7499, 30167)	1
  (7499, 30607)	1
  (7499, 30682)	1
  (7499, 31250)	1
  (7499, 31506)	1
  (7499, 31521)	1


In [35]:
from sklearn import decomposition
import numpy


# Train an LDA topic model on the term-count matrix. random_state pins the
# otherwise random initialisation so the extracted topics are reproducible.
lda_model = decomposition.LatentDirichletAllocation(
    n_components=20, learning_method='online', max_iter=20, random_state=42
)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new accessor when it exists and fall back to
# the old one on older installations. vocab stays a plain list either way.
if hasattr(count_vect, 'get_feature_names_out'):
    vocab = list(count_vect.get_feature_names_out())
else:
    vocab = list(count_vect.get_feature_names())

# Summarise every topic by its n_top_words highest-weighted terms.
n_top_words = 10
vocab_array = numpy.array(vocab)  # built once instead of once per topic
topic_summaries = []
for topic_dist in topic_word:
    # argsort is ascending, so the reversed tail holds the top-weighted terms,
    # strongest first.
    top_term_ids = numpy.argsort(topic_dist)[:-(n_top_words + 1):-1]
    topic_words = vocab_array[top_term_ids]
    topic_summaries.append(' '.join(topic_words))
print(topic_summaries)

['toy jack installed software scanner elements x pet haiku advanced', 'the of book a and is in to this read', 'player software mp3 usb button sale files file 45 function', 'la de y un en el overrated et que a', 'prehistoric snmp march rockin clarity stages courses trained gnostic boston', 'rice action skin movie david van gonna ii ball special', 'turns bradley napoleon lake twists theology loosely michigan titan interactions', 'desk salt factual academic grain campaign strangers locate rewarded communication', 'manson illustrations run ready soundtrack returning german wall lola saved', 'the i it and to a this is of for', 'dolly bluegrass packaged cooking argento lee patterns sloppy troma kitchen', 'voodoo havent integrity dcr joanna misled divinity nicoletta candid decameron', 'product works price replacement computer apple power support plug shipping', 'printer hp thin print paper genius diane lane drive error', 'battery labor anti produced whose henry outlet co beds eargels', 'helpf