In [25]:
import sklearn
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups corpus with a fixed shuffle seed so the document
# order is reproducible; headers, footers and quoted replies are removed.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)

dataset

In [25]:
# List the top-level fields available on the loaded dataset object.
dataset.keys()

dict_keys(['description', 'target', 'target_names', 'data', 'filenames', 'DESCR'])

In [28]:
# Number of documents in the loaded corpus.
len(dataset.data)

11314

In [29]:
# Keep only the first ten documents so the following examples stay small.
data_samples = dataset.data[0:10]
data_samples

["Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n",
 "\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit a default CountVectorizer on the ten sample documents and build their
# document-term count matrix (returned as a scipy sparse matrix).
dtm_vectorizer = CountVectorizer()
dtm = dtm_vectorizer.fit_transform(data_samples)
dtm

<10x845 sparse matrix of type '<class 'numpy.int64'>'
	with 1133 stored elements in Compressed Sparse Row format>

In [31]:
# print(dtm)
# dir(dtm)
# Second dimension of the document-term matrix, i.e. the number of distinct
# terms (columns) produced by the vectorizer.
dtm.shape[1]

845

In [32]:
# Total number of cells (rows * columns) in the document-term matrix.
# Computed from the sparse matrix's shape so that no dense copy has to be
# allocated just to count entries; the value equals dtm.toarray().size.
dtm.shape[0] * dtm.shape[1]

8450

In [33]:
# Vocabulary terms in column order of `dtm`.
# NOTE(review): get_feature_names() was deprecated and later removed from
# scikit-learn; newer releases provide get_feature_names_out() instead —
# confirm against the installed version before re-running.
feature_names = dtm_vectorizer.get_feature_names()
len(feature_names)

845

In [34]:
# NOTE: the value returned by inverse_transform is discarded because it is
# not the last expression in the cell; only the trailing `dtm` is displayed.
dtm_vectorizer.inverse_transform(dtm)
dtm

<10x845 sparse matrix of type '<class 'numpy.int64'>'
	with 1133 stored elements in Compressed Sparse Row format>

In [35]:
# Three-sentence toy corpus used to illustrate bag-of-words counting.
corpus = [
    'This is a text mining book.',
    'Is this a text mining book?',
    'Text mining with python.',
]

# Learn the vocabulary and produce the sparse document-term count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
X

<3x7 sparse matrix of type '<class 'numpy.int64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [36]:
# Vocabulary learned from the toy corpus, in column order of `X`.
# NOTE(review): get_feature_names() was removed in later scikit-learn
# releases in favour of get_feature_names_out() — confirm installed version.
vectorizer.get_feature_names()

['book', 'is', 'mining', 'python', 'text', 'this', 'with']

In [37]:
# Dense view of the count matrix: one row per document, one column per term.
X.toarray()

array([[1, 1, 1, 0, 1, 1, 0],
       [1, 1, 1, 0, 1, 1, 0],
       [0, 0, 1, 1, 1, 0, 1]], dtype=int64)

In [38]:
# ngram_range=(1, 2) makes the vectorizer count both single words and
# two-word sequences.
bigram_vectorizer = CountVectorizer(ngram_range=(1,2))
# Printing the estimator shows its full parameter set.
print(bigram_vectorizer)
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
X_2

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


array([[1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0],
       [1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1]], dtype=int64)

In [39]:
# NOTE: despite the "bigram" name, ngram_range=(1, 3) extracts unigrams,
# bigrams and trigrams.
bigram_vectorizer2 = CountVectorizer(ngram_range=(1,3))
x2= bigram_vectorizer2.fit_transform(corpus).toarray()
x2

array([[1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1]], dtype=int64)

In [50]:
# Introspection helpers on the fitted vectorizer. Only the last expression of
# a cell is displayed, so the first three calls are evaluated but their
# results are not shown.
# NOTE(review): get_feature_names() was removed in later scikit-learn
# releases in favour of get_feature_names_out() — confirm installed version.
bigram_vectorizer.get_feature_names()
bigram_vectorizer.get_stop_words()
bigram_vectorizer.get_params()
bigram_vectorizer.encoding

'utf-8'

In [53]:
from sklearn.feature_extraction.text import HashingVectorizer
# Map the sample documents into a fixed 50-column feature space
# (n_features=50) using the hashing vectorizer.
hashing_vectorizer = HashingVectorizer(n_features=50)
hashing_dtm = hashing_vectorizer.fit_transform(data_samples)
hashing_dtm

<10x50 sparse matrix of type '<class 'numpy.float64'>'
	with 393 stored elements in Compressed Sparse Row format>

In [54]:
# Dense view of the hashed feature matrix (10 documents x 50 columns).
hashing_dtm.toarray()

array([[ 0.        ,  0.0805823 , -0.04029115,  0.04029115, -0.04029115,
        -0.12087344,  0.        ,  0.04029115, -0.68494952,  0.12087344,
         0.        , -0.12087344, -0.04029115, -0.04029115, -0.04029115,
        -0.0805823 ,  0.        , -0.04029115,  0.12087344, -0.16116459,
        -0.04029115,  0.        ,  0.        ,  0.12087344, -0.04029115,
         0.04029115,  0.12087344,  0.40291148,  0.04029115,  0.12087344,
         0.        ,  0.        , -0.04029115,  0.04029115, -0.04029115,
        -0.04029115,  0.04029115,  0.        ,  0.12087344,  0.04029115,
         0.0805823 ,  0.24174689, -0.04029115, -0.04029115,  0.04029115,
        -0.28203804,  0.16116459, -0.04029115,  0.        ,  0.0805823 ],
       [-0.07905694,  0.07905694, -0.07905694,  0.07905694,  0.        ,
        -0.15811388,  0.07905694,  0.07905694, -0.07905694,  0.07905694,
         0.07905694,  0.        ,  0.07905694, -0.23717082, -0.07905694,
        -0.07905694,  0.        ,  0.        ,  0.

In [55]:
# Records mixing a categorical field ('city') with a numeric one
# ('temperature'), to be turned into a numeric feature matrix.
measurements = [{'city': 'Dubai', 'temperature': 33.},
                {'city': 'London', 'temperature': 12.},
                {'city': 'San Fransisco', 'temperature': 18.}]
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)  # optional argument: whether to produce a scipy.sparse matrix
measurements_vec = vec.fit_transform(measurements)
measurements_vec

array([[  1.,   0.,   0.,  33.],
       [  0.,   1.,   0.,  12.],
       [  0.,   0.,   1.,  18.]])

In [57]:
# NOTE: `vec.vocabulary_` is evaluated but not displayed; only the result of
# the final expression (the records rebuilt from the matrix) is shown.
vec.vocabulary_
vec.inverse_transform(measurements_vec)

[{'city=Dubai': 1.0, 'temperature': 33.0},
 {'city=London': 1.0, 'temperature': 12.0},
 {'city=San Fransisco': 1.0, 'temperature': 18.0}]

In [45]:
# Term-count dictionaries, one per document. Calling fit_transform again
# re-fits the existing `vec` on these records.
D = [{'text': 1, 'mining': 2}, {'Python': 3, 'text': 1}]
X3 = vec.fit_transform(D)
X3

array([[ 0.,  2.,  1.],
       [ 3.,  0.,  1.]])

In [46]:
# Mapping from feature name to its column index in the fitted matrix.
vec.vocabulary_

{'Python': 0, 'mining': 1, 'text': 2}

In [47]:
# Rebuild the per-document dictionaries from the numeric matrix.
vec.inverse_transform(X3)

[{'mining': 2.0, 'text': 1.0}, {'Python': 3.0, 'text': 1.0}]

In [58]:
import gensim
from gensim import corpora

# Punctuation-free variant of the toy corpus (rebinds `corpus`).
corpus = [
    'This is a text mining book',
    'Is this a text mining book',
    'Text mining with python',
]

# Lower-case each document and split it on whitespace into a token list.
texts = [document.lower().split() for document in corpus]

# Build the gensim token dictionary and print its three views in turn.
dictionary = corpora.Dictionary(texts)
for dictionary_view in (dictionary.keys, dictionary.values, dictionary.items):
    print(dictionary_view())

ImportError: No module named 'gensim'

In [None]:
import gensim

ImportError: No module named 'gensim'

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# TF-IDF re-weighting with default settings; displaying the estimator shows
# the parameters in effect.
transformer = TfidfTransformer()
transformer

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [None]:
# Hand-written term-count matrix: 4 documents (rows) x 3 terms (columns).
tf = [[4, 0, 0],
      [3, 2, 0],
      [3, 0, 0],
      [3, 0, 2]]
# Convert the raw counts into TF-IDF weights (returned as a sparse matrix).
tfidf = transformer.fit_transform(tf)
tfidf

<4x3 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [None]:
# Dense view of the TF-IDF weights.
tfidf.toarray()

array([[ 1.        ,  0.        ,  0.        ],
       [ 0.61638324,  0.78744632,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.61638324,  0.        ,  0.78744632]])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer goes from raw text straight to a TF-IDF matrix in one
# estimator; applied here to the ten sample documents.
vectorizerTfidf= TfidfVectorizer()
data_samples_tfidf = vectorizerTfidf.fit_transform(data_samples)
data_samples_tfidf.toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [None]:
# Vocabulary learned by the TF-IDF vectorizer (the full list is displayed).
# NOTE(review): get_feature_names() was removed in later scikit-learn
# releases in favour of get_feature_names_out() — confirm installed version.
vectorizerTfidf.get_feature_names()

['0fhmt',
 '0jf',
 '0kbfkp',
 '0lzi3',
 '0pxve0b',
 '0qhh',
 '0rhj',
 '0rn',
 '0rr',
 '0s9',
 '0sc3',
 '10',
 '100',
 '1000',
 '10padk',
 '150',
 '18',
 '1f0rg',
 '1i',
 '1mci',
 '1r',
 '20',
 '200',
 '21z',
 '25',
 '2ba2',
 '2fp',
 '2gl',
 '2h5b8z',
 '2p',
 '2tpmif',
 '2vir',
 '300k',
 '32k',
 '33',
 '338083e',
 '339',
 '3c',
 '3frchypk',
 '3lt',
 '3qpjiteram0',
 '40',
 '4a3',
 '4bq',
 '4cpjy0',
 '4qb9c5ke1h',
 '4vl',
 '58pcd5emb',
 '5bi',
 '5el3',
 '5kajhr1efqr',
 '5p',
 '5pzk8',
 '64',
 '64k',
 '64kb',
 '6h0ajcy1br',
 '6mrm',
 '6ndih',
 '6t8d',
 '6v',
 '8m8qpq2rad',
 '9cc3',
 '9hy46',
 '9k',
 '9kpdgj1',
 '9l0a',
 '9ln',
 '9me',
 '9r',
 '9u',
 '_jesus',
 'a13x',
 'a2',
 'a6',
 'aacvkc',
 'aalm336chgur',
 'abfb',
 'about',
 'absolutely',
 'accept',
 'accepted',
 'acts',
 'actual',
 'actually',
 'admicj',
 'after',
 'air',
 'aj',
 'al8xrhjf',
 'all',
 'allowed',
 'also',
 'alt',
 'although',
 'always',
 'am',
 'amazed',
 'an',
 'and',
 'any',
 'anyone',
 'anything',
 'anyway',
 'ap',
 

In [None]:
import sklearn
from sklearn.datasets import fetch_20newsgroups

# Reload the corpus restricted to a single newsgroup; this rebinds `dataset`,
# replacing the full corpus loaded earlier in the notebook.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
    categories=['rec.sport.baseball'],
)
len(dataset.data)

597

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Count matrix for the single-newsgroup corpus with the built-in English
# stop-word list removed. Rebinds `dtm_vectorizer` and `dtm`; `dtm` is now a
# dense array because of .toarray().
dtm_vectorizer = CountVectorizer(stop_words="english")
dtm = dtm_vectorizer.fit_transform(dataset.data).toarray()
dtm

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the count matrix `dtm` into TF-IDF and densify the result.
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(dtm).toarray()
tfidf

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Compute TF-IDF directly from the raw documents. No stop-word list is passed
# to this vectorizer, and the result is bound to `tfidf` as a dense array.
vectorizerTfidf = TfidfVectorizer()
tfidf = vectorizerTfidf.fit_transform(dataset.data).toarray()
tfidf

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [None]:
import nltk
# Fetch only the NLTK resources this notebook relies on (the stop-word lists
# and the tokenizer models) instead of calling nltk.download() with no
# arguments, which opens the interactive downloader and blocks an unattended
# "Restart & Run All".
# NOTE(review): recent NLTK releases may need "punkt_tab" rather than "punkt"
# for tokenization — confirm against the installed version.
nltk.download("stopwords")
nltk.download("punkt")

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [None]:
import nltk
# nltk.download()
# word_tokenize must be imported explicitly: it is called below, and without
# this import the cell fails with a NameError.
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# English stop words and punctuation tokens to be filtered out later.
english_stopwords = stopwords.words("english")
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '!', '@',
                        '#', '%', '$', '*']

# Tokenize every document, then lower-case every token.
# Requires `dataset` to have been loaded by an earlier cell.
words = [word_tokenize(t) for t in dataset.data]
words_lower = [[j.lower() for j in i] for i in words]

print(words_lower)

NameError: name 'dataset' is not defined

In [None]:
# Remove stop words and punctuation tokens from every tokenized document.
# Both exclusion lists are merged into one set so each membership test is a
# hash lookup rather than a linear scan of two lists for every token.
excluded_tokens = set(english_stopwords) | set(english_punctuations)
word_clear = [
    [token for token in document if token not in excluded_tokens]
    for document in words_lower
]
word_clear

In [None]:
import  gensim
# Train a Word2Vec model on the filtered token lists.
# NOTE(review): the `size=` keyword and the `model.vocab` attribute match
# older gensim releases; newer releases appear to have renamed both —
# confirm against the installed gensim version before running.
model = gensim.models.Word2Vec(word_clear,size=100,window=5,min_count=5)
# Distinct tokens the model kept in its vocabulary.
set(model.vocab.keys())

In [None]:
# Look up the entry stored for the token 'sorry' in the trained model.
# NOTE(review): newer gensim releases expose this lookup through `model.wv`
# — confirm against the installed version.
model['sorry']

In [None]:
import jieba

# Read the raw title file as UTF-8 text. Under Python 3, text-mode reads
# already return `str`, so no .decode() call is needed (or available).
# TODO: the input path was left empty in the notebook; set it to the
# "|"-delimited titles file before running this cell.
with open("", encoding="utf-8") as f:
    read = f.readlines()

# Take the second "|"-separated field of each of the first 500 lines.
title = [i.split("|")[1] for i in read[0:500]]

# One stop word per line; a set gives constant-time membership tests below.
with open("stopwords.txt", encoding="utf-8") as f:
    read = f.read()
stop_words = read.splitlines()
stop_word_set = set(stop_words)

# Segment every title with jieba and drop the stop words.
texts = [
    [seg for seg in jieba.cut(i) if seg not in stop_word_set]
    for i in title
]
texts

In [None]:
import gensim
from gensim import corpora
# Build a token dictionary from the segmented titles, then convert each
# title to its bag-of-words representation via doc2bow.
dictionary = corpora.Dictionary(texts)
word_count = [dictionary.doc2bow(text) for text in texts]

In [None]:
import nltk
import gensim
from nltk.corpus import brown

# Lower-case every token of every sentence in the Brown corpus.
sentences = [
    [token.lower() for token in sentence]
    for sentence in brown.sents()
]
sentences

ImportError: No module named 'gensim'

In [None]:
import  sys
# Record the interpreter version the notebook was executed with.
sys.version

'3.5.2 |Anaconda 4.2.0 (64-bit)| (default, Jul  5 2016, 11:41:13) [MSC v.1900 64 bit (AMD64)]'