In [5]:
# https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

## 1.) Dataset preparation

In [15]:
# load the dataset
data = open('corpus').read()

#print(data[1:200])
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split()
    #print(content[1])
    labels.append(content[0])
    texts.append(" ".join(content[1:]))
print(labels[1])    # Basis Datensatz und Datensatz nach Splittiing haben nicht die gleiche Reihenfolge bei Zeilen?
print(texts[1])


# create a dataframe using texts and lables
trainDF = pandas.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

trainDF.info()

__label__2
The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
text     10000 non-null object
label    10000 non-null object
dtypes: object(2)
memory usage: 156.3+ KB


In [14]:
# split the dataset into training and validation datasets 
# x= text, y= labels?? (0,1)
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])



# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

type(valid_x)
type(valid_y)
print(valid_y[3:8])


[0 1 0 0 0]


## 2.) Feature Engineering

* Count Vectors as features
* TF-IDF Vectors as features
    * Word level
    * N-Gram level
    * Character level
* Word Embeddings as features
* Text / NLP based features
* Topic Models as features

### Count Vectors as features

In [21]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

type(valid_x)
print(valid_x[2])

Amazing!: This soundtrack is my favorite music of all time, hands down. The intense sadness of "Prisoners of Fate" (which means all the more if you've played the game) and the hope in "A Distant Promise" and "Girl who Stole the Star" have been an important inspiration to me personally throughout my teen years. The higher energy tracks like "Chrono Cross ~ Time's Scar~", "Time of the Dreamwatch", and "Chronomantique" (indefinably remeniscent of Chrono Trigger) are all absolutely superb as well.This soundtrack is amazing music, probably the best of this composer's work (I haven't heard the Xenogears soundtrack, so I can't say for sure), and even if you've never played the game, it would be worth twice the price to buy it.I wish I could give it 6 stars.


### TF-IDF Vectors as features

* TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
* IDF(t) = log_e(Total number of documents / Number of documents with term t in it)


* **Word Level TF-IDF** : Matrix representing tf-idf scores of every term in different documents
* **N-gram Level TF-IDF** : N-grams are the combination of N terms together. This Matrix representing tf-idf scores of N-grams
* **Character Level TF-IDF** : Matrix representing tf-idf scores of character level n-grams in the corpus

In [23]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 



In [39]:
#type(tfidf_vect_ngram_chars)
#print(tfidf_vect.idf_)

print(tfidf_vect.idf_)
print(xtrain_tfidf)

#print(tfidf_vect_ngram.idf_)
#print(xtrain_tfidf_ngram)

#print(tfidf_vect_ngram_chars.idf_)
#print(xtrain_tfidf_ngram_chars)

[ 5.85373154  6.62692143  6.54687872 ...,  6.80924299  6.22145632
  6.77645316]
  (0, 4605)	0.167223224346
  (0, 4604)	0.161358967884
  (0, 4506)	0.186531787928
  (0, 4493)	0.102205809599
  (0, 4428)	0.0403740960037
  (0, 3881)	0.159525726254
  (0, 3692)	0.178013849358
  (0, 3545)	0.0985895389479
  (0, 3334)	0.257408291743
  (0, 3228)	0.404033584067
  (0, 2395)	0.259406120238
  (0, 2390)	0.0972523201176
  (0, 2384)	0.0517053925238
  (0, 2267)	0.11674022144
  (0, 2223)	0.141836496768
  (0, 2040)	0.174209638487
  (0, 1958)	0.145607498984
  (0, 1366)	0.226394425582
  (0, 1278)	0.305110707971
  (0, 962)	0.232036843409
  (0, 718)	0.231282110223
  (0, 717)	0.143308155782
  (0, 243)	0.0440947485395
  (0, 227)	0.152295448069
  (0, 222)	0.375094180337
  :	:
  (7498, 181)	0.101200750574
  (7498, 177)	0.0604778270579
  (7498, 161)	0.136654825227
  (7498, 92)	0.0496084344305
  (7498, 88)	0.0941801923985
  (7499, 4992)	0.148008231288
  (7499, 4735)	0.122679685506
  (7499, 4662)	0.249392893737
  (74

### Word Embeddings

A word embedding is a form of representing words and documents using a dense vector representation. The position of a word within the vector space is learned from text and is based on the words that surround the word when it is used. Word embeddings can be trained using the input corpus itself or can be generated using pre-trained word embeddings such as **Glove, FastText, and Word2Vec**. 

In [41]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open('wiki-news-300d-1M.vec')):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(trainDF['text'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)

# create token-embedding mapping
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [68]:
# https://medium.com/@Petuum/embeddings-a-matrix-of-meaning-4de877c9aa27

type(embedding_matrix)
#train_x
#train_seq_x
embedding_matrix
word_index


{'the': 1,
 'and': 2,
 'i': 3,
 'a': 4,
 'to': 5,
 'of': 6,
 'it': 7,
 'this': 8,
 'is': 9,
 'in': 10,
 'for': 11,
 'that': 12,
 'was': 13,
 'book': 14,
 'you': 15,
 'not': 16,
 'but': 17,
 'with': 18,
 'on': 19,
 'my': 20,
 'have': 21,
 'as': 22,
 'are': 23,
 'one': 24,
 'be': 25,
 'so': 26,
 'all': 27,
 'if': 28,
 'very': 29,
 'like': 30,
 'read': 31,
 'good': 32,
 'great': 33,
 'at': 34,
 'movie': 35,
 'they': 36,
 'just': 37,
 'about': 38,
 'from': 39,
 'or': 40,
 'would': 41,
 'an': 42,
 'me': 43,
 'out': 44,
 'what': 45,
 'has': 46,
 'more': 47,
 'by': 48,
 'time': 49,
 'had': 50,
 'when': 51,
 'get': 52,
 'will': 53,
 "it's": 54,
 'up': 55,
 'there': 56,
 'no': 57,
 'only': 58,
 'your': 59,
 'can': 60,
 "don't": 61,
 'his': 62,
 'really': 63,
 'who': 64,
 'some': 65,
 'he': 66,
 'well': 67,
 'first': 68,
 'her': 69,
 'much': 70,
 'than': 71,
 'even': 72,
 'do': 73,
 'story': 74,
 'because': 75,
 'them': 76,
 'other': 77,
 'after': 78,
 'buy': 79,
 'we': 80,
 'were': 81,
 'too': 

### Topic Models as features

In [70]:
# train a LDA Model
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_ 
vocab = count_vect.get_feature_names()

# view the topic models
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

In [75]:
type(topic_word)
topic_summaries

['es wealth tribe im7 dede dimensions den kino artec elite',
 'la de un y el los del que jewish organ',
 'sexual listened foot email detailed till allow award wrong incorrect',
 'tess j smith hardy term grow heck bd cornwell arms',
 '1984 orwell brother winston timeless relevant aspects big commentary spanish',
 'stargate haiku holmes meets alive yellow exceptional hound explanation acted',
 'error simpletech topics chemistry card manage alarm designs firmware poignant',
 'crawford harry junior noir jam kelly joan vcr gluten palance',
 'manson max mad account range artists mountain blah office kerouac',
 'dr intelligent cat prime labor keel diabetes gammell stephen instructor',
 'the i it and to a this is for of',
 'river tism michigan ca rounded visually naader danner vein navy',
 'ear sandler adam g4 nights powerbook william headset cake castle',
 'freud intelligence bluray economy liking users lunchbox assembled anthology neanderthal',
 'chris bebel napoleon profanity pat captain wi

## 3.) Model Building

The final step in the text classification framework is to train a classifier using the features created in the previous step. There are many different choices of machine learning models which can be used to train a final model. We will implement following different classifiers for this purpose:

    * Naive Bayes Classifier
    * Linear Classifier
    * Support Vector Machine
    * Bagging Models
    * Boosting Models
    * Shallow Neural Networks
    * Deep Neural Networks
        * Convolutional Neural Network (CNN)
        * Long Short Term Modelr (LSTM)
        * Gated Recurrent Unit (GRU)
        * Bidirectional RNN
        * Recurrent Convolutional Neural Network (RCNN)
        * Other Variants of Deep Neural Networks
