In [12]:
max_words = 500

In [13]:
import json
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer

class FeatureProcessor:
    def __init__(self, tweets):
        self.tweets = tweets
        self.tokenizer = Tokenizer(num_words=max_words)
        # feed our tweets to the Tokenizer
        self.tokenizer.fit_on_texts(tweets)

        # Tokenizers come with a convenient list of words and IDs
        self.dictionary = self.tokenizer.word_index
    
    def convert_text_to_index_array(self,tweet):
        # one really important thing that `text_to_word_sequence` does
        # is make all texts the same length -- in this case, the length
        # of the longest text in the set.
        return [self.dictionary[word] for word in kpt.text_to_word_sequence(tweet)]

    def get_tokenized_tweets(self):
        allWordIndices = []
        # for each tweet, change each token to its ID in the Tokenizer's word_index
        for tweet in self.tweets:
            wordIndices = self.convert_text_to_index_array(tweet)
            allWordIndices.append(wordIndices)

        # now we have a list of all tweets converted to index arrays.
        # cast as an array for future usage.
        allWordIndices = np.asarray(allWordIndices)

        # create one-hot matrices out of the indexed tweets
        train_x = self.tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
        return train_x

In [14]:
import pandas as pd 
import numpy as np
class DatasetLoader:
    def __init__(self, csv_file_path):
        data_frame = pd.read_csv(csv_file_path)
        self.data = np.array(data_frame)
        self.ids = self.data[:,0]
        self.labels = np.asarray(self.data[:,1:3]).astype('float32')
        self.features = self.data[:,3]
        feature_processor = FeatureProcessor(self.features)
        self.preprocessed_features = feature_processor.get_tokenized_tweets()
        self.preprocessed_features = np.asarray(self.preprocessed_features).astype('float32')
    def get_data_set(self):
        return self.data
    def get_ids(self):
        return self.ids
    def get_labels(self):
        return self.labels
    def get_features(self):
        return self.features
    def get_preprocessed_features(self):
        return self.preprocessed_features
        

In [15]:
train_loader = DatasetLoader('C:\\Users\\georg\\Documents\\PML\\training.csv')
validation_loader = DatasetLoader('C:\\Users\\georg\\Documents\\PML\\validation.csv')
print(train_loader.get_features().shape)
print(train_loader.get_labels().shape)
print(train_loader.get_ids().shape)
print(validation_loader.get_features().shape)
print(validation_loader.get_labels().shape)
print(validation_loader.get_ids().shape)


(22583,)
(22583, 2)
(22583,)
(3044,)
(3044, 2)
(3044,)


In [16]:
from sklearn.metrics import mean_squared_error
class DataSetEvaluator:
    def calculateMSE(self, model, dataSetLoader):
        y_pred = model.predict(dataSetLoader.get_preprocessed_features())
        print(y_pred[0])
        return mean_squared_error(dataSetLoader.get_labels(), y_pred)
    

In [102]:
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
alpha = 10 ** -4
kernel = "rbf"
model = KernelRidge(alpha, kernel)
model.fit(tokenized_tweets,train_loader.get_labels())



KernelRidge(alpha=0.0001, kernel='rbf')

In [42]:
from sklearn.linear_model import LinearRegression
LinearRegressionModel = LinearRegression()
LinearRegressionModel.fit(train_loader.get_preprocessed_features(),train_loader.get_labels())

LinearRegression()

In [43]:
from sklearn.multioutput import RegressorChain
from sklearn.svm import LinearSVR

LinearSvrModel = LinearSVR()
# define the chained multioutput wrapper model
LinearSvrWrapper = RegressorChain(LinearSvrModel)
# fit the model on the whole dataset
LinearSvrWrapper.fit(train_loader.get_preprocessed_features(),train_loader.get_labels())



RegressorChain(base_estimator=LinearSVR())

In [40]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

modelNN = Sequential()
modelNN.add(Dense(512, input_shape=(max_words,), activation='relu'))
modelNN.add(Dropout(0.5))
modelNN.add(Dense(256, activation='sigmoid'))
modelNN.add(Dropout(0.5))
modelNN.add(Dense(2))
modelNN.summary()
modelNN.compile(loss='mean_squared_error', optimizer='adam')

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 512)               1536512   
_________________________________________________________________
dropout_8 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_9 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 2)                 514       
Total params: 1,668,354
Trainable params: 1,668,354
Non-trainable params: 0
_________________________________________________________________


In [41]:
modelNN.fit(train_loader.get_preprocessed_features(), train_loader.get_labels(),
  batch_size=32,
  epochs=5,
  verbose=1,
  validation_split=0.1,
  shuffle=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1f957bfe4c0>

In [44]:
evaluator = DataSetEvaluator()
print(evaluator.calculateMSE(LinearRegressionModel, validation_loader))
print(evaluator.calculateMSE(LinearSvrWrapper, validation_loader))
print(evaluator.calculateMSE(modelNN, validation_loader))


[51.57597   9.856364]
1.3347499
[51.73949143 10.70975961]
1.5282538955220115
[51.327515  9.308052]
1.3197142


In [17]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense,SimpleRNN

modelEE = Sequential()
modelEE.add(Embedding(max_words, 32))
modelEE.add(SimpleRNN(32))
modelEE.add(Dense(2))
modelEE.compile(loss='mean_squared_error', optimizer='adam')
history = modelEE.fit(train_loader.get_preprocessed_features(), train_loader.get_labels(),
  batch_size=32,
  epochs=10,
  verbose=1,
  validation_split=0.1,
  shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
evaluator = DataSetEvaluator()
print(evaluator.calculateMSE(modelEE, validation_loader))

[51.69241   9.230614]
1.2452524
