In [1]:
# Vocabulary size: passed to the Keras Tokenizer as `num_words` and reused as
# the input width of the dense network and the Embedding layer further down.
max_words = 500

In [2]:
import json
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords


# NLTK's German stop-word list. DatasetLoader.clean_tweets builds its own local
# list under the same name, so the visible code does not read this global.
german_stop_words = stopwords.words('german')
class FeatureProcessor:
    """Fit a Keras ``Tokenizer`` on a tweet corpus and build count vectors.

    Attributes:
        tweets: the corpus the tokenizer was fitted on.
        tokenizer: ``Tokenizer`` created with ``num_words=max_words``
            (``max_words`` is the notebook-level setting).
        dictionary: the tokenizer's ``word_index`` (word -> integer id).
    """

    def __init__(self, tweets):
        """Fit the tokenizer on ``tweets`` and keep its word -> id mapping.

        Args:
            tweets: the texts handed to ``Tokenizer.fit_on_texts``.
        """
        self.tweets = tweets
        self.tokenizer = Tokenizer(num_words=max_words)
        # Build the vocabulary from the tweets.
        self.tokenizer.fit_on_texts(tweets)

        # The fitted tokenizer exposes its vocabulary as a word -> id dict.
        self.dictionary = self.tokenizer.word_index

    def convert_text_to_index_array(self, tweet):
        """Return the vocabulary ids of the words in ``tweet``.

        The tweet is split with ``text_to_word_sequence`` and every word is
        looked up in ``self.dictionary``, so the result has one entry per
        known word and its length differs from tweet to tweet. Words that are
        not in the fitted vocabulary are skipped instead of raising
        ``KeyError``.

        Args:
            tweet: a single text.

        Returns:
            list of integer word ids.
        """
        word_ids = []
        for word in kpt.text_to_word_sequence(tweet):
            word_id = self.dictionary.get(word)
            if word_id is not None:
                word_ids.append(word_id)
        return word_ids

    def get_tokenized_tweets(self, tweets=None):
        """Turn tweets into a word-count matrix.

        Args:
            tweets: texts to convert. Defaults to the corpus the tokenizer
                was fitted on; pass other texts to encode them with the same
                vocabulary.

        Returns:
            The result of ``Tokenizer.sequences_to_matrix(..., mode='count')``
            for the tweets, i.e. per-tweet word counts (not one-hot vectors).
        """
        if tweets is None:
            tweets = self.tweets

        # The index lists have different lengths, so they stay a plain list of
        # lists: wrapping them in np.asarray would build a ragged array, which
        # recent NumPy versions refuse to create.
        index_lists = [self.convert_text_to_index_array(tweet) for tweet in tweets]

        return self.tokenizer.sequences_to_matrix(index_lists, mode='count')

In [3]:
import pandas as pd 
import numpy as np
import string
import preprocessor as p
import re
from nltk.stem import SnowballStemmer

class DatasetLoader:
    """Load a tweet CSV, clean the tweet text and build count-vector features.

    Column layout read from the CSV:
        * ``mode='train'``: column 0 = id, columns 1-2 = the two numeric
          labels, column 3 = tweet text.
        * ``mode='test'``: column 0 = id, column 1 = tweet text (no labels).

    Attributes:
        data: the whole CSV as a NumPy array.
        ids: column 0 of ``data``.
        labels: float32 array with the two label columns, or ``None`` in
            test mode.
        features: array of cleaned tweet strings.
        feature_processor: the ``FeatureProcessor`` whose tokenizer produced
            ``preprocessed_features``.
        preprocessed_features: float32 word-count matrix of the tweets.
    """

    def __init__(self, csv_file_path, mode = 'train', feature_processor=None):
        """Read ``csv_file_path`` and prepare ids, labels and features.

        Args:
            csv_file_path: path of the CSV file, passed to ``pandas.read_csv``.
            mode: ``'train'`` for labelled files, ``'test'`` for unlabelled ones.
            feature_processor: optional, already fitted ``FeatureProcessor``.
                When given, its tokenizer encodes this file's tweets, so the
                feature columns match the ones of the data it was fitted on
                (e.g. pass ``train_loader.feature_processor`` for validation
                and test files). When omitted, a new processor is fitted on
                this file's tweets, as before.

        Raises:
            ValueError: if ``mode`` is neither ``'train'`` nor ``'test'``.
        """
        # Validate first so a typo fails clearly and before any file access.
        if mode not in ('train', 'test'):
            raise ValueError(
                "mode must be 'train' or 'test', got {!r}".format(mode))

        data_frame = pd.read_csv(csv_file_path)
        self.data = np.array(data_frame)
        self.ids = self.data[:,0]
        if mode == 'train':
            self.labels = np.asarray(self.data[:,1:3]).astype('float32')
            self.features = self.clean_tweets(self.data[:,3])
        else:
            # Unlabelled file: keep the attribute defined for get_labels().
            self.labels = None
            self.features = self.clean_tweets(self.data[:,1])
        # Show the first cleaned tweet as a quick sanity check.
        print(self.features[0])

        if feature_processor is None:
            feature_processor = FeatureProcessor(self.features)
            feature_matrix = feature_processor.get_tokenized_tweets()
        else:
            # Reuse the supplied vocabulary instead of fitting a new one.
            feature_matrix = feature_processor.tokenizer.texts_to_matrix(
                self.features, mode='count')
        self.feature_processor = feature_processor
        self.preprocessed_features = np.asarray(feature_matrix).astype('float32')

    def clean_tweets(self,tweets):
        """Normalise raw tweets into space-joined, stemmed tokens.

        Each tweet is lower-cased, stripped of the characters ``!?,.:";``,
        run through ``p.tokenize`` (from the ``preprocessor`` package --
        presumably replaces URLs, emojis, numbers etc. with placeholder
        words; confirm), split into words, filtered against NLTK's German
        stop words and stemmed with the German Snowball stemmer.

        Args:
            tweets: iterable of tweet strings.

        Returns:
            NumPy array with one cleaned string per input tweet.
        """
        stemmer = SnowballStemmer("german")
        # A set gives constant-time membership tests inside the word loop.
        german_stop_words = set(stopwords.words('german'))
        new_tweets = []
        for tweet in tweets:
            tweet = tweet.lower()
            tweet = re.sub('[!?,.:";]', '', tweet)
            tweet = p.tokenize(tweet)
            cleaned_tweet = [
                stemmer.stem(word)
                for word in kpt.text_to_word_sequence(tweet)
                if word not in german_stop_words
            ]
            new_tweets.append(" ".join(cleaned_tweet))

        return np.array(new_tweets)

    def get_data_set(self):
        """Return the raw CSV content as a NumPy array."""
        return self.data

    def get_ids(self):
        """Return the id column."""
        return self.ids

    def get_labels(self):
        """Return the float32 label array (``None`` in test mode)."""
        return self.labels

    def get_features(self):
        """Return the cleaned tweet strings."""
        return self.features

    def get_preprocessed_features(self):
        """Return the float32 word-count matrix of the tweets."""
        return self.preprocessed_features
        

In [4]:
# Build one loader per CSV split. Each constructor call reads its file, cleans
# the tweets and prints the first cleaned tweet.
# NOTE(review): every DatasetLoader fits its own Tokenizer, so the count-vector
# columns of the three splits may not refer to the same words -- confirm.
train_loader = DatasetLoader('training.csv')
validation_loader = DatasetLoader('validation.csv')
# The name `test_loadet` is kept as-is because later cells refer to it.
test_loadet = DatasetLoader('test.csv', mode='test')

# Report the shapes of cleaned tweets, labels and ids for the labelled splits.
for labelled_loader in (train_loader, validation_loader):
    print(labelled_loader.get_features().shape)
    print(labelled_loader.get_labels().shape)
    print(labelled_loader.get_ids().shape)


seit d vas kaputt gang bringt numb jahr unglck antwortet de spiegel isch gar nt kaputt gang bringt numb jahr pech dadruf fangt s kondom a lach s'atomkraftwerk au emoji uuuh nd schlecht emoji seisch aner frau isch fett hesch dis leb lang unglck au i fall nm lang lebsch
mer aner party bi kolleg en neu bro findt ht sogar dr vo dne emoji
emoji min vibi funktionkert nd emoji hesch d'batteri dri gsteckt emoji ja isch nd sgliich
(22583,)
(22583, 2)
(22583,)
(3044,)
(3044, 2)
(3044,)


In [5]:
from sklearn.metrics import mean_squared_error
import csv 
class DataSetEvaluator:
    """Score fitted models on a dataset loader and export their predictions."""

    def calculateMSE(self, model, dataSetLoader):
        """Return the mean squared error of ``model`` on a labelled loader.

        Args:
            model: object with a ``predict`` method.
            dataSetLoader: loader providing ``get_preprocessed_features()``
                and ``get_labels()``.

        Returns:
            The value of ``sklearn.metrics.mean_squared_error`` between the
            loader's labels and the model's predictions. The first prediction
            is printed as a quick sanity check.
        """
        y_pred = model.predict(dataSetLoader.get_preprocessed_features())
        print(y_pred[0])
        return mean_squared_error(dataSetLoader.get_labels(), y_pred)

    def generateEvaluationFile(self, model, dataSetLoader, file_path='submision1.txt'):
        """Write ``id, prediction[0], prediction[1]`` rows to a CSV file.

        Args:
            model: object with a ``predict`` method.
            dataSetLoader: loader providing ``get_preprocessed_features()``
                and ``get_ids()``.
            file_path: destination file; defaults to the file name this
                method has always written to. An existing file is overwritten.

        Every field is quoted (``csv.QUOTE_ALL``). The first prediction is
        printed as a quick sanity check.
        """
        y_pred = model.predict(dataSetLoader.get_preprocessed_features())
        print(y_pred[0])
        ids = dataSetLoader.get_ids()
        # `with` closes (and flushes) the file even if writing fails;
        # newline='' lets the csv module control line endings, which avoids
        # blank lines between rows on Windows.
        with open(file_path, "w", newline='') as file:
            writes = csv.writer(file, delimiter=',', quoting=csv.QUOTE_ALL)
            for index, row in enumerate(y_pred):
                writes.writerow([ids[index], row[0], row[1]])
        
    

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
# Kernel ridge regression with an RBF kernel and a small regularisation strength.
# NOTE(review): this cell has no execution count and its evaluation call is
# commented out further down; kernel ridge works on an n_samples x n_samples
# kernel matrix, which is presumably too heavy for the full training set -- confirm.
alpha = 10 ** -4
kernel = "rbf"
# Hyper-parameters are passed by keyword: current scikit-learn releases accept
# `kernel` only as a keyword argument, so the positional form fails there.
KRmodel = KernelRidge(alpha=alpha, kernel=kernel)
KRmodel.fit(train_loader.get_preprocessed_features(),train_loader.get_labels())



In [6]:
from sklearn.linear_model import LinearRegression
# Baseline: ordinary least squares fitted on the training count vectors,
# predicting both label columns at once.
LinearRegressionModel = LinearRegression()
LinearRegressionModel.fit(
    X=train_loader.get_preprocessed_features(),
    y=train_loader.get_labels(),
)

LinearRegression()

In [8]:
from sklearn.multioutput import RegressorChain
from sklearn.linear_model import BayesianRidge
# Bayesian ridge regression, wrapped in a RegressorChain so it can be fitted
# on the two-column label array.
BayesModel = BayesianRidge()
BayesModelWrapper = RegressorChain(BayesModel)
BayesModelWrapper.fit(
    train_loader.get_preprocessed_features(),
    train_loader.get_labels(),
)

RegressorChain(base_estimator=BayesianRidge())

In [200]:
# Export the linear-regression predictions for the test split.
# generateEvaluationFile always writes to the same file name, so a later
# export overwrites this one.
evaluator = DataSetEvaluator()
evaluator.generateEvaluationFile(
    model=LinearRegressionModel,
    dataSetLoader=test_loadet,
)

[51.603798  9.399656]


In [9]:
from sklearn.multioutput import RegressorChain
from sklearn.svm import LinearSVR

# Linear support-vector regressor with default settings.
LinearSvrModel = LinearSVR()
# Chain wrapper so the regressor can be fitted on the two-column label array.
LinearSvrWrapper = RegressorChain(LinearSvrModel)
# Train on the complete training split.
LinearSvrWrapper.fit(
    train_loader.get_preprocessed_features(),
    train_loader.get_labels(),
)



RegressorChain(base_estimator=LinearSVR())

In [10]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

# Feed-forward regressor over the count vectors: two hidden layers, each
# followed by 50% dropout, and a two-unit linear output (one unit per label).
modelNN = Sequential([
    Dense(512, input_shape=(max_words,), activation='relu'),
    Dropout(0.5),
    Dense(256, activation='sigmoid'),
    Dropout(0.5),
    Dense(2),
])
modelNN.summary()
modelNN.compile(loss='mean_squared_error', optimizer='adam')

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               256512    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 514       
Total params: 388,354
Trainable params: 388,354
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Train the dense network for 10 epochs in mini-batches of 32, with 10% of the
# training rows used for validation (validation_split).
modelNN.fit(
    x=train_loader.get_preprocessed_features(),
    y=train_loader.get_labels(),
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_split=0.1,
    shuffle=True,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x282858206d0>

In [10]:
# Export the dense network's predictions for the test split. This writes to
# the same file name as any earlier export and therefore replaces it.
evaluator = DataSetEvaluator()
evaluator.generateEvaluationFile(
    model=modelNN,
    dataSetLoader=test_loadet,
)

[51.385906  9.545205]


In [12]:
# Validation MSE of every fitted model, in this order: linear regression,
# linear SVR chain, dense network, Bayesian ridge chain. calculateMSE also
# prints each model's first prediction before the score.
evaluator = DataSetEvaluator()
for fitted_model in (LinearRegressionModel, LinearSvrWrapper, modelNN, BayesModelWrapper):
    print(evaluator.calculateMSE(fitted_model, validation_loader))
# The kernel ridge model is left out of the comparison:
# print(evaluator.calculateMSE(KRmodel, validation_loader))


[51.931564  9.496623]
1.1855316
[52.02251294  9.85830693]
1.2740941188578512
[51.905483  9.606482]
1.180663
[51.89754052  9.49469686]
1.1586700466671107


In [11]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense,SimpleRNN

# Recurrent regressor: 32-dimensional embedding -> SimpleRNN(32) -> two outputs.
# NOTE(review): the inputs here are the per-tweet word-count vectors, while an
# Embedding layer looks up integer token ids; this probably needs padded index
# sequences instead -- confirm.
modelEE = Sequential([
    Embedding(max_words, 32),
    SimpleRNN(32),
    Dense(2),
])
modelEE.compile(loss='mean_squared_error', optimizer='adam')
# Train for 5 epochs and keep the History object returned by fit().
history = modelEE.fit(
    x=train_loader.get_preprocessed_features(),
    y=train_loader.get_labels(),
    batch_size=32,
    epochs=5,
    verbose=1,
    validation_split=0.1,
    shuffle=True,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
# Validation MSE of the recurrent model (its first prediction is printed first).
evaluator = DataSetEvaluator()
print(evaluator.calculateMSE(model=modelEE, dataSetLoader=validation_loader))

[51.604973  9.42761 ]
1.2109075


[51.89754052  9.49469686]
1.1586700466671107
