In [1]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
class DatasetLoader:
    def __init__(self):
        self.count_vectorizer = CountVectorizer(encoding='utf-8', stop_words=stopwords.words('german'),max_df=0.5, min_df=0, binary = True, max_features = 20000)
        self.tfidf_transformer  = TfidfTransformer( norm='l2',use_idf=True, smooth_idf=True, sublinear_tf=True)
    def load_train_data(self,csv_file_path):
        data_frame = pd.read_csv(csv_file_path)
        data = np.array(data_frame)
        self.train_ids = data[:,0]
        self.train_labels = np.asarray(data[:,1:3]).astype('float32')
        self.train_processed_tweets = self.count_vectorizer.fit_transform(data[:,3])
        self.train_processed_tweets = self.tfidf_transformer.fit_transform(self.train_processed_tweets)
    def load_test_data(self,csv_file_path):
        data_frame = pd.read_csv(csv_file_path)
        data = np.array(data_frame)
        self.test_ids = data[:,0]
        self.test_processed_tweets = self.tfidf_transformer.transform(self.count_vectorizer.transform(data[:,1]))
        
    def load_validation_data(self, csv_file_path):
        data_frame = pd.read_csv(csv_file_path)
        data = np.array(data_frame)
        self.validation_ids = data[:,0]
        self.validation_labels = np.asarray(data[:,1:3]).astype('float32')
        self.validation_processed_tweets = self.tfidf_transformer.transform(self.count_vectorizer.transform(data[:,3]))
    
    

In [2]:
loader = DatasetLoader();
loader.load_train_data('training.csv')
print(loader.train_processed_tweets.shape)
print(loader.train_labels.shape)

loader.load_validation_data('validation.csv')
print(loader.validation_processed_tweets.shape)
print(loader.validation_labels.shape)
loader.load_test_data('test.csv')



(22583, 20000)
(22583, 2)
(3044, 20000)
(3044, 2)


In [3]:
# from sklearn.linear_model.stochastic_gradient import SGDClassifier
# from sklearn.multioutput import RegressorChain

# clf = SGDClassifier(loss='log', penalty='elasticnet', alpha=5e-5, l1_ratio=0.9, fit_intercept=True, max_iter=5, n_jobs=2, random_state=0, learning_rate="optimal")
# clfwrapper = RegressorChain(clf)
# clfwrapper.fit(loader.train_processed_tweets,loader.train_labels)

In [4]:
from sklearn.metrics import mean_squared_error
import csv 
class DataSetEvaluator:
    def calculateMSE(self, model, labels, processed_features):
        y_pred = model.predict(processed_features)
        print(y_pred[0])
        return mean_squared_error(labels, y_pred)
    def generateEvaluationFile(self,model,ids, processed_features, filename):
        y_pred = model.predict(processed_features)
        print(y_pred[0])
        file=open(filename,"w")
        writes=csv.writer(file,delimiter=',',quoting=csv.QUOTE_ALL)
        count =0
        for row in y_pred:
            writes.writerow([ids[int(count)],row[0],row[1]])
            count = count + 1
        
    

In [40]:
from sklearn.multioutput import RegressorChain
from sklearn.svm import LinearSVR
LinearSvrModel = LinearSVR(C=0.5,random_state=1242,loss='squared_epsilon_insensitive')

# define the chained multioutput wrapper model
LinearSvrWrapper = RegressorChain(LinearSvrModel)
# fit the model on the whole dataset
LinearSvrWrapper.fit(loader.train_processed_tweets,loader.train_labels)



RegressorChain(base_estimator=LinearSVR(C=0.5,
                                        loss='squared_epsilon_insensitive',
                                        random_state=1242))

In [41]:
evaluator = DataSetEvaluator()
print(evaluator.calculateMSE(LinearSvrWrapper, loader.validation_labels, loader.validation_processed_tweets))
# evaluator.generateEvaluationFile(LinearSvrWrapper, loader.test_ids, loader.test_processed_tweets, 'linearSvrSubmission3.txt')

[51.93746407  9.75082132]
0.5936588929615422


In [25]:
import pickle
pickle.dump(LinearSvrWrapper, open('best_svr_wrapper_model.sav', 'wb'))

In [39]:
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import RegressorChain
from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline

parameters = {'C':[0,5,1,1.5,5],'loss': ['squared_epsilon_insensitive', 'epsilon_insensitive'],'epsilon':[0,0.1,0.2,0.5,0.3]}
base_estimator = GridSearchCV(LinearSVR(),parameters,refit=True)
clf = RegressorChain(base_estimator)
        
clf.fit(loader.train_processed_tweets,loader.train_labels)
evaluator = DataSetEvaluator()
print(evaluator.calculateMSE(clf, loader.validation_labels, loader.validation_processed_tweets))

Traceback (most recent call last):
  File "C:\Users\george.moldovan\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\george.moldovan\Anaconda3\lib\site-packages\sklearn\svm\_classes.py", line 418, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "C:\Users\george.moldovan\Anaconda3\lib\site-packages\sklearn\svm\_base.py", line 966, in _fit_liblinear
    raw_coef_, n_iter_ = liblinear.train_wrap(
  File "sklearn\svm\_liblinear.pyx", line 52, in sklearn.svm._liblinear.train_wrap
ValueError: b'C <= 0'

Traceback (most recent call last):
  File "C:\Users\george.moldovan\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\george.moldovan\Anaconda3\lib\site-packages\sklearn\svm\_classes.py", line 418, in fit
    self.coef_, self.intercept_, 











[52.03258954  8.71261937]
1.077506237600133




In [7]:
from sklearn.linear_model import LinearRegression
LinearRegressionModel = LinearRegression()
LinearRegressionModel.fit(loader.train_processed_tweets,loader.train_labels)

LinearRegression()

In [8]:
evaluator = DataSetEvaluator()
evaluator.calculateMSE(LinearRegressionModel, loader.validation_labels, loader.validation_processed_tweets)

[51.57941678 11.84049659]


4.962317541659444

In [9]:
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Activation

# modelNN = Sequential()
# modelNN.add(Dense(512, input_shape=(20000,), activation='relu'))
# modelNN.add(Dropout(0.5))
# modelNN.add(Dense(256, activation='sigmoid'))
# modelNN.add(Dropout(0.5))
# modelNN.add(Dense(2))
# modelNN.summary()
# modelNN.compile(loss='mean_squared_error', optimizer='adam')

In [10]:
# modelNN.fit(loader.train_processed_tweets,loader.train_labels,
#   batch_size=32,
#   epochs=10,
#   verbose=1,
#   validation_data=np.c_(loader.validation_processed_tweets,loader.validation_labels),
#   shuffle=True)