In [75]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
class DatasetLoader:
    """Load tweet CSV splits and convert the tweet text into TF-IDF features.

    ``load_train_data`` fits the count vectorizer and the TF-IDF transformer.
    ``load_validation_data`` and ``load_test_data`` only apply those fitted
    models, so the training split has to be loaded first.

    Columns are read by position:
      * train / validation files: column 0 = id, columns 1-2 = the two
        numeric labels, column 3 = tweet text;
      * test files: column 0 = id, column 1 = tweet text.
    """

    def __init__(self):
        # Preprocessing models for the tweets: binary bag-of-words over
        # lower-cased unigrams with German stop words removed, followed by a
        # TF-IDF re-weighting.
        self.count_vectorizer = CountVectorizer(
            encoding='utf-8',
            lowercase=True,
            stop_words=stopwords.words('german'),
            ngram_range=(1, 1),
            max_df=0.5,
            # An integer min_df is an absolute document count. 1 keeps every
            # term exactly like the former 0 did (a vocabulary term always
            # occurs in at least one document), but unlike 0 it is accepted
            # by the parameter validation of recent scikit-learn releases.
            min_df=1,
            binary=True,
            max_features=20000,
        )
        self.tfidf_transformer = TfidfTransformer(
            norm='l2',
            use_idf=True,
            smooth_idf=True,
            sublinear_tf=True,
        )

    @staticmethod
    def _read_rows(csv_file_path):
        """Read ``csv_file_path`` and return its rows as a 2-D NumPy array."""
        return np.array(pd.read_csv(csv_file_path))

    def _to_features(self, texts):
        """Apply the already fitted vectorizer and TF-IDF transformer to ``texts``."""
        return self.tfidf_transformer.transform(self.count_vectorizer.transform(texts))

    def load_train_data(self, csv_file_path):
        """Load the training split and fit both preprocessing models on it.

        Sets ``train_ids``, ``train_labels`` (float32, two columns) and
        ``train_processed_tweets``.
        """
        data = self._read_rows(csv_file_path)
        self.train_ids = data[:, 0]
        self.train_labels = np.asarray(data[:, 1:3]).astype('float32')
        term_counts = self.count_vectorizer.fit_transform(data[:, 3])
        self.train_processed_tweets = self.tfidf_transformer.fit_transform(term_counts)

    def load_test_data(self, csv_file_path):
        """Load the unlabeled test split.

        Sets ``test_ids`` and ``test_processed_tweets``. The tweet text is
        taken from column 1 because test files carry no label columns.
        """
        data = self._read_rows(csv_file_path)
        self.test_ids = data[:, 0]
        self.test_processed_tweets = self._to_features(data[:, 1])

    def load_validation_data(self, csv_file_path):
        """Load the labeled validation split.

        Sets ``validation_ids``, ``validation_labels`` (float32, two columns)
        and ``validation_processed_tweets``.
        """
        data = self._read_rows(csv_file_path)
        self.validation_ids = data[:, 0]
        self.validation_labels = np.asarray(data[:, 1:3]).astype('float32')
        self.validation_processed_tweets = self._to_features(data[:, 3])
    
    

In [76]:
# Load the training, validation and test data.
loader = DatasetLoader()

loader.load_train_data('training.csv')
print(loader.train_processed_tweets.shape, loader.train_labels.shape, sep='\n')

loader.load_validation_data('validation.csv')
print(loader.validation_processed_tweets.shape, loader.validation_labels.shape, sep='\n')

loader.load_test_data('test.csv')



(22583, 20000)
(22583, 2)
(3044, 20000)
(3044, 2)


In [77]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
import csv 
class DataSetEvaluator:
    """Score a fitted model and export its predictions to a CSV file.

    Every method calls ``model.predict`` itself and prints the first
    prediction as a quick sanity check before doing its actual work.
    """

    def calculateMSE(self, model, labels, processed_features):
        """Return the mean squared error of ``model`` on ``processed_features``.

        ``labels`` are the ground-truth targets the predictions are compared
        against.
        """
        y_pred = model.predict(processed_features)
        print(y_pred[0])
        return mean_squared_error(labels, y_pred)

    def calculateMAE(self, model, labels, processed_features):
        """Return the mean absolute error of ``model`` on ``processed_features``.

        ``labels`` are the ground-truth targets the predictions are compared
        against.
        """
        y_pred = model.predict(processed_features)
        print(y_pred[0])
        return mean_absolute_error(labels, y_pred)

    def generateEvaluationFile(self, model, ids, processed_features, filename):
        """Write one fully quoted CSV row ``id, pred[0], pred[1]`` per prediction.

        ``ids`` is indexed by prediction position, so it must hold at least as
        many entries as ``model.predict`` returns rows. ``filename`` is
        overwritten if it already exists.
        """
        y_pred = model.predict(processed_features)
        print(y_pred[0])
        # The with-block guarantees the file is flushed and closed even if a
        # row fails to serialize. newline='' is what the csv module requires
        # so that its own "\r\n" terminator is not translated a second time
        # (which would produce blank lines on Windows).
        with open(filename, "w", newline='') as file:
            writer = csv.writer(file, delimiter=',', quoting=csv.QUOTE_ALL)
            for index, row in enumerate(y_pred):
                writer.writerow([ids[index], row[0], row[1]])
        
    

In [78]:
from sklearn.multioutput import RegressorChain, MultiOutputRegressor
from sklearn.svm import LinearSVR
# Base model definition.
LinearSvrModel = LinearSVR(
    loss='squared_epsilon_insensitive',
    C=0.5,
    random_state=1242,
)
# Build a chain of models so that a two-dimensional output is produced.
LinearSvrWrapper = RegressorChain(LinearSvrModel)
LinearSvrWrapper.fit(
    loader.train_processed_tweets,
    loader.train_labels,
)




RegressorChain(base_estimator=LinearSVR(C=0.5,
                                        loss='squared_epsilon_insensitive',
                                        random_state=1242))

In [82]:
# Report both validation metrics, then write the test-set predictions to disk.
evaluator = DataSetEvaluator()
print(
    'MSE : '
    + str(
        evaluator.calculateMSE(
            model=LinearSvrWrapper,
            labels=loader.validation_labels,
            processed_features=loader.validation_processed_tweets,
        )
    )
)
print(
    'MAE : '
    + str(
        evaluator.calculateMAE(
            model=LinearSvrWrapper,
            labels=loader.validation_labels,
            processed_features=loader.validation_processed_tweets,
        )
    )
)
evaluator.generateEvaluationFile(
    model=LinearSvrWrapper,
    ids=loader.test_ids,
    processed_features=loader.test_processed_tweets,
    filename='linearSvrSubmission3.txt',
)

[51.94081123  9.76800256]
MSE : 0.5930440608413687
[51.94081123  9.76800256]
MAE : 0.5863074767024781
[51.79599767 10.53477428]


In [80]:
import pickle
# Persist the fitted model. The with-block closes the file handle as soon as
# the dump finishes instead of leaving it open for the rest of the session.
with open('best_svr_wrapper_model.sav', 'wb') as model_file:
    pickle.dump(LinearSvrWrapper, model_file)