In [8]:
import logging 
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error


class BasePipeLine:
    """Grid-search regression pipeline for predicting the ``redshift`` column.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data; must contain a ``redshift`` column, used as the target.
    test : pandas.DataFrame
        Rows to predict on; must contain an ``ID`` column, which is dropped
        before the frame is handed to the estimators.
    sample : pandas.DataFrame
        Reference frame with an ``ID`` column; whatever remains after dropping
        ``ID`` is used as ``y_test`` when scoring predictions.
    """

    def __init__(self, train, test, sample):
        self.train = train
        self.test = test
        self.sample = sample
        self.KNNR = KNeighborsRegressor()
        self.DTR = DecisionTreeRegressor()
        self.SVR = SVR()
        self.SGDR = SGDRegressor()
        # Only SVR is searched at the moment. Before re-enabling the full list,
        # the matching entries in ``params_dict`` must be re-enabled as well,
        # because grids are looked up by estimator class name.
        #self.algorithms = [self.KNNR, self.DTR, self.SVR, self.SGDR]
        self.algorithms = [self.SVR]
        # Parameter grids keyed by estimator class name.
        self.params_dict = {
            #'KNeighborsRegressor': {'n_neighbors':list(range(3, 11))},
            'SVR' : {'kernel': ('linear', 'rbf'), 'C':[1.5],'gamma': [1e-4],'epsilon':[0.1]},
            #'DecisionTreeRegressor' : {"min_samples_split": [10, 20], "max_depth": [2, 6]},
            #'SGDRegressor': {'alpha': 10.0 ** -np.arange(1, 3), 'penalty': ['l2'], 'learning_rate': ['optimal']}
        }

    def data_transform(self):
        """Split the stored frames into features and targets.

        Returns
        -------
        tuple
            ``(X, y, X_test, y_test)`` where ``X`` is ``train`` without the
            ``redshift`` column, ``y`` is the ``redshift`` column, ``X_test``
            is ``test`` without ``ID`` and ``y_test`` is the ndarray of
            ``sample`` values without ``ID``.
        """
        X = self.train.drop(columns=['redshift'])
        y = self.train['redshift']
        X_test = self.test.drop(columns=['ID'])
        y_test = self.sample.drop(columns=['ID']).values
        return X, y, X_test, y_test

    def grid_search_cv(self, X_train, X_test, y_train, y_test):
        """Grid-search every configured algorithm and write one submission each.

        For each estimator in ``self.algorithms`` a 3-fold ``GridSearchCV``
        (scored by negative MSE) is fitted on the training data, the refitted
        best estimator predicts ``X_test``, the MSE against ``y_test`` is
        recorded, and the predictions are written to
        ``submission_<EstimatorClassName>.csv``.

        Returns
        -------
        list
            ``[best_model, results]``. ``best_model`` is the best estimator of
            the LAST algorithm searched (``None`` when no algorithm is
            configured); ``results`` holds one MSE per algorithm, in order.

        Raises
        ------
        KeyError
            If an algorithm's class name has no entry in ``self.params_dict``.
        """
        results = []
        # Initialised so that an empty ``self.algorithms`` returns None instead
        # of failing with an unbound local at the return statement.
        best_model = None
        for algorithm in self.algorithms:
            name = algorithm.__class__.__name__
            search = GridSearchCV(
                estimator=algorithm,
                param_grid=self.params_dict[name],
                cv=3,
                scoring='neg_mean_squared_error',
                n_jobs=-1,
                verbose=True,
            ).fit(X_train, y_train)

            best_model = search.best_estimator_
            predictions = best_model.predict(X_test)
            results.append(mean_squared_error(y_test, predictions))

            # NOTE(review): IDs written here are positional (0..n-1), not the
            # ``ID`` values of the test frame -- confirm this matches the
            # expected submission format.
            submission_df = pd.DataFrame({
                'ID': range(len(predictions)),
                'redshift': predictions,
            })
            submission_df.to_csv('submission_{}.csv'.format(name), index=False)
        return [best_model, results]

    def cat_boost(self, X_train, X_test, y_train, y_test):
        """Fit a CatBoost regressor and return its MSE on the test split.

        Requires the optional ``catboost`` package; it is imported lazily so
        the rest of the pipeline can be used without it installed.
        """
        # ``CatBoostRegressor`` is a class inside the ``catboost`` package, not
        # an importable module of its own.
        from catboost import CatBoostRegressor
        # NOTE(review): no eval_set is passed to fit(), so early_stopping_rounds
        # presumably has nothing to monitor -- confirm against the CatBoost docs.
        cat = CatBoostRegressor(logging_level='Silent', random_state=45, early_stopping_rounds=300)
        predictions = cat.fit(X_train, y_train).predict(X_test)
        return mean_squared_error(y_test, predictions)

    def run(self):
        """Transform this instance's data, run the grid search and return its result.

        Returns
        -------
        tuple
            ``(best_model, results)`` as produced by ``grid_search_cv``.
        """
        # Use the frames stored on this instance rather than rebuilding a
        # pipeline from module-level globals.
        X, y, X_test, y_test = self.data_transform()
        logging.info('Transformation finished')

        best_model, results = self.grid_search_cv(X, X_test, y, y_test)
        logging.info('Fitting models finished')
        logging.info('Evaluating Finished')
        return best_model, results

In [9]:
# Read the three input CSVs from the working directory, in order:
# training rows, rows to predict on, and the reference/sample frame.
train, test, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'testX.csv', 'sample.csv')
)

In [None]:
# Build the pipeline from the frames loaded above and run the grid search.
# run() returns (best_model, results), which the notebook shows as the cell output.
BasePipeLine(train, test, sample).run()

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
