## Importing Libraries and Regression Class

In [1]:
# import libraries for class
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1.5)
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import time

# sklearn models
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, ElasticNet, TweedieRegressor
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor 
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

In [102]:
class full_regression:
    """A class which automatically runs many regression models and gridsearches for you.

    Note: when you run a new model it will overwrite the previous model. You can access
    the current model with .model and .model_des.

    Options:
    X = DataFrame of predictors (column names are used for tree plots and the model tracker)
    y = target values
    run_all = default True, if set to False the class will not automatically run any models
    standardize = default True, uses standard scaler and fit-transforms on train, transform on test if exists
    test_size = default 0.15, decide the size of your test set - set to 0 if you don't want a test set
    manual_test = default None, optional DataFrame of extra predictors; stored as .X_test_manual
                  (transformed with the training scaler when standardize is on)
    folds = default 6, amount of folds for cross validation - integer > 1
    shuffle = default True, shuffle the data for test split and cross val
    stratify = default None, input the variable that you wish to stratify the data by; it is passed
               to train_test_split and switches cross validation to StratifiedKFold
    print_info = default True, print all of the results out every time you run a model
    save_it = default False, this adds functionality to be able to save down all model results into a
              dataframe, set as a global variable called model_tracker.
    comment = default None, This is a comment field for the model_tracker

    Attributes set after a model run: .model, .model_des, .sc (train score), .cvs (mean CV r^2),
    .rmse (mean CV RMSE), .sct / .sctp (test score / printable test score), .elaspsed (seconds),
    .coef (the model's coef_ or None).

    Go to readme for further information: https://github.com/LukeBetham/machine-learning-classes/blob/master/README.md
    Created by LukeBetham"""

    def __init__(self, X, y, run_all=True, standardize=True,
                 test_size=0.15, manual_test=None, folds=6, shuffle=True, stratify=None,
                 print_info=True, save_it=False, comment=None):
        # Save settings to object
        self.folds = folds
        self.shuffle = shuffle
        self.stratify = stratify
        self.comment = comment
        self.save_it = save_it
        self.print_info = print_info
        # A seed only has meaning when shuffling, and recent scikit-learn raises if a
        # random_state is supplied together with shuffle=False.
        fold_seed = 66 if self.shuffle else None
        if self.stratify is None:
            self.kfold = KFold(self.folds, shuffle=self.shuffle, random_state=fold_seed)
        else:
            self.kfold = StratifiedKFold(self.folds, shuffle=self.shuffle, random_state=fold_seed)
        # Option for bolding print text
        self.BOLD = '\033[1m'
        self.END = '\033[0m'
        # Create train-test split if selected
        self.X = X
        self.y = y
        self.test = test_size
        if self.test != 0:
            (self.X, self.X_test, self.y, self.y_test,
             self.index_train, self.index_test) = train_test_split(
                self.X, self.y, self.X.index, test_size=self.test, shuffle=self.shuffle,
                stratify=self.stratify, random_state=66)
        # Standardise the data if selected. The scaler is fitted on the training rows only and
        # the original row index is kept so that X stays label-aligned with y after a shuffle.
        has_manual_test = isinstance(manual_test, pd.DataFrame)
        if standardize:
            scaler = StandardScaler()
            self.X = pd.DataFrame(
                scaler.fit_transform(self.X), columns=self.X.columns, index=self.X.index)
            if has_manual_test:
                self.X_test_manual = pd.DataFrame(
                    scaler.transform(manual_test), columns=manual_test.columns,
                    index=manual_test.index)
            if self.test != 0:
                self.X_test = pd.DataFrame(
                    scaler.transform(self.X_test), columns=self.X_test.columns,
                    index=self.X_test.index)
        elif has_manual_test:
            # Without scaling the manual test frame is still made available, untouched
            self.X_test_manual = manual_test
        # Run all models
        if run_all:
            self.Linear()
            self.Linear_elastic_net()
            self.Linear_generalised()
            self.SVM_regressor()
            self.KNN_regressor()
            self.Decision_tree_model()
            self.Random_forest_model()
            self.GradientBoosting()
            self.VotingRegressor()
            self.MLP_Neural_Net()

    def _run_model(self, model, description, grid_multiple=1):
        """Register `model` as the current model and fit/score it via model_calc.

        grid_multiple scales the elapsed fit time into the grid-search estimate that
        model_calc prints.
        """
        self.model = model
        self.model_des = description
        self.grid_multiple = grid_multiple
        self.model_calc()

    # Every model method below takes model=None and builds its default estimator inside the
    # call: an estimator instance used as a default argument would be created once, shared
    # by every call and every instance of this class, and mutated by each fit.

    def Linear(self, model=None):
        """Fit and score a linear regression (default: LinearRegression())."""
        self._run_model(LinearRegression() if model is None else model,
                        "Linear Regression Model")

    def Linear_elastic_net(self, model=None):
        """Fit and score an elastic-net linear regression (default: ElasticNet())."""
        self._run_model(ElasticNet() if model is None else model,
                        "Linear Regression Model (with Elastic Net)", 7.5)

    def Linear_generalised(self, model=None):
        """Fit and score a generalised linear model (default: TweedieRegressor()).

        power=0: Normal, power = 1: Poisson, power = 2: Gamma, power = 3: Inverse Gaussian distribution.
        """
        self._run_model(TweedieRegressor() if model is None else model,
                        "Generalised Linear Regression Model")

    def SVM_regressor(self, model=None):
        """Fit and score a support vector regressor (default: SVR())."""
        self._run_model(SVR() if model is None else model, "SVM Regression Model")

    def KNN_regressor(self, model=None):
        """Fit and score a k-nearest-neighbours regressor (default: KNeighborsRegressor())."""
        self._run_model(KNeighborsRegressor() if model is None else model,
                        "KNN Regression Model")

    def knn_all_k(self, limit=50):
        """Cross-validate KNN for k = 1 .. max_k-1, plot the mean scores and store the best k.

        max_k is the smaller of `limit` and the number of training rows left in a CV fold
        minus one. Results are stored in .scores, .max_k and .knn_best.

        Raises:
            ValueError: if there is no k to try (limit too small or too few rows).
        """
        self.scores = []
        self.max_k = np.minimum(limit, int(len(self.y)*(1-(1/self.folds))-1))
        if self.max_k <= 1:
            raise ValueError("knn_all_k has no k to try: increase limit or provide more rows")
        for k in range(1, self.max_k):
            knn = KNeighborsRegressor(n_neighbors=k)
            self.scores.append(np.mean(cross_val_score(knn, self.X, self.y, cv=self.kfold)))
        # +1 because the list position 0 corresponds to k=1
        self.knn_best = self.scores.index(np.max(self.scores))+1
        plt.plot(range(1, self.max_k), self.scores, label='Mean CV Scores')
        plt.xlabel('k')
        plt.ylabel('R2')
        plt.legend(loc=[1.1, 0])
        # NOTE: the value printed after this label is the best k, not the score itself
        print(self.BOLD + "Highest KNN Score:" + self.END, self.knn_best)
        plt.show()

    def Decision_tree_model(self, print_tree=False, print_depth=5, model=None):
        """Fit and score a decision tree (default: DecisionTreeRegressor(random_state=66)).

        print_tree = default False, render the fitted tree as an image in the notebook
        print_depth = default 5, maximum depth drawn when print_tree is on
        """
        self._run_model(DecisionTreeRegressor(random_state=66) if model is None else model,
                        "Decision Tree Model")
        if print_tree:
            # Imported here so the class does not depend on the notebook-injected builtin
            from IPython.display import display
            dot_data = StringIO()
            export_graphviz(self.model, out_file=dot_data, filled=True, rounded=True,
                            special_characters=True, feature_names=list(self.X.columns),
                            max_depth=print_depth)
            graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
            display(Image(graph.create_png()))

    def Random_forest_model(self, model=None):
        """Fit and score a random forest (default: RandomForestRegressor(random_state=66))."""
        self._run_model(RandomForestRegressor(random_state=66) if model is None else model,
                        "Random Forest Model")

    def GradientBoosting(self, grad_model=None):
        """Fit and score gradient boosting (default: GradientBoostingRegressor(random_state=66))."""
        self._run_model(
            GradientBoostingRegressor(random_state=66) if grad_model is None else grad_model,
            "Gradient Boosting Model", 32.4)

    def VotingRegressor(self, model=None, vote_model1=None, vote_model2=None, vote_model3=None):
        """Fit and score a voting ensemble.

        model = default None (scikit-learn's VotingRegressor). A class is instantiated with the
                three vote models as estimators; an already-built estimator instance is used as is.
        vote_model1 = default KNeighborsRegressor(2)
        vote_model2 = default RandomForestRegressor(random_state=66)
        vote_model3 = default GradientBoostingRegressor(random_state=66)
        """
        reg1 = GradientBoostingRegressor(random_state=66) if vote_model3 is None else vote_model3
        reg2 = RandomForestRegressor(random_state=66) if vote_model2 is None else vote_model2
        reg3 = KNeighborsRegressor(2) if vote_model1 is None else vote_model1
        # Inside a method the bare name resolves to the module-level scikit-learn class,
        # not to this method.
        ensemble_source = VotingRegressor if model is None else model
        if isinstance(ensemble_source, type):
            # The 'lr' key and the "Linear" wording below are historic labels (the third
            # default is a KNN); they are kept so estimator names and tracker rows stay stable.
            ensemble = ensemble_source(estimators=[('gb', reg1), ('rf', reg2), ('lr', reg3)])
        else:
            ensemble = ensemble_source
        self._run_model(ensemble, "Voting Regressor Model (default Linear & RF & GB)")

    def MLP_Neural_Net(self, model=None):
        """Fit and score a multi-layer perceptron (default: three hidden layers of 10 units)."""
        if model is None:
            model = MLPRegressor(solver='adam', alpha=10**(0),
                                 hidden_layer_sizes=(10, 10, 10), activation='relu',
                                 random_state=66, batch_size=50, max_iter=500)
        self._run_model(model, 'MLP Regressor Neural Net')

    def model_calc(self):
        """Fit the current model, compute train / cross-validated / test scores and report them.

        Sets .sc, .cvs, .rmse, .sct, .sctp, .elaspsed and .coef, prints a summary when
        print_info is on and appends a row to model_tracker when save_it is on.
        """
        # fit model
        t0 = time.time()
        self.model.fit(self.X, self.y)
        self.sc = self.model.score(self.X, self.y)
        self.cvs = cross_val_score(self.model, self.X, self.y, cv=self.kfold).mean()
        # The scorer returns negated RMSE, so flip the sign back
        self.rmse = -cross_val_score(self.model, self.X, self.y, cv=self.kfold,
                                     scoring="neg_root_mean_squared_error").mean()
        # Get test score
        if self.test != 0:
            self.sct = self.model.score(self.X_test, self.y_test)
            self.sctp = str(round(self.sct, 4))
        else:
            self.sct = 'No Test Set'
            self.sctp = 'No Test Set'
        # time the running of the model (attribute name kept as-is for existing callers)
        t1 = time.time()
        self.elaspsed = t1-t0
        # show the results from the regression model
        if self.print_info:
            print(self.BOLD + self.model_des, 'Test\nTrain Score:' + self.END, round(self.sc, 4),
                  self.BOLD + '\nCV Fold Score (r^2):' +
                  self.END, round(self.cvs, 4),
                  self.BOLD + '\nCV Fold Score (RMSE):' +
                  self.END, round(self.rmse, 4),
                  self.BOLD + "\nModel Test Score:" + self.END, self.sctp)
            print("Time Elapsed = ", round(self.elaspsed, 2), 'secs - grid will take ~',
                  round(self.elaspsed*self.grid_multiple, 2), 'minutes to run.\n')
        # Models without coefficients get None instead of keeping the previous model's values
        self.coef = getattr(self.model, 'coef_', None)
        if self.save_it:
            self.tracking()

    def tracking(self):
        """Append the current model's results as one row of the global model_tracker DataFrame.

        model_tracker is created if it does not exist yet (or is not something pandas can
        concatenate with). Rows are re-indexed so index labels stay unique.
        """
        global model_tracker
        df_temp = pd.DataFrame({'model_type': self.model_des, 'model_train_score': self.sc,
                                'cv_score_r2': self.cvs, 'cv_score_rmse': self.rmse,
                                'test_score': self.sct,
                                'predictors': str(','.join(self.X.columns)),
                                'model_params': str(self.model), 'observations': len(self.X),
                                'time': self.elaspsed, 'comment': self.comment}, index=[0])
        try:
            model_tracker = pd.concat([model_tracker, df_temp], ignore_index=True)
        except (NameError, TypeError):
            # No usable tracker yet: the new row becomes the tracker
            model_tracker = df_temp

    def _default_params(self):
        """Return the default search grid for the current model type.

        Raises:
            ValueError: if no default grid is defined for the current model_des.
        """
        # Allow a second default search on a model that has already been grid searched
        base_des = self.model_des.replace(" Grid Search:", "")
        if base_des == "Linear Regression Model (with Elastic Net)":
            # warm_start must be real booleans: the strings 'True'/'False' are both truthy
            return {'alpha': np.logspace(-5, 5, 10), 'l1_ratio': np.linspace(0.0001, 1, 10),
                    'random_state': [66], 'warm_start': [True, False], 'max_iter': [10000]}
        if base_des == "Generalised Linear Regression Model":
            # power must be a number; 1.5 covers the range between Poisson (1) and Gamma (2)
            return {'power': [0, 1, 1.5, 2, 3], 'alpha': [0, 1, 5],
                    'link': ['auto', 'identity', 'log'], 'max_iter': [10000]}
        if base_des == "KNN Regression Model":
            # 'wminkowski', 'seuclidean' and 'mahalanobis' are left out: each needs extra
            # metric_params (w / V / VI), so every fit with them would fail
            return {'n_neighbors': range(1, 10, 1), 'weights': ['uniform', 'distance'],
                    'p': [1, 2], 'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']}
        if base_des == "SVM Regression Model":
            return {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'degree': [2, 3],
                    'gamma': ['scale', 'auto'], 'C': np.logspace(-5, 5, 10),
                    'shrinking': [True, False]}
        if base_des == "Decision Tree Model":
            return {'criterion': ['mse', 'friedman_mse', 'mae'], 'max_depth': [None, 5, 6, 7, 8],
                    'max_features': ['auto', 'sqrt', 'log2'], 'splitter': ['best', 'random'],
                    'min_samples_split': [2, 3, 4, 5], 'ccp_alpha': [0.0, .01, .1, 1, 10, 100]}
        if base_des == "Random Forest Model":
            return {'n_estimators': [100, 200, 500], 'criterion': ['mse', 'mae'],
                    'max_depth': [None, 8], 'min_samples_split': [2, 5],
                    "max_features": ["auto", "log2", "sqrt"], 'oob_score': [True, False],
                    'warm_start': [True, False], 'ccp_alpha': [0.0, 0.5, 1]}
        if base_des == "Gradient Boosting Model":
            # Regression losses ('deviance'/'exponential' belong to the classifier). The names
            # match the scikit-learn generation that also accepts criterion='mse' above.
            return {"learning_rate": [0.01, 0.5, 1], 'loss': ['ls', 'lad', 'huber'],
                    'max_features': ['auto', 'log2', 'sqrt'], 'warm_start': [True, False],
                    "n_estimators": [100, 200], 'ccp_alpha': [0.0, 0.5, 0.9],
                    'max_depth': [1, 3, 5], 'subsample': [1.0, 0.75, 0.5]}
        if base_des == 'MLP Regressor Neural Net':
            return {'solver': ['adam', 'sgd'], 'alpha': np.linspace(0.00001, 1, 4),
                    'hidden_layer_sizes': [(10, 10, 10, 10), (20, 20, 20), (50, 50), (100,)],
                    'learning_rate': ['constant', 'invscaling', 'adaptive'],
                    'activation': ['identity', 'logistic', 'tanh', 'relu']}
        raise ValueError("No default grid for '" + str(self.model_des) +
                         "'; pass your own dict with params={...}")

    def gridsearch(self, params='default'):
        """Run a gridsearch on the current model and keep the best estimator as .model.

        Has default parameters for each model type, but you can set your own by passing a
        dict into params = {}. Candidates are ranked by negated RMSE over the class's CV folds.
        Sets .params, .grid, .gsc (best CV score, i.e. negated RMSE), .best (best parameters),
        .model, .coef and, when a test set exists, .sct / .sctp.

        Raises:
            ValueError: if params is 'default' and the current model type has no default grid.
        """
        # setting the default parameters if not set by user
        if isinstance(params, str) and params == 'default':
            self.params = self._default_params()
        else:
            self.params = params
        # setup the gridsearch
        self.grid = GridSearchCV(self.model, self.params, verbose=2, n_jobs=2,
                                 scoring="neg_root_mean_squared_error", cv=self.kfold)
        self.grid.fit(self.X, self.y)
        self.gsc = self.grid.best_score_
        self.best = self.grid.best_params_
        self.model = self.grid.best_estimator_
        # Only tag the description once, however many searches are run
        if not self.model_des.endswith(" Grid Search:"):
            self.model_des = self.model_des + " Grid Search:"
        self.coef = getattr(self.grid.best_estimator_, 'coef_', None)
        # Check test score for grid
        if self.test != 0:
            self.sct = self.grid.best_estimator_.score(self.X_test, self.y_test)
            self.sctp = str(round(self.sct, 4))
        else:
            self.sctp = None
        # Print Grid results
        if self.print_info:
            print(self.BOLD + self.model_des + self.END)
            print(self.BOLD + "Best Mean CV Model Score:" + self.END, round(self.gsc, 4),
                  self.BOLD + "\nModel Test Score:" + self.END, self.sctp)
            print(self.BOLD + 'Grid Best Parameters:\n' + self.END, self.best)
            print(self.BOLD + '\nSearch Parameters:\n' + self.END, self.params)
        

## Importing Cleaned Data and setting X

In [72]:
# Load the cleaned train/test frames and the running log of earlier model results
df = pd.read_csv("train_amended.csv")
df_test = pd.read_csv("test_amended.csv")
model_tracker = pd.read_csv('model_tracker.csv')

# Galaxy is already dummified in the cleaning sheet (it improves the model somewhat),
# so once the target is popped off every remaining training column is a predictor
y = df.pop("y")
X = df

# The test frame carries a 'y' column as well; remove it so only predictors remain
df_test.drop(columns='y', inplace=True)
X_test = df_test

In [73]:
# Show the logged runs ordered by cross-validated RMSE (ascending, so the best run comes first)
model_tracker.sort_values("cv_score_rmse")

Unnamed: 0,model_type,model_train_score,cv_score_r2,cv_score_rmse,test_score,predictors,model_params,observations,time,comment
64,KNN Regression Model,0.994460,9.744198e-01,1.013137e-02,No Test Set,"galactic year,existence expectancy index,exist...",KNeighborsRegressor(n_neighbors=2),3865,2.820831,6th run - 5th but with dummify same for train+...
53,KNN Regression Model,0.994460,9.744198e-01,1.013137e-02,No Test Set,"galactic year,existence expectancy index,exist...",KNeighborsRegressor(n_neighbors=2),3865,2.279424,5th run with interpolation + test and with yea...
42,KNN Regression Model,0.992616,9.715510e-01,1.067581e-02,No Test Set,"galactic year,existence expectancy index,exist...",KNeighborsRegressor(n_neighbors=2),3865,3.171636,4th run with interpolation +test and with year...
30,KNN Regression Model,0.991274,9.682712e-01,1.126174e-02,No Test Set,"existence expectancy index,existence expectanc...",KNeighborsRegressor(n_neighbors=2),3865,6.742438,"3rd run with interpolation, no galaxy or year"
62,Voting Regressor Model (default Linear & RF & GB),0.993451,9.619262e-01,1.236642e-02,No Test Set,"galactic year,existence expectancy index,exist...","VotingRegressor(estimators=[('gb', GradientBoo...",3865,204.848726,6th run - 5th but with dummify same for train+...
...,...,...,...,...,...,...,...,...,...,...
57,SVM Regression Model,-0.073435,-1.798037e-01,6.874194e-02,No Test Set,"galactic year,existence expectancy index,exist...",SVR(),3865,1.120048,6th run - 5th but with dummify same for train+...
3,SVM Regression Model,0.203636,8.630304e-02,8.135218e-02,No Test Set,"existence expectancy index,existence expectanc...",SVR(),334,0.052393,"First run with no nulls, no galaxy or year"
1,Linear Regression Model (with Elastic Net),0.000000,-3.083393e-03,8.611446e-02,No Test Set,"existence expectancy index,existence expectanc...",ElasticNet(),334,0.043729,"First run with no nulls, no galaxy or year"
43,Linear Regression Model,0.953880,-1.936690e+21,1.617910e+09,No Test Set,"galactic year,existence expectancy index,exist...",LinearRegression(),3865,0.417754,5th run with interpolation + test and with yea...


In [74]:
# Inspect the predictor frame that will be passed to the regression class
X

Unnamed: 0,galactic year,existence expectancy index,existence expectancy at birth,Gross income per capita,Income Index,Expected years of education (galactic years),Mean years of education (galactic years),Intergalactic Development Index (IDI),Education Index,"Intergalactic Development Index (IDI), Rank",...,galaxy_UGCA 292,galaxy_UGCA 438 (ESO 407-018),galaxy_UGCA 86,galaxy_UGCA 92,galaxy_Ursa Major I Dwarf (UMa I dSph),galaxy_Ursa Major II Dwarf,galaxy_Ursa Minor Dwarf,galaxy_Virgo I,galaxy_Willman 1,"galaxy_Wolf-Lundmark-Melotte (WLM, DDO 221)"
0,990025,0.628657,63.125200,27109.234310,0.646039,8.240543,5.248527,0.576582,0.526375,185.598200,...,0,0,0,0,0,0,0,0,0,0
1,991020,0.627245,62.389059,17114.109182,0.503198,9.317262,5.248527,0.576582,0.526375,185.598200,...,0,0,0,0,0,0,0,0,0,0
2,992016,0.662135,68.794245,13225.033915,0.692653,8.030645,5.248527,0.576582,0.526375,185.598200,...,0,0,0,0,0,0,0,0,0,0
3,993012,0.687776,62.848660,9116.133706,0.667264,11.347429,5.248527,0.576582,0.526375,185.598200,...,0,0,0,0,0,0,0,0,0,0
4,994009,0.727717,67.482431,3598.172394,0.609830,10.402265,5.248527,0.576582,0.526375,185.598200,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3860,1004004,1.091676,86.984952,66671.711229,1.069811,19.201916,13.947544,1.113395,1.057824,65.251487,...,0,0,0,0,0,0,0,0,0,0
3861,1005006,1.080452,92.192055,52238.818082,0.981938,19.461596,15.568753,0.956783,1.012429,39.797926,...,0,0,0,0,0,0,0,0,0,0
3862,1006009,1.143596,89.762503,46090.953928,1.103940,21.213489,16.022024,1.128388,0.945926,55.575405,...,0,0,0,0,0,0,0,0,0,0
3863,1012036,1.083807,87.420338,61057.801088,1.110189,16.698035,15.320563,1.019514,1.223794,96.327666,...,0,0,0,0,0,0,0,0,0,0


## Running Models

In [107]:
# Nothing is run automatically and no internal hold-out split is made (test_size=0);
# the unlabelled test frame goes in as manual_test so it is scaled with the training
# scaler, and save_it=True logs every model run under the given comment.
reg_model = full_regression(
    X,
    y,
    run_all=False,
    standardize=True,
    manual_test=X_test,
    test_size=0,
    folds=3,
    save_it=True,
    comment="best gridsearch models",
)


In [108]:
# Elastic net with the best settings found by the earlier grid search
reg_model.Linear_elastic_net(
    model=ElasticNet(
        alpha=0.0001291549665014884,
        l1_ratio=0.33340000000000003,
        max_iter=10000,
        random_state=66,
        warm_start=True,
    )
)


[1mLinear Regression Model (with Elastic Net) Test
Train Score:[0m 0.953 [1m
CV Fold Score (r^2):[0m 0.9453 [1m
CV Fold Score (RMSE):[0m 0.0148 [1m
Model Test Score:[0m No Test Set
Time Elapsed =  8.15 secs - grid will take ~ 61.12 minutes to run.



In [109]:
# Generalised linear model with the best settings found by the earlier grid search
reg_model.Linear_generalised(
    model=TweedieRegressor(
        alpha=0,
        link="auto",
        max_iter=10000,
        power=1,
    )
)

[1mGeneralised Linear Regression Model Test
Train Score:[0m 0.978 [1m
CV Fold Score (r^2):[0m 0.971 [1m
CV Fold Score (RMSE):[0m 0.0132 [1m
Model Test Score:[0m No Test Set
Time Elapsed =  0.37 secs - grid will take ~ 0.37 minutes to run.



In [110]:
# KNN with the best settings found by the earlier grid search
reg_model.KNN_regressor(
    model=KNeighborsRegressor(n_neighbors=2, weights='distance', p=1, metric='euclidean')
)

[1mKNN Regression Model Test
Train Score:[0m 1.0 [1m
CV Fold Score (r^2):[0m 0.9756 [1m
CV Fold Score (RMSE):[0m 0.0099 [1m
Model Test Score:[0m No Test Set
Time Elapsed =  2.36 secs - grid will take ~ 2.36 minutes to run.



In [111]:
# Support vector regressor with the best settings found by the earlier grid search
reg_model.SVM_regressor(
    model=SVR(
        C=0.001668,
        degree=2,
        gamma='scale',
        kernel='linear',
        shrinking=True,
    )
)

[1mSVM Regression Model Test
Train Score:[0m 0.5446 [1m
CV Fold Score (r^2):[0m 0.5261 [1m
CV Fold Score (RMSE):[0m 0.0436 [1m
Model Test Score:[0m No Test Set
Time Elapsed =  0.68 secs - grid will take ~ 0.68 minutes to run.



In [112]:
# Decision tree with tuned settings; the seed keeps the random splitter repeatable
reg_model.Decision_tree_model(
    model=DecisionTreeRegressor(
        ccp_alpha=0,
        criterion='friedman_mse',
        max_depth=None,
        max_features='auto',
        min_samples_split=4,
        splitter='random',
        random_state=66,
    )
)
                              

[1mDecision Tree Model Test
Train Score:[0m 0.9964 [1m
CV Fold Score (r^2):[0m 0.9218 [1m
CV Fold Score (RMSE):[0m 0.0177 [1m
Model Test Score:[0m No Test Set
Time Elapsed =  0.46 secs - grid will take ~ 0.46 minutes to run.



In [113]:
# Random forest with tuned settings (500 trees, seeded)
reg_model.Random_forest_model(
    model=RandomForestRegressor(
        ccp_alpha=0,
        criterion='mse',
        max_depth=None,
        max_features='auto',
        min_samples_split=2,
        n_estimators=500,
        oob_score=True,
        warm_start=True,
        random_state=66,
    )
)


[1mRandom Forest Model Test
Train Score:[0m 0.9931 [1m
CV Fold Score (r^2):[0m 0.9417 [1m
CV Fold Score (RMSE):[0m 0.0153 [1m
Model Test Score:[0m No Test Set
Time Elapsed =  470.83 secs - grid will take ~ 470.83 minutes to run.



In [116]:
# MLP with tuned settings. random_state is fixed (66, the seed used for every other
# seeded model in this notebook) because weight initialisation and batch shuffling are
# random; without it the scores change on every run. The output recorded below was
# produced before the seed was added, so a re-run may differ slightly from it.
reg_model.MLP_Neural_Net(model = MLPRegressor(activation= 'relu', alpha= 0.33334,
                                        hidden_layer_sizes= (50, 50),
                                        learning_rate= 'invscaling',
                                        solver= 'adam',
                                        random_state= 66))

[1mMLP Regressor Neural Net Test
Train Score:[0m 0.9635 [1m
CV Fold Score (r^2):[0m 0.9608 [1m
CV Fold Score (RMSE):[0m 0.0122 [1m
Model Test Score:[0m No Test Set
Time Elapsed =  24.31 secs - grid will take ~ 24.31 minutes to run.



In [117]:
# Gradient boosting with tuned settings (huber loss, 200 trees, seeded)
reg_model.GradientBoosting(
    grad_model=GradientBoostingRegressor(
        random_state=66,
        ccp_alpha=0.0,
        learning_rate=0.5,
        loss='huber',
        max_depth=3,
        max_features='auto',
        n_estimators=200,
        subsample=1.0,
        warm_start=True,
    )
)

[1mGradient Boosting Model Test
Train Score:[0m 0.9957 [1m
CV Fold Score (r^2):[0m 0.9461 [1m
CV Fold Score (RMSE):[0m 0.0147 [1m
Model Test Score:[0m No Test Set
Time Elapsed =  82.22 secs - grid will take ~ 2663.9 minutes to run.



In [118]:
# Re-rank all logged runs, including the ones just added, by cross-validated RMSE (ascending)
model_tracker.sort_values('cv_score_rmse')

Unnamed: 0,model_type,model_train_score,cv_score_r2,cv_score_rmse,test_score,predictors,model_params,observations,time,comment
1,KNN Regression Model,1.000000,9.756256e-01,9.878979e-03,No Test Set,"galactic year,existence expectancy index,exist...","KNeighborsRegressor(metric='euclidean', n_neig...",3865,2.357463,best gridsearch models
64,KNN Regression Model,0.994460,9.744198e-01,1.013137e-02,No Test Set,"galactic year,existence expectancy index,exist...",KNeighborsRegressor(n_neighbors=2),3865,2.820831,6th run - 5th but with dummify same for train+...
53,KNN Regression Model,0.994460,9.744198e-01,1.013137e-02,No Test Set,"galactic year,existence expectancy index,exist...",KNeighborsRegressor(n_neighbors=2),3865,2.279424,5th run with interpolation + test and with yea...
42,KNN Regression Model,0.992616,9.715510e-01,1.067581e-02,No Test Set,"galactic year,existence expectancy index,exist...",KNeighborsRegressor(n_neighbors=2),3865,3.171636,4th run with interpolation +test and with year...
30,KNN Regression Model,0.991274,9.682712e-01,1.126174e-02,No Test Set,"existence expectancy index,existence expectanc...",KNeighborsRegressor(n_neighbors=2),3865,6.742438,"3rd run with interpolation, no galaxy or year"
...,...,...,...,...,...,...,...,...,...,...
57,SVM Regression Model,-0.073435,-1.798037e-01,6.874194e-02,No Test Set,"galactic year,existence expectancy index,exist...",SVR(),3865,1.120048,6th run - 5th but with dummify same for train+...
3,SVM Regression Model,0.203636,8.630304e-02,8.135218e-02,No Test Set,"existence expectancy index,existence expectanc...",SVR(),334,0.052393,"First run with no nulls, no galaxy or year"
1,Linear Regression Model (with Elastic Net),0.000000,-3.083393e-03,8.611446e-02,No Test Set,"existence expectancy index,existence expectanc...",ElasticNet(),334,0.043729,"First run with no nulls, no galaxy or year"
54,Linear Regression Model,0.953880,-1.936690e+21,1.617910e+09,No Test Set,"galactic year,existence expectancy index,exist...",LinearRegression(),3865,1.576757,6th run - 5th but with dummify same for train+...


In [119]:
# Write the tracker back to the CSV that the data-loading cell reads; the index is not saved
model_tracker.to_csv("model_tracker.csv", index=False)

## Predicting on Test Set with best model

In [111]:
# Description of the model currently held by reg_model (each model run overwrites it)
reg_model.model_des

'KNN Regression Model'

In [112]:
# Inspect the target series
y

0       0.052590
1       0.052115
2       0.052006
3       0.051675
4       0.051334
          ...   
3860    0.165448
3861    0.166074
3862    0.166249
3863    0.176675
3864    0.182978
Name: y, Length: 3865, dtype: float64

In [113]:
# Score of the current model on the (scaled) training data held by reg_model
reg_model.model.score(reg_model.X,reg_model.y)

0.9944602825475282

In [114]:
# Rows and columns of the training predictors
X.shape

(3865, 259)

In [115]:
# Rows and columns of the test predictors; the column count should match X.shape above
X_test.shape

(890, 259)

In [116]:
# reg_model.model is simply whichever model was run last, so on a top-to-bottom run it
# is the Gradient Boosting model rather than the best one. Fit the model with the lowest
# cross-validated RMSE in the tracker (the tuned KNN) explicitly, on the same scaled
# training data, and predict with that. A separate estimator is used so that neither
# reg_model nor the model tracker is modified by this cell.
best_model = KNeighborsRegressor(n_neighbors=2, weights='distance', p=1, metric='euclidean')
best_model.fit(reg_model.X, reg_model.y)
test_preds = best_model.predict(reg_model.X_test_manual)

In [118]:
# Wrap the predictions in a single-column frame named 'pred'
preds_df = pd.DataFrame(test_preds, columns=['pred'])

In [121]:
# Save the predictions to CSV without the index column
preds_df.to_csv("y_preds.csv",index=False)