In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics  import r2_score
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
from xgboost import XGBRegressor

  from pandas import MultiIndex, Int64Index


In [2]:
class Model_Finder:
    """Tune candidate estimators with grid search and pick the better one.

    Every ``get_best_params_for_*`` method runs a 5-fold ``GridSearchCV``
    over a fixed parameter grid, stores the search object on ``self.grid``,
    copies the winning parameters onto instance attributes, and returns the
    estimator that the search refitted on the full training data.
    """

    def __init__(self):
        # Untuned base estimators that are handed to GridSearchCV.
        self.clf = RandomForestClassifier()
        self.DecisionTreeReg = DecisionTreeRegressor()

    def _grid_search(self, estimator, param_grid, train_x, train_y, **grid_kwargs):
        """Run a 5-fold grid search and return the refitted best estimator.

        The search object is kept on ``self.grid`` so callers can inspect
        ``best_params_`` afterwards. GridSearchCV refits the best parameter
        combination on the whole training set by default, so no second
        manual ``fit`` is needed.
        """
        self.grid = GridSearchCV(estimator=estimator, param_grid=param_grid,
                                 cv=5, **grid_kwargs)
        self.grid.fit(train_x, train_y)
        return self.grid.best_estimator_

    def get_best_params_for_random_forest(self, train_x, train_y):
        """Tune the random forest classifier and return the fitted model.

        NOTE(review): this estimator is a classifier, so the criterion grid
        uses classifier criteria. The previous grid listed regressor-only
        criteria ('squared_error', 'absolute_error', 'poisson'), which
        RandomForestClassifier rejects. If the target is continuous, a
        RandomForestRegressor is needed instead - confirm the intent.

        Parameters:
            train_x: training features.
            train_y: training labels.

        Returns:
            The RandomForestClassifier fitted with the best parameters.
        """
        # Parameter combinations explored by the search.
        self.param_grid = {
            "n_estimators": [50, 100, 130],
            "criterion": ['gini', 'entropy'],
            "max_depth": range(2, 4, 1),
            "max_features": ['sqrt', 'log2'],
        }

        self.clf = self._grid_search(self.clf, self.param_grid, train_x, train_y)

        # Expose the winning parameters as attributes.
        best = self.grid.best_params_
        self.criterion = best['criterion']
        self.max_depth = best['max_depth']
        self.max_features = best['max_features']
        self.n_estimators = best['n_estimators']

        return self.clf

    def get_best_params_for_DecisionTreeRegressor(self, train_x, train_y):
        """Tune the decision tree regressor and return the fitted model.

        Parameters:
            train_x: training features.
            train_y: training targets.

        Returns:
            The DecisionTreeRegressor fitted with the best parameters.
        """
        # "mse"/"mae" were renamed to "squared_error"/"absolute_error" in
        # scikit-learn; the new names match the ones used elsewhere in this
        # class.
        self.param_grid_decisionTree = {
            "criterion": ["squared_error", "friedman_mse", "absolute_error"],
            "splitter": ["best", "random"],
            "max_features": ["sqrt", "log2"],
            'max_depth': range(2, 16, 2),
            'min_samples_split': range(2, 16, 2),
        }

        self.decisionTreeReg = self._grid_search(
            self.DecisionTreeReg, self.param_grid_decisionTree,
            train_x, train_y, verbose=3)

        # Expose the winning parameters as attributes.
        best = self.grid.best_params_
        self.criterion = best['criterion']
        self.splitter = best['splitter']
        self.max_features = best['max_features']
        self.max_depth = best['max_depth']
        self.min_samples_split = best['min_samples_split']

        return self.decisionTreeReg

    def get_best_params_for_xgboost(self, train_x, train_y):
        """Tune the XGBoost regressor and return the fitted model.

        Parameters:
            train_x: training features.
            train_y: training targets.

        Returns:
            The XGBRegressor fitted with the best parameters, including the
            tuned ``subsample`` value (previously tuned but then discarded
            when the final model was rebuilt).
        """
        # Parameter combinations explored by the search.
        self.param_grid_xgboost = {
            'n_estimators': [50, 200],
            'subsample': [0.7, 0.8],
            'max_depth': [5, 7],
            'learning_rate': [0.5, 0.01],
        }

        # 'reg:squarederror' is the current name of the deprecated
        # 'reg:linear' objective.
        self.xgb = self._grid_search(
            XGBRegressor(objective='reg:squarederror'),
            self.param_grid_xgboost, train_x, train_y, verbose=3)

        # Expose the winning parameters as attributes.
        best = self.grid.best_params_
        self.learning_rate = best['learning_rate']
        self.max_depth = best['max_depth']
        self.n_estimators = best['n_estimators']
        self.subsample = best['subsample']

        return self.xgb

    def get_best_model(self, train_x, train_y, test_x, test_y):
        """Return the name and object of the model with the higher R2 score.

        Both the decision tree and the XGBoost regressor are tuned on the
        training data and scored on the test data with ``r2_score``.

        Parameters:
            train_x, train_y: training features and targets.
            test_x, test_y: held-out features and targets used for scoring.

        Returns:
            A ``(name, model)`` tuple; ``name`` is 'XGBoost' or
            'DecisionTreeReg'. The decision tree wins ties.
        """
        # Decision tree: tune, predict, score.
        self.decisionTreeReg = self.get_best_params_for_DecisionTreeRegressor(train_x, train_y)
        self.prediction_decisionTreeReg = self.decisionTreeReg.predict(test_x)
        # Despite the "_error" suffix (kept for compatibility) this is an
        # R2 score, so higher is better.
        self.decisionTreeReg_error = r2_score(test_y, self.prediction_decisionTreeReg)

        # XGBoost: tune, predict, score.
        self.xgboost = self.get_best_params_for_xgboost(train_x, train_y)
        self.prediction_xgboost = self.xgboost.predict(test_x)
        self.prediction_xgboost_error = r2_score(test_y, self.prediction_xgboost)

        # Keep the model with the larger R2.
        if self.decisionTreeReg_error < self.prediction_xgboost_error:
            return 'XGBoost', self.xgboost
        return 'DecisionTreeReg', self.decisionTreeReg

        
            
            

In [3]:
# Directory that holds the pickled models; referenced by modelOperation.
model_dir = "D:/cdac/CDAC_PROJECT/Untitled Folder/model"

## save model

In [4]:
import pickle 
import os
import shutil

class modelOperation:
    """Persist and restore fitted models as pickle files.

    Every model is stored in its own sub-directory:
    ``<base_dir>/<filename>/<filename>.sav``. When ``base_dir`` is omitted
    the module-level ``model_dir`` is used.
    """

    @staticmethod
    def _model_paths(filename, base_dir=None):
        """Return ``(folder, file_path)`` for the model named *filename*."""
        root = model_dir if base_dir is None else base_dir
        folder = os.path.join(root, filename)
        return folder, os.path.join(folder, filename + '.sav')

    @staticmethod
    def save_model(model, filename, base_dir=None):
        """Pickle *model* to ``<base_dir>/<filename>/<filename>.sav``.

        An existing directory for the same model name is replaced. Only
        that model's own directory is removed: earlier code deleted the
        whole model root and then skipped writing the file.

        Parameters:
            model: any picklable object.
            filename: model name, used for both the folder and the file.
            base_dir: optional root directory; defaults to ``model_dir``.

        Returns:
            The string 'success'.
        """
        folder, target = modelOperation._model_paths(filename, base_dir)

        # Drop a stale copy of this model, leaving sibling models untouched.
        if os.path.isdir(folder):
            shutil.rmtree(folder)
        os.makedirs(folder)

        with open(target, 'wb') as f:
            pickle.dump(model, f)
        return 'success'

    @staticmethod
    def load_model(filename, base_dir=None):
        """Load and return the model previously saved under *filename*.

        Parameters:
            filename: model name passed to ``save_model``.
            base_dir: optional root directory; defaults to ``model_dir``.

        Raises:
            FileNotFoundError: if no such model file exists.
        """
        _, target = modelOperation._model_paths(filename, base_dir)
        # NOTE: unpickling executes arbitrary code; only load files that
        # this pipeline wrote itself.
        with open(target, 'rb') as f:
            return pickle.load(f)



## clustering

In [5]:
from sklearn.cluster import KMeans
from kneed import KneeLocator
import matplotlib.pyplot as plt
class KMeansClustering:
    """K-Means helpers: choose a cluster count and assign cluster labels."""

    @staticmethod
    def elbow_plot(data):
        """Save an elbow plot and return the knee found by KneeLocator.

        K-Means is fitted for 1 to 10 clusters and the inertia of each fit
        is plotted against the cluster count. The plot is written to
        'K-Means_Elbow.PNG' in the current working directory.

        Parameters:
            data: feature data accepted by ``KMeans.fit``.

        Returns:
            ``KneeLocator.knee`` for the inertia curve.
        """
        cluster_counts = range(1, 11)
        ss = []

        for k in cluster_counts:
            kmeans = KMeans(n_clusters=k, init='k-means++', random_state=7)
            kmeans.fit(data)
            ss.append(kmeans.inertia_)

        # Draw on a dedicated figure and close it afterwards, so repeated
        # calls neither overlay curves on one another nor leak figures.
        fig, ax = plt.subplots()
        try:
            ax.plot(cluster_counts, ss)  # SS against the number of clusters
            ax.set_title('The Elbow Method')
            ax.set_xlabel('Number of clusters')
            ax.set_ylabel('SS')
            fig.savefig('K-Means_Elbow.PNG')
        finally:
            plt.close(fig)

        # Arguments: x values, y values, curve shape, direction.
        kn = KneeLocator(cluster_counts, ss, curve='convex', direction='decreasing')

        return kn.knee

    @staticmethod
    def create_cluster(data, number_of_clusters):
        """Cluster *data*, save the fitted model, and label every row.

        The fitted KMeans object is saved through ``modelOperation`` under
        the name 'KMeans'. A 'Cluster' column is added to *data* in place.

        Parameters:
            data: feature table supporting column assignment
                (``data['Cluster'] = ...``).
            number_of_clusters: number of clusters to fit.

        Returns:
            The same *data* object, now carrying the 'Cluster' column.
        """
        kmeans = KMeans(n_clusters=number_of_clusters, init='k-means++', random_state=7)

        y_kmeans = kmeans.fit_predict(data)  # divide data into clusters
        modelOperation.save_model(kmeans, 'KMeans')

        data['Cluster'] = y_kmeans  # store the cluster assignment per row
        return data
    

In [6]:
from sklearn.model_selection import train_test_split
from preprocessing import preprocessor
import pandas as pd

In [7]:
# Load one training batch CSV into a module-level DataFrame.
data = pd.read_csv('D:/cdac/CDAC_PROJECT/Untitled Folder/Training_Batch_Files/visibility_08012008_120010.csv')

In [8]:
class trainModel:
    """Cluster the training data and fit the best model for each cluster."""

    model_dir = "model/"

    @staticmethod
    def trainingModel(training_data=None):
        """Run preprocessing, clustering and per-cluster model selection.

        Parameters:
            training_data: table to train on. When omitted, the
                module-level ``data`` is used. (Earlier code assigned to a
                local named ``data`` before reading it, which raised
                UnboundLocalError.)

        For every cluster the best model chosen by ``Model_Finder`` is
        saved through ``modelOperation`` under ``<model name><cluster id>``.
        """
        source = data if training_data is None else training_data

        ### preprocessing #####
        # removing unwanted columns as seen in the EDA part
        cleaned = preprocessor.dropUnnecessaryCol(
            source, ['DATE', 'Precip', 'WETBULBTEMPF', 'DewPointTempF', 'StationPressure'])

        # create separate features and labels
        X, Y = preprocessor.separate_label_feature(cleaned, label_column_name='VISIBILITY')

        # using the elbow plot to find the number of optimum clusters
        number_of_clusters = KMeansClustering.elbow_plot(X)

        # Divide the data into clusters (adds the 'Cluster' column)
        X = KMeansClustering.create_cluster(X, number_of_clusters)

        # keep the labels next to the features so both are filtered together
        X['Labels'] = Y

        # parse all the clusters and look for the best ML algorithm to fit
        # on each individual cluster
        for i in X['Cluster'].unique():
            cluster_data = X[X['Cluster'] == i]  # filter the data for one cluster

            # Prepare the feature and label columns
            cluster_features = cluster_data.drop(['Labels', 'Cluster'], axis=1)
            cluster_label = cluster_data['Labels']

            # split the data into training and test set for this cluster
            x_train, x_test, y_train, y_test = train_test_split(
                cluster_features, cluster_label, test_size=0.3, random_state=7)

            # NOTE(review): train and test are scaled by separate calls; if
            # standardScaling fits a new scaler on each call, the test set
            # is scaled with its own statistics - confirm against
            # preprocessor.
            x_train_scaled = preprocessor.standardScaling(x_train)
            x_test_scaled = preprocessor.standardScaling(x_test)

            model_finder = Model_Finder()

            # getting the best model for this cluster
            best_model_name, best_model = model_finder.get_best_model(
                x_train_scaled, y_train, x_test_scaled, y_test)

            # saving the best model to the directory
            modelOperation.save_model(best_model, best_model_name + str(i))

