## Preprocessing

In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

class preprocessor:
    """Stateless DataFrame helpers used before clustering and model training.

    Every helper is a ``@staticmethod`` so it can be called either on the
    class (``preprocessor.remove_columns(df, cols)``) or on an instance.
    None of the helpers modifies the frame it is given.
    """

    @staticmethod
    def remove_columns(data, columns):
        """Return a new frame without the given columns.

        Args:
            data: pandas DataFrame to read from.
            columns: column label or list of labels to drop.

        Raises:
            KeyError: if any label is not a column of ``data``.
        """
        return data.drop(labels=columns, axis=1)

    @staticmethod
    def separate_label_feature(data, label_column_name):
        """Split a frame into features and target.

        Args:
            data: pandas DataFrame holding features and the target column.
            label_column_name: name of the target (output) column.

        Returns:
            ``(X, Y)`` where ``X`` is ``data`` without the target column and
            ``Y`` is the target column itself.
        """
        features = data.drop(labels=label_column_name, axis=1)
        target = data[label_column_name]
        return features, target

    @staticmethod
    def dropUnnecessaryCol(data, columnList):
        """Return a new frame without the columns listed in ``columnList``."""
        return data.drop(columnList, axis=1)

    @staticmethod
    def standardScaling(X):
        """Standardise ``X`` with a scaler fitted on ``X`` itself.

        Returns:
            The array produced by ``StandardScaler().fit_transform(X)``.

        Note:
            A new scaler is fitted on every call, so calling this separately
            on a train and a test split scales them with different statistics.
        """
        scaler = StandardScaler()
        return scaler.fit_transform(X)

## Clustering

In [3]:
from sklearn.cluster import KMeans
from kneed import KneeLocator
import matplotlib.pyplot as plt
class KMeansClustering:
    """K-Means helpers: pick a cluster count and assign rows to clusters.

    Both helpers are ``@staticmethod`` so they work when called on the class
    or on an instance.
    """

    @staticmethod
    def elbow_plot(data):
        """Fit K-Means for k = 1..10, save the elbow plot and locate the knee.

        The within-cluster sum of squares (``inertia_``) of each fit is
        plotted against k and written to ``K-Means_Elbow.PNG`` in the current
        working directory.

        Args:
            data: feature matrix accepted by ``KMeans.fit``.

        Returns:
            ``KneeLocator(...).knee`` for the convex, decreasing curve.
            NOTE(review): kneed may report no knee (``None``); callers that
            pass the value straight to ``KMeans`` should confirm it is set.
        """
        cluster_counts = range(1, 11)
        ss = []
        for k in cluster_counts:
            kmeans = KMeans(n_clusters=k, init='k-means++', random_state=7)
            kmeans.fit(data)
            ss.append(kmeans.inertia_)

        # Draw on a dedicated figure and close it afterwards: plotting on the
        # implicit global figure makes repeated calls overplot earlier curves
        # and keeps every figure alive for the life of the process.
        figure = plt.figure()
        try:
            plt.plot(cluster_counts, ss)
            plt.title('The Elbow Method')
            plt.xlabel('Number of clusters')
            plt.ylabel('SS')
            plt.savefig('K-Means_Elbow.PNG')
        finally:
            plt.close(figure)

        # Arguments: x values, y values, curve shape, direction.
        kn = KneeLocator(cluster_counts, ss, curve='convex', direction='decreasing')
        return kn.knee

    @staticmethod
    def create_cluster(data, number_of_clusters):
        """Cluster ``data``, persist the fitted model and label every row.

        The fitted ``KMeans`` object is saved through
        ``modelOperation.save_model`` under the name ``'KMeans'``.

        Args:
            data: pandas DataFrame of features. It is modified in place: a
                ``'Cluster'`` column holding the assignments is added.
            number_of_clusters: value passed to ``KMeans(n_clusters=...)``.

        Returns:
            The same ``data`` object, now carrying the ``'Cluster'`` column.
        """
        kmeans = KMeans(n_clusters=number_of_clusters, init='k-means++', random_state=7)

        # Fit first, so the cluster column is not part of the training input.
        y_kmeans = kmeans.fit_predict(data)
        modelOperation.save_model(kmeans, 'KMeans')

        data['Cluster'] = y_kmeans
        return data
    

## Save the Model

In [4]:
import pickle 
import os
import shutil

class modelOperation:
    """Save, load and look up pickled models.

    Each model is stored as ``<model_dir>/<filename>/<filename>.sav``. All
    operations are classmethods that read ``model_dir`` from the class, so
    the storage root can be changed by reassigning that attribute.
    """

    # Root folder that holds one sub-folder per saved model.
    model_dir = "D:/cdac/CDAC_PROJECT/Untitled Folder/model"

    @classmethod
    def _model_file(cls, filename):
        """Return the path of the pickle file that belongs to ``filename``."""
        return os.path.join(cls.model_dir, filename, filename + '.sav')

    @classmethod
    def save_model(cls, model, filename):
        """Pickle ``model`` into its own folder, replacing any earlier copy.

        Args:
            model: any picklable object.
            filename: model name; used for both the folder and the file.

        Returns:
            The string ``'success'``.
        """
        path = os.path.join(cls.model_dir, filename)
        if os.path.isdir(path):
            # Only this model's folder is replaced; the other models stay.
            shutil.rmtree(path)
        os.makedirs(path)
        with open(cls._model_file(filename), 'wb') as f:
            pickle.dump(model, f)
        return 'success'

    @classmethod
    def load_model(cls, filename):
        """Load and return the model previously saved under ``filename``.

        Raises:
            FileNotFoundError: if no such model file exists.
        """
        # NOTE: unpickling can execute arbitrary code; only load files that
        # this application wrote itself.
        with open(cls._model_file(filename), 'rb') as f:
            return pickle.load(f)

    @classmethod
    def find_correct_model_file(cls, cluster_number):
        """Return the name of the model saved for ``cluster_number``.

        A model belongs to a cluster when the digits at the end of its name
        equal the cluster number, so cluster 1 matches ``XGBoost1`` but not
        ``XGBoost10`` or ``KMeans``. If several models match, the most
        recently modified one is returned.

        Raises:
            FileNotFoundError: if no saved model matches the cluster.
        """
        wanted = str(cluster_number)
        matches = []
        for entry in os.listdir(cls.model_dir):
            stem = entry.split('.')[0]
            trailing_digits = stem[len(stem.rstrip('0123456789')):]
            if trailing_digits == wanted:
                matches.append((os.path.getmtime(os.path.join(cls.model_dir, entry)), stem))
        if not matches:
            raise FileNotFoundError(
                "No saved model found for cluster %s in %s" % (wanted, cls.model_dir))
        return max(matches)[1]
        

## Validation

In [5]:
import sqlite3
from datetime import datetime
from os import listdir
import os
import re
import json
import shutil
import pandas as pd

In [6]:
class Raw_Data_validation:
    """Validate raw training batch files and sort them into good/bad folders.

    Files are read from the batch directory given to the constructor.
    Accepted files are copied to ``Training_files_validated/Good_data`` and
    rejected ones to ``Training_files_validated/Bad_data``; both folders are
    relative to the current working directory and created on demand.
    """

    # Destination folders for accepted and rejected files.
    GOOD_DIR = "Training_files_validated/Good_data"
    BAD_DIR = "Training_files_validated/Bad_data"

    def __init__(self, path):
        """Remember the batch directory and the schema file to validate against."""
        self.Batch_Directory = path
        self.schema_path = 'schema.json'

    def valuesFromSchema(self):
        """Read the validation settings from the schema JSON file.

        Returns:
            ``(LengthOfDateStampInFile, LengthOfTimeStampInFile, ColName,
            NumberofColumns)`` exactly as stored in the schema.

        Raises:
            KeyError: if one of those four keys is missing from the schema.
        """
        with open(self.schema_path, 'r') as f:
            schema = json.load(f)
        return (schema['LengthOfDateStampInFile'], schema['LengthOfTimeStampInFile'],
                schema['ColName'], schema['NumberofColumns'])

    def FileNameRegex(self):
        """Return the pattern for names such as ``visibility_08012008_120010.csv``.

        The earlier pattern was built from character classes and therefore
        also accepted names with a single underscore, which made the
        date/time length check fail with an IndexError.
        """
        return r"visibility_\d+_\d+\.csv$"

    def validationFileName(self, regex, LengthOfDateStampInFile, LengthOfTimeStampInFile):
        """Copy each batch file to the good or the bad folder based on its name.

        A name is accepted when it matches ``regex`` and its date and time
        stamps (second and third ``_``-separated parts of the name without
        extension) have the lengths required by the schema.

        Args:
            regex: pattern applied with ``re.match`` to every file name.
            LengthOfDateStampInFile: required length of the date stamp.
            LengthOfTimeStampInFile: required length of the time stamp.
        """
        os.makedirs(self.GOOD_DIR, exist_ok=True)
        os.makedirs(self.BAD_DIR, exist_ok=True)

        for filename in os.listdir(self.Batch_Directory):
            # Copy from the directory that was listed, not a hard-coded one.
            source = os.path.join(self.Batch_Directory, filename)
            if not os.path.isfile(source):
                continue  # sub-folders are not batch files

            parts = os.path.splitext(filename)[0].split('_')
            is_valid = (re.match(regex, filename) is not None
                        and len(parts) >= 3
                        and len(parts[1]) == LengthOfDateStampInFile
                        and len(parts[2]) == LengthOfTimeStampInFile)
            shutil.copy(source, self.GOOD_DIR if is_valid else self.BAD_DIR)

    def validateColumnLength(self, NumberofColumns):
        """Move files whose column count differs from the schema to the bad folder.

        Files in the good folder that pandas cannot parse (for example empty
        files) are treated as bad data as well.

        Args:
            NumberofColumns: number of columns every good file must have.
        """
        os.makedirs(self.BAD_DIR, exist_ok=True)

        for file in os.listdir(self.GOOD_DIR):
            source = os.path.join(self.GOOD_DIR, file)
            try:
                column_count = pd.read_csv(source).shape[1]  # shape is (rows, columns)
            except (pd.errors.EmptyDataError, pd.errors.ParserError):
                column_count = None

            if column_count != NumberofColumns:
                # An explicit target file name lets a stale copy be replaced.
                shutil.move(source, os.path.join(self.BAD_DIR, file))
            

## Database Operations

In [7]:
import csv

In [8]:
class dBOperation:
    """SQLite operations for the validated training data.

    Good files are loaded into the ``Good_Raw_Data`` table and later exported
    as a single CSV file.
    """

    def __init__(self):
        self.badFilePath = "Training_files_validated/Bad_data"
        self.goodFilePath = "Training_files_validated/Good_data"

    def dataBaseConnection(self, DatabaseName):
        """Open ``<DatabaseName>.db``, creating the file if it does not exist.

        Returns:
            An open ``sqlite3.Connection``; the caller must close it.
        """
        return sqlite3.connect(DatabaseName + '.db')

    def createTableDb(self, DatabaseName, column_names):
        """Make sure ``Good_Raw_Data`` exists with every column of the schema.

        The table is created when missing; otherwise only the columns that
        are not yet present are added, so the method can be run repeatedly.

        Args:
            DatabaseName: database name without the ``.db`` suffix.
            column_names: mapping of column name to SQL type.
        """
        conn = self.dataBaseConnection(DatabaseName)
        try:
            # Second field of each table_info row is the column name; the
            # result is empty when the table does not exist yet.
            existing = {row[1] for row in conn.execute('PRAGMA table_info(Good_Raw_Data)')}
            definitions = [
                '"{}" {}'.format(name.replace('"', '""'), data_type)
                for name, data_type in column_names.items()
                if name not in existing
            ]
            if not definitions:
                return
            if existing:
                for definition in definitions:
                    conn.execute('ALTER TABLE Good_Raw_Data ADD COLUMN ' + definition)
            else:
                conn.execute('CREATE TABLE Good_Raw_Data (' + ', '.join(definitions) + ')')
            conn.commit()
        finally:
            conn.close()

    def insertIntoTableGoodData(self, Database):
        """Insert the rows of every file in the good folder into ``Good_Raw_Data``.

        The first line of each file is skipped as the header and blank lines
        are ignored. Values are bound as query parameters rather than pasted
        into the SQL text, so file content cannot alter the statement. Each
        file is committed as one transaction.

        Args:
            Database: database name without the ``.db`` suffix.
        """
        conn = self.dataBaseConnection(Database)
        try:
            for file in sorted(os.listdir(self.goodFilePath)):
                with open(os.path.join(self.goodFilePath, file), 'r', newline='') as f:
                    reader = csv.reader(f)
                    next(reader, None)  # header line; tolerate an empty file
                    rows = [row for row in reader if row]
                if not rows:
                    continue
                placeholders = ','.join('?' * len(rows[0]))
                conn.executemany(
                    'INSERT INTO Good_Raw_Data values ({})'.format(placeholders), rows)
                conn.commit()
        finally:
            conn.close()

    def selectingDatafromtableintocsv(self, Database):
        """Export ``Good_Raw_Data`` to ``InputFile.csv`` with a header row.

        The output file name is also stored in ``self.fileName``.

        Args:
            Database: database name without the ``.db`` suffix.
        """
        self.fileName = 'InputFile.csv'
        conn = self.dataBaseConnection(Database)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT *  FROM Good_Raw_Data")
            # The first item of every description tuple is the column name.
            headers = [column[0] for column in cursor.description]
            results = cursor.fetchall()
        finally:
            conn.close()

        with open(self.fileName, 'w', newline='') as out:
            writer = csv.writer(out, delimiter=',', lineterminator='\n')
            writer.writerow(headers)
            writer.writerows(results)
        
        

In [9]:
class train_validation:
    """Run the training-data validation pipeline from raw files to one CSV."""

    def __init__(self, path):
        """Create the file validator for ``path`` and the database helper."""
        self.raw_data = Raw_Data_validation(path)
        self.dBOperation = dBOperation()

    def train_validation(self):
        """Validate the batch files, load the good ones into SQLite and export them.

        Steps, in order: read the schema values, validate file names,
        validate column counts, create/extend the table in the ``Training``
        database, insert the good files, and export the table to CSV.
        """
        validator = self.raw_data
        database = self.dBOperation

        # Schema values that drive both validations and the table layout.
        schema_values = validator.valuesFromSchema()
        date_stamp_length, time_stamp_length, column_names, noofcolumns = schema_values

        # File-name check first, then the column-count check on what passed.
        name_pattern = validator.FileNameRegex()
        validator.validationFileName(name_pattern, date_stamp_length, time_stamp_length)
        validator.validateColumnLength(noofcolumns)

        # Table creation, load and export all use the 'Training' database.
        database.createTableDb('Training', column_names)
        database.insertIntoTableGoodData('Training')
        database.selectingDatafromtableintocsv('Training')
            


## Model Training

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics  import r2_score
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
from xgboost import XGBRegressor

  from pandas import MultiIndex, Int64Index


In [11]:
class Model_Finder:
    """Tune candidate models with grid search and pick the better regressor.

    Every ``get_best_params_for_*`` method runs a 5-fold ``GridSearchCV`` on
    the training data, stores the search object in ``self.grid`` and the
    winning hyper-parameters as attributes, then fits and returns a new
    estimator built from those hyper-parameters.
    """

    def __init__(self):
        self.clf = RandomForestClassifier()
        self.DecisionTreeReg = DecisionTreeRegressor()

    def get_best_params_for_random_forest(self, train_x, train_y):
        """Grid-search a ``RandomForestClassifier`` and return the refitted model.

        Args:
            train_x: training features.
            train_y: training labels.

        Returns:
            A ``RandomForestClassifier`` built from the best parameters and
            fitted on the training data (also stored in ``self.clf``).
        """
        # NOTE(review): 'auto' for max_features is rejected by recent
        # scikit-learn releases - confirm against the installed version.
        self.param_grid = {
            "n_estimators": [10, 50, 100, 130],
            "criterion": ['gini', 'entropy'],
            "max_depth": range(2, 4, 1),
            "max_features": ['auto', 'log2'],
        }

        search = GridSearchCV(estimator=self.clf, param_grid=self.param_grid, cv=5)
        self.grid = search
        search.fit(train_x, train_y)

        # Keep the winning values on the instance.
        tuned = search.best_params_
        self.criterion = tuned['criterion']
        self.max_depth = tuned['max_depth']
        self.max_features = tuned['max_features']
        self.n_estimators = tuned['n_estimators']

        # Build a fresh model from the winning values and train it.
        forest = RandomForestClassifier(
            n_estimators=self.n_estimators,
            criterion=self.criterion,
            max_depth=self.max_depth,
            max_features=self.max_features,
        )
        self.clf = forest
        forest.fit(train_x, train_y)
        return forest

    def get_best_params_for_DecisionTreeRegressor(self, train_x, train_y):
        """Grid-search a ``DecisionTreeRegressor`` and return the refitted model.

        Args:
            train_x: training features.
            train_y: training targets.

        Returns:
            A ``DecisionTreeRegressor`` built from the best parameters and
            fitted on the training data (also stored in
            ``self.decisionTreeReg``).
        """
        # NOTE(review): the 'mse'/'mae' criterion names and 'auto' for
        # max_features are rejected by recent scikit-learn releases -
        # confirm against the installed version.
        self.param_grid_decisionTree = {
            "criterion": ["mse", "friedman_mse", "mae"],
            "splitter": ["best", "random"],
            "max_features": ["auto", "sqrt", "log2"],
            'max_depth': range(2, 16, 2),
            'min_samples_split': range(2, 16, 2),
        }

        search = GridSearchCV(
            estimator=self.DecisionTreeReg,
            param_grid=self.param_grid_decisionTree,
            verbose=3,
            cv=5,
        )
        self.grid = search
        search.fit(train_x, train_y)

        # Keep the winning values on the instance.
        tuned = search.best_params_
        self.criterion = tuned['criterion']
        self.splitter = tuned['splitter']
        self.max_features = tuned['max_features']
        self.max_depth = tuned['max_depth']
        self.min_samples_split = tuned['min_samples_split']

        # Build a fresh model from the winning values and train it.
        tree = DecisionTreeRegressor(
            criterion=self.criterion,
            splitter=self.splitter,
            max_features=self.max_features,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
        )
        self.decisionTreeReg = tree
        tree.fit(train_x, train_y)
        return tree

    def get_best_params_for_xgboost(self, train_x, train_y):
        """Grid-search an ``XGBRegressor`` and return the refitted model.

        Args:
            train_x: training features.
            train_y: training targets.

        Returns:
            An ``XGBRegressor`` built from the best parameters and fitted on
            the training data (also stored in ``self.xgb``).
        """
        # NOTE(review): 'reg:linear' is a deprecated objective name in newer
        # xgboost releases - confirm against the installed version.
        self.param_grid_xgboost = {
            'learning_rate': [0.5, 0.1, 0.01, 0.001],
            'max_depth': [3, 5, 10, 20],
            'n_estimators': [10, 50, 100, 200],
        }

        search = GridSearchCV(
            estimator=XGBRegressor(objective='reg:linear'),
            param_grid=self.param_grid_xgboost,
            verbose=3,
            cv=5,
        )
        self.grid = search
        search.fit(train_x, train_y)

        # Keep the winning values on the instance.
        tuned = search.best_params_
        self.learning_rate = tuned['learning_rate']
        self.max_depth = tuned['max_depth']
        self.n_estimators = tuned['n_estimators']

        # Build a fresh model from the winning values and train it.
        booster = XGBRegressor(
            objective='reg:linear',
            learning_rate=self.learning_rate,
            max_depth=self.max_depth,
            n_estimators=self.n_estimators,
        )
        self.xgb = booster
        booster.fit(train_x, train_y)
        return booster

    def get_best_model(self, train_x, train_y, test_x, test_y):
        """Tune both regressors and return the one with the higher R2 on the test set.

        The ``*_error`` attributes set here hold ``r2_score`` values, so a
        larger value means a better model.

        Returns:
            ``('XGBoost', model)`` when the decision tree scores strictly
            lower than XGBoost, otherwise ``('DecisionTreeReg', model)``.
        """
        # Decision tree: tune, predict on the test split, score.
        self.decisionTreeReg = self.get_best_params_for_DecisionTreeRegressor(train_x, train_y)
        self.prediction_decisionTreeReg = self.decisionTreeReg.predict(test_x)
        self.decisionTreeReg_error = r2_score(test_y, self.prediction_decisionTreeReg)

        # XGBoost: tune, predict on the test split, score.
        self.xgboost = self.get_best_params_for_xgboost(train_x, train_y)
        self.prediction_xgboost = self.xgboost.predict(test_x)
        self.prediction_xgboost_error = r2_score(test_y, self.prediction_xgboost)

        # Ties (and incomparable scores) go to the decision tree.
        xgboost_is_better = self.decisionTreeReg_error < self.prediction_xgboost_error
        if not xgboost_is_better:
            return 'DecisionTreeReg', self.decisionTreeReg
        return 'XGBoost', self.xgboost

        

In [12]:
from sklearn.model_selection import train_test_split
from preprocessing import preprocessor
import pandas as pd

In [20]:
# Load one training batch file into the module-level ``data`` frame.
# NOTE(review): the path is absolute and machine-specific - confirm it before
# running this cell on another machine.
data = pd.read_csv('D:/cdac/CDAC_PROJECT/Untitled Folder/Training_Batch_Files/visibility_08012008_120010.csv')

In [21]:
class trainModel:
    """Train one regressor per K-Means cluster and save the best of each."""

    model_dir = "model/"

    @staticmethod
    def trainingModel():
        """Cluster the training data and fit the best model for every cluster.

        Reads the module-level ``data`` frame, drops the unused columns,
        separates the ``VISIBILITY`` target, clusters the features with
        K-Means and, for every cluster, lets ``Model_Finder`` choose between
        the tuned regressors. Each winner is saved through
        ``modelOperation.save_model`` under ``<model name><cluster id>``.

        Declared as a ``@staticmethod`` so both ``trainModel.trainingModel()``
        and ``trainModel().trainingModel()`` work.
        """
        # Bind the result to a new name: assigning to ``data`` here would make
        # it a local variable and the read on the same line would fail.
        frame = preprocessor.dropUnnecessaryCol(
            data, ['DATE', 'Precip', 'WETBULBTEMPF', 'DewPointTempF', 'StationPressure'])

        # Separate features and target.
        X, Y = preprocessor.separate_label_feature(frame, label_column_name='VISIBILITY')

        # The elbow plot gives the number of clusters to use.
        number_of_clusters = KMeansClustering.elbow_plot(X)

        # Adds the 'Cluster' column; the target is re-attached as 'Labels' so
        # that filtering by cluster keeps features and target aligned.
        X = KMeansClustering.create_cluster(X, number_of_clusters)
        X['Labels'] = Y

        for i in X['Cluster'].unique():
            cluster_data = X[X['Cluster'] == i]

            cluster_features = cluster_data.drop(['Labels', 'Cluster'], axis=1)
            cluster_label = cluster_data['Labels']

            x_train, x_test, y_train, y_test = train_test_split(
                cluster_features, cluster_label, test_size=0.3, random_state=7)

            # Fit the scaler on the training split only and reuse it for the
            # test split; fitting a second scaler on the test data would put
            # the two splits on different scales.
            # NOTE(review): the scaler is not persisted, so prediction code
            # cannot apply the same scaling - confirm how inputs are prepared
            # at prediction time.
            scaler = StandardScaler()
            x_train_scaled = scaler.fit_transform(x_train)
            x_test_scaled = scaler.transform(x_test)

            model_finder = Model_Finder()
            best_model_name, best_model = model_finder.get_best_model(
                x_train_scaled, y_train, x_test_scaled, y_test)

            # One saved model per cluster, named after the algorithm and cluster id.
            modelOperation.save_model(best_model, best_model_name + str(i))


## Prediction Part

In [15]:
class Prediction_Data_validation:
    """Validate raw prediction batch files and sort them into good/bad folders.

    Files are read from the batch directory given to the constructor.
    Accepted files are copied to ``Prediction_files_validated/Good_data`` and
    rejected ones to ``Prediction_files_validated/Bad_data``; both folders
    are relative to the current working directory and created on demand.
    """

    # Destination folders for accepted and rejected files.
    GOOD_DIR = "Prediction_files_validated/Good_data"
    BAD_DIR = "Prediction_files_validated/Bad_data"

    def __init__(self, path):
        """Remember the batch directory and the schema file to validate against."""
        self.Batch_Directory = path
        self.schema_path = 'schema_prediction.json'

    def valuesFromSchema(self):
        """Read the validation settings from the prediction schema JSON file.

        Returns:
            ``(LengthOfDateStampInFile, LengthOfTimeStampInFile, ColName,
            NumberofColumns)`` exactly as stored in the schema.

        Raises:
            KeyError: if one of those four keys is missing from the schema.
        """
        with open(self.schema_path, 'r') as f:
            schema = json.load(f)
        return (schema['LengthOfDateStampInFile'], schema['LengthOfTimeStampInFile'],
                schema['ColName'], schema['NumberofColumns'])

    @staticmethod
    def FileNameRegex():
        """Return the pattern for names such as ``visibility_08012008_120010.csv``."""
        return r"visibility_\d+_\d+\.csv$"

    def validationFileName(self, regex, LengthOfDateStampInFile, LengthOfTimeStampInFile):
        """Copy each batch file to the good or the bad folder based on its name.

        A name is accepted when it matches ``regex`` and its date and time
        stamps (second and third ``_``-separated parts of the name without
        extension) have the lengths required by the schema.

        Args:
            regex: pattern applied with ``re.match`` to every file name.
            LengthOfDateStampInFile: required length of the date stamp.
            LengthOfTimeStampInFile: required length of the time stamp.
        """
        os.makedirs(self.GOOD_DIR, exist_ok=True)
        os.makedirs(self.BAD_DIR, exist_ok=True)

        for filename in os.listdir(self.Batch_Directory):
            # Copy from the directory that was listed, not a hard-coded one.
            source = os.path.join(self.Batch_Directory, filename)
            if not os.path.isfile(source):
                continue  # sub-folders are not batch files

            parts = os.path.splitext(filename)[0].split('_')
            is_valid = (re.match(regex, filename) is not None
                        and len(parts) >= 3
                        and len(parts[1]) == LengthOfDateStampInFile
                        and len(parts[2]) == LengthOfTimeStampInFile)
            shutil.copy(source, self.GOOD_DIR if is_valid else self.BAD_DIR)

    @staticmethod
    def validateColumnLength(NumberofColumns):
        """Move files whose column count differs from the schema to the bad folder.

        Files in the good folder that pandas cannot parse (for example empty
        files) are treated as bad data as well.

        Args:
            NumberofColumns: number of columns every good file must have.
        """
        good_dir = Prediction_Data_validation.GOOD_DIR
        bad_dir = Prediction_Data_validation.BAD_DIR
        os.makedirs(bad_dir, exist_ok=True)

        for file in os.listdir(good_dir):
            source = os.path.join(good_dir, file)
            try:
                column_count = pd.read_csv(source).shape[1]  # shape is (rows, columns)
            except (pd.errors.EmptyDataError, pd.errors.ParserError):
                column_count = None

            if column_count != NumberofColumns:
                # An explicit target file name lets a stale copy be replaced.
                shutil.move(source, os.path.join(bad_dir, file))

    @staticmethod
    def deletePredictionFile():
        """Delete ``Prediction_Output_File/Predictions.csv`` if it exists."""
        output_file = 'Prediction_Output_File/Predictions.csv'
        if os.path.exists(output_file):
            os.remove(output_file)
            

## Database Operations

In [17]:
class dBOperationPredict:
    """SQLite operations for the validated prediction data.

    Good files are loaded into the ``Good_Raw_Data`` table of a database
    stored under ``Prediction_Database/`` and later exported as one CSV file.
    """

    def __init__(self):
        self.path = "Prediction_Database/"
        # The prediction pipeline validates into the Prediction_* folders, so
        # that is where the files to load are read from.
        self.badFilePath = "Prediction_files_validated/Bad_data"
        self.goodFilePath = "Prediction_files_validated/Good_data"

    def dataBaseConnectionPredict(self, DatabaseName):
        """Open ``<path>/<DatabaseName>.db``, creating folder and file if needed.

        Returns:
            An open ``sqlite3.Connection``; the caller must close it.
        """
        os.makedirs(self.path, exist_ok=True)
        return sqlite3.connect(os.path.join(self.path, DatabaseName + '.db'))

    def createTableDbPredict(self, DatabaseName, column_names):
        """Make sure ``Good_Raw_Data`` exists with every column of the schema.

        The table is created when missing; otherwise only the columns that
        are not yet present are added, so the method can be run repeatedly.

        Args:
            DatabaseName: database name without the ``.db`` suffix.
            column_names: mapping of column name to SQL type.
        """
        conn = self.dataBaseConnectionPredict(DatabaseName)
        try:
            # Second field of each table_info row is the column name; the
            # result is empty when the table does not exist yet.
            existing = {row[1] for row in conn.execute('PRAGMA table_info(Good_Raw_Data)')}
            definitions = [
                '"{}" {}'.format(name.replace('"', '""'), data_type)
                for name, data_type in column_names.items()
                if name not in existing
            ]
            if not definitions:
                return
            if existing:
                for definition in definitions:
                    conn.execute('ALTER TABLE Good_Raw_Data ADD COLUMN ' + definition)
            else:
                conn.execute('CREATE TABLE Good_Raw_Data (' + ', '.join(definitions) + ')')
            conn.commit()
        finally:
            conn.close()

    def insertIntoTableGoodDataPredict(self, Database):
        """Insert the rows of every file in the good folder into ``Good_Raw_Data``.

        The first line of each file is skipped as the header and blank lines
        are ignored. Values are bound as query parameters rather than pasted
        into the SQL text, so file content cannot alter the statement. Each
        file is committed as one transaction.

        Args:
            Database: database name without the ``.db`` suffix.
        """
        conn = self.dataBaseConnectionPredict(Database)
        try:
            for file in sorted(os.listdir(self.goodFilePath)):
                with open(os.path.join(self.goodFilePath, file), 'r', newline='') as f:
                    reader = csv.reader(f)
                    next(reader, None)  # header line; tolerate an empty file
                    rows = [row for row in reader if row]
                if not rows:
                    continue
                placeholders = ','.join('?' * len(rows[0]))
                conn.executemany(
                    'INSERT INTO Good_Raw_Data values ({})'.format(placeholders), rows)
                conn.commit()
        finally:
            conn.close()

    def selectingDatafromtableintocsvPredict(self, Database):
        """Export ``Good_Raw_Data`` to ``InputFile1.csv`` with a header row.

        Args:
            Database: database name without the ``.db`` suffix.
        """
        fileName = 'InputFile1.csv'
        conn = self.dataBaseConnectionPredict(Database)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT *  FROM Good_Raw_Data")
            # The first item of every description tuple is the column name.
            headers = [column[0] for column in cursor.description]
            results = cursor.fetchall()
        finally:
            conn.close()

        with open(fileName, 'w', newline='') as out:
            writer = csv.writer(out, delimiter=',', lineterminator='\n')
            writer.writerow(headers)
            writer.writerows(results)
        

In [1]:
class pred_validation:
    """Run the prediction-data validation pipeline from raw files to one CSV.

    Accepted files are copied to ``Prediction_files_validated/Good_data`` and
    rejected ones to ``Prediction_files_validated/Bad_data``; both folders
    are relative to the current working directory and created on demand.
    """

    # Destination folders for accepted and rejected files.
    GOOD_DIR = "Prediction_files_validated/Good_data"
    BAD_DIR = "Prediction_files_validated/Bad_data"

    def __init__(self, path):
        """Remember the batch directory and create the database helper."""
        self.Batch_Directory = path
        self.schema_path = 'schema_prediction.json'
        self.dBOperationPredict = dBOperationPredict()

    def valuesFromSchema(self):
        """Read the validation settings from the prediction schema JSON file.

        Returns:
            ``(LengthOfDateStampInFile, LengthOfTimeStampInFile, ColName,
            NumberofColumns)`` exactly as stored in the schema.

        Raises:
            KeyError: if one of those four keys is missing from the schema.
        """
        with open(self.schema_path, 'r') as f:
            schema = json.load(f)
        return (schema['LengthOfDateStampInFile'], schema['LengthOfTimeStampInFile'],
                schema['ColName'], schema['NumberofColumns'])

    @staticmethod
    def FileNameRegex():
        """Return the pattern for names such as ``visibility_08012008_120010.csv``."""
        return r"visibility_\d+_\d+\.csv$"

    def validationFileName(self, regex, LengthOfDateStampInFile, LengthOfTimeStampInFile):
        """Copy each batch file to the good or the bad folder based on its name.

        A name is accepted when it matches ``regex`` and its date and time
        stamps (second and third ``_``-separated parts of the name without
        extension) have the lengths required by the schema.

        Args:
            regex: pattern applied with ``re.match`` to every file name.
            LengthOfDateStampInFile: required length of the date stamp.
            LengthOfTimeStampInFile: required length of the time stamp.
        """
        os.makedirs(self.GOOD_DIR, exist_ok=True)
        os.makedirs(self.BAD_DIR, exist_ok=True)

        for filename in os.listdir(self.Batch_Directory):
            # Copy from the directory that was listed, not a hard-coded one.
            source = os.path.join(self.Batch_Directory, filename)
            if not os.path.isfile(source):
                continue  # sub-folders are not batch files

            parts = os.path.splitext(filename)[0].split('_')
            is_valid = (re.match(regex, filename) is not None
                        and len(parts) >= 3
                        and len(parts[1]) == LengthOfDateStampInFile
                        and len(parts[2]) == LengthOfTimeStampInFile)
            shutil.copy(source, self.GOOD_DIR if is_valid else self.BAD_DIR)

    @staticmethod
    def validateColumnLength(NumberofColumns):
        """Move files whose column count differs from the schema to the bad folder.

        Files in the good folder that pandas cannot parse (for example empty
        files) are treated as bad data as well.

        Args:
            NumberofColumns: number of columns every good file must have.
        """
        good_dir = pred_validation.GOOD_DIR
        bad_dir = pred_validation.BAD_DIR
        os.makedirs(bad_dir, exist_ok=True)

        for file in os.listdir(good_dir):
            source = os.path.join(good_dir, file)
            try:
                column_count = pd.read_csv(source).shape[1]  # shape is (rows, columns)
            except (pd.errors.EmptyDataError, pd.errors.ParserError):
                column_count = None

            if column_count != NumberofColumns:
                # An explicit target file name lets a stale copy be replaced.
                shutil.move(source, os.path.join(bad_dir, file))

    @staticmethod
    def deletePredictionFile():
        """Delete ``Prediction_Output_File/Predictions.csv`` if it exists."""
        output_file = 'Prediction_Output_File/Predictions.csv'
        if os.path.exists(output_file):
            os.remove(output_file)

    def prediction_validation(self):
        """Validate the batch files, load the good ones into SQLite and export them.

        Steps, in order: read the schema values, validate file names,
        validate column counts, create/extend the table in the ``Training``
        database, insert the good files, and export the table to CSV.
        """
        LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, noofcolumns = \
            self.valuesFromSchema()

        regex = self.FileNameRegex()
        self.validationFileName(regex, LengthOfDateStampInFile, LengthOfTimeStampInFile)
        self.validateColumnLength(noofcolumns)

        # Table creation, load and export all use the 'Training' database.
        self.dBOperationPredict.createTableDbPredict('Training', column_names)
        self.dBOperationPredict.insertIntoTableGoodDataPredict('Training')
        self.dBOperationPredict.selectingDatafromtableintocsvPredict('Training')
            


In [19]:
class prediction:
    """Predict with the saved per-cluster models and write the results to CSV."""

    def __init__(self, path):
        """Create the prediction-file validator for the batch directory ``path``."""
        self.pred_data_val = Prediction_Data_validation(path)

    def predictionFromModel(self, data=None):
        """Assign rows to clusters, predict each cluster and save the predictions.

        Args:
            data: optional pandas DataFrame of prediction inputs. When
                omitted, ``InputFile1.csv`` is read.
                NOTE(review): that is the file written by
                ``dBOperationPredict.selectingDatafromtableintocsvPredict``;
                confirm it is the intended default input.

        Returns:
            The path of the written file,
            ``Prediction_Output_File/Predictions.csv``. The file has one
            ``Predictions`` column whose rows follow the order of the input.
        """
        if data is None:
            data = pd.read_csv('InputFile1.csv')

        # Returns a new frame, so the caller's frame is left untouched.
        data = preprocessor.dropUnnecessaryCol(
            data, ['DATE', 'Precip', 'WETBULBTEMPF', 'DewPointTempF', 'StationPressure'])

        kmeans = modelOperation.load_model('KMeans')
        data['clusters'] = kmeans.predict(data)

        # NOTE(review): the features are passed to the models unscaled; confirm
        # this matches how the models were trained.
        per_cluster = []
        for i in data['clusters'].unique():
            cluster_data = data[data['clusters'] == i].drop(['clusters'], axis=1)
            model_name = modelOperation.find_correct_model_file(i)
            model = modelOperation.load_model(model_name)
            # Keep the row index so predictions can be put back in input order.
            per_cluster.append(
                pd.Series(model.predict(cluster_data.values), index=cluster_data.index))

        if per_cluster:
            result = pd.concat(per_cluster).sort_index().to_frame('Predictions')
        else:
            result = pd.DataFrame(columns=['Predictions'])

        path = "Prediction_Output_File/Predictions.csv"
        os.makedirs(os.path.dirname(path), exist_ok=True)
        result.to_csv(path, header=True)

        return path
    

## Flask  Application

In [None]:
from flask import Flask, request, render_template
from flask import Response
import os

# Flask application object; the route decorators in this cell register their
# handlers on it.
app = Flask(__name__)

@app.route("/", methods=['GET'])
def home():
    """Serve the landing page rendered from the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page

@app.route("/predict", methods=['POST'], endpoint='prediction')
def predictRouteClient():
    """Validate the prediction files under ``filepath`` and run the prediction.

    Expects a form field ``filepath``; a request without it is rejected by
    Flask with a 400 response.

    The view function is deliberately not called ``prediction``: a function
    of that name would replace the ``prediction`` class in this module, and
    ``prediction(path)`` below would then call the view itself. The Flask
    endpoint name is kept as ``'prediction'``.

    Returns:
        A response naming the file the predictions were written to.
    """
    path = request.form['filepath']

    # Validate the batch files and export them for prediction.
    pred_validation(path).prediction_validation()

    # Predict for the exported data set.
    pred = prediction(path)
    path = pred.predictionFromModel()
    return Response("Prediction File created at %s!!!" % path)


@app.route("/train", methods=['POST'])
def trainRouteClient():
    """Validate the training files under ``folderPath`` and train the models.

    Expects a JSON body with a ``folderPath`` entry.

    Returns:
        ``"Training successfull!!"`` once validation and training have run,
        otherwise a response starting with ``"Error Occurred!"`` followed by
        the reason.
    """
    try:
        payload = request.get_json(silent=True) or {}
        path = payload.get('folderPath')
        if path is None:
            # Without a folder nothing was trained, so do not report success.
            return Response("Error Occurred! %s" % "'folderPath' is required")

        train_valObj = train_validation(path)
        train_valObj.train_validation()

        # Train the models for the validated data.
        trainModel.trainingModel()

    except Exception as e:
        # Report the caught error itself rather than the exception class.
        return Response("Error Occurred! %s" % e)

    return Response("Training successfull!!")


# Start the development server only when this file is executed directly.
if __name__ == "__main__":
    # NOTE(review): debug=True turns on Flask's interactive debugger and the
    # reloader; confirm it is disabled wherever untrusted clients can connect.
    app.run(debug=True)