In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Show every column when displaying wide frames. The fully-qualified key is
# required: the short form 'max_columns' matches more than one option on
# recent pandas versions and is rejected as ambiguous.
pd.set_option('display.max_columns', None)

In [None]:
# Read the three yearly CSV exports, in chronological order, from the working directory.
df2016, df2017, df2018 = map(pd.read_csv, ('./2016.csv', './2017.csv', './2018.csv'))

In [None]:
# Stack the yearly frames with a fresh 0..n-1 index, then discard the
# "Unnamed: 0" column.
df = (
    pd.concat([df2016, df2017, df2018], ignore_index=True)
    .drop(columns="Unnamed: 0")
)

## Extracting month, hour and year from the time column

In [None]:
# Parse the 'time' strings once, then derive the calendar features from the
# parsed column. Entries are evaluated in order, so the later lambdas already
# see the datetime version of 'time'.
df = df.assign(
    time=lambda frame: pd.to_datetime(frame['time'], format="%Y-%m-%d %H:%M:%S"),
    month=lambda frame: frame['time'].dt.month,
    hours=lambda frame: frame['time'].dt.hour,
    year=lambda frame: frame['time'].dt.year,
)

## Keeping only the rows with a weather delay greater than zero

In [None]:
# Keep only the rows whose WEATHER_DELAY is present and strictly positive.
# .copy() makes `findf` an independent frame, so adding or replacing columns
# on it later does not raise SettingWithCopyWarning.
findf = df.loc[df['WEATHER_DELAY'].notna() & (df['WEATHER_DELAY'] > 0)].copy()

## Setting the categorical columns

In [None]:
# Columns to store with pandas' categorical dtype.
categorical_cols = ['month', 'hours', 'Wind Speed', 'Precipitation']

# Build a new frame in one step instead of assigning column-by-column into
# `findf`: writing into a frame obtained by boolean filtering can raise
# SettingWithCopyWarning, and one expression replaces four copy-pasted lines.
# Existing columns keep their position; only their dtype changes.
findf = findf.assign(**{col: pd.Categorical(findf[col]) for col in categorical_cols})

# data preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Predictor columns; WEATHER_DELAY is the regression target.
feature_columns = ['Temperature','Dew Point','Humidity','Wind Speed','Pressure','Precipitation','month','hours']

# 80/20 split. The fixed seed makes the split (and everything trained on it)
# reproducible across kernel restarts.
xtrain, xtest, ytrain, ytest = train_test_split(
    findf[feature_columns], findf['WEATHER_DELAY'], test_size=0.2, random_state=42)

# Scale every feature to [0, 1]. The scaler is fitted on the training split
# only and then reused for the test split, so no test information leaks in.
# Naming: xtrain/xtest are the raw frames, x_train/x_test the scaled arrays.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(xtrain)
x_test = scaler.transform(xtest)

# models

In [None]:
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from sklearn.mixture import GaussianMixture
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRFRegressor
from sklearn.ensemble import RandomForestRegressor

## the final function

In [None]:
import joblib
import time 
import pickle

In [None]:
def savemodel(model_file_name, clf=None, model_directory="/home/shaury/Desktop/pvsc/alibaba"):
    """Serialize ``clf`` with joblib to ``model_file_name``.

    Previously the function could only ever write ``None`` and silently
    replaced its own argument with a fixed path; the object to save and the
    extra output directory are now parameters whose defaults reproduce the
    old behaviour.

    Parameters
    ----------
    model_file_name : str or path-like
        Destination file for the serialized object.
    clf : object, optional
        Object to persist. Defaults to ``None`` (what the function always
        wrote before this parameter existed).
    model_directory : str or None, optional
        If not ``None``, a second copy is written to
        ``<model_directory>/model.pkl``. NOTE(review): the default is an
        absolute path from one specific machine; pass ``None`` (or another
        directory) anywhere else.
    """
    joblib.dump(clf, model_file_name)
    if model_directory is not None:
        joblib.dump(clf, f'{model_directory}/model.pkl')

In [None]:
def main_model_function(x_train,ytrain):
    """Tune the base learners and fit a stacked regressor on the training data.

    Steps, in order:
      1. choose a ``GaussianMixture`` component count in 1..39 by the lowest
         mean absolute error between its predictions and ``ytrain``;
      2. grid-search the Lasso ``alpha``;
      3. configure a random forest with fixed hyper-parameters;
      4. grid-search the XGBoost ``learning_rate``;
      5. fit a ``StackingCVRegressor`` built from fresh (unfitted) instances of
         the four learners, using the XGBoost configuration as meta-regressor.

    Parameters
    ----------
    x_train : array-like
        Training features, passed unchanged to every estimator's ``fit``.
    ytrain : array-like
        Training targets.

    Returns
    -------
    StackingCVRegressor
        The fitted stacking ensemble.

    Side effects
    ------------
    Prints the elapsed wall-clock time in seconds, plus the grid searches'
    ``verbose=5`` progress output.
    """
    start = time.time()

    # --- Gaussian mixture: pick the component count with the lowest MAE ---
    # NOTE(review): GaussianMixture is a clustering model, so this MAE compares
    # its predicted labels with the targets - confirm that this is the intended
    # selection criterion.
    # Start from +inf (not a fixed 1000) so a best candidate is always chosen
    # and n_components can never be left at the invalid value 0.
    best_err, best_k = np.inf, 0
    for k in range(1, 40):
        candidate = GaussianMixture(n_components=k)
        candidate.fit(x_train, ytrain)
        # Predict once per candidate and reuse the score.
        cand_err = mean_absolute_error(candidate.predict(x_train), ytrain)
        if cand_err < best_err:
            best_err, best_k = cand_err, k
    gbmodel = GaussianMixture(n_components=best_k)

    # --- Lasso: grid-search alpha over 10**-7 .. ~10**-0.1 (log-spaced) ---
    # max_iter must be an integer; a float such as 1e7 is rejected by the
    # parameter validation of recent scikit-learn releases.
    lasso_max_iter = 10_000_000
    alphas = 10**np.arange(-7,0,0.1)
    lassocv = GridSearchCV(Lasso(max_iter=lasso_max_iter),
                           param_grid={"alpha": alphas}, verbose=5)
    lassocv.fit(x_train, ytrain)
    lassomodel = Lasso(alpha=lassocv.best_params_['alpha'], max_iter=lasso_max_iter)

    # --- Random forest: fixed hyper-parameters, no search ---
    rfc = RandomForestRegressor(n_estimators=200, max_depth=15)

    # --- XGBoost: grid-search the learning rate over 10**-2 .. ~10**0.19 ---
    # NOTE(review): the upper end of this grid exceeds 1.0 - confirm intended.
    lrate = 10**(np.arange(-2,0.2,0.01))
    cvxg = GridSearchCV(XGBRegressor(n_estimators=150),
                        param_grid={"learning_rate": lrate}, verbose=5).fit(x_train, ytrain)
    xgbmodel = XGBRegressor(n_estimators=150,
                            learning_rate=cvxg.best_params_['learning_rate'])

    # --- Stack the four learners; the tuned XGBoost config is the meta-model ---
    stack = StackingCVRegressor(regressors=(gbmodel, lassomodel, rfc, xgbmodel),
                                meta_regressor=xgbmodel, cv=10,
                                use_features_in_secondary=True,
                                store_train_meta_features=True,
                                shuffle=False,
                                random_state=42)
    stack.fit(x_train, ytrain)

    # Elapsed seconds (end - start); the operands used to be swapped, which
    # printed a negative duration.
    print(time.time() - start)
    return stack

In [None]:
# Train the stacked regressor on the scaled training features and their targets.
model = main_model_function(x_train=x_train, ytrain=ytrain)

In [None]:
import pickle

# Persist the fitted model to a pickle file in the working directory.
Pkl_Filename = "./model.pkl"

with open(Pkl_Filename, mode='wb') as model_file:
    pickle.dump(model, model_file)