# Earth's current temperature and CO/CO2 level modeller (predictor) for given date/time.

* Ideally, all of these datasets would contain data recorded on an hourly basis.
* Unfortunately, I could not find anything like that.
* So I used the closest, most realistic and most scientifically accurate datasets available.

## The first part of this notebook is dedicated to training the temperature prediction models.

* Dataset used : 
                https://www.climatologylab.org/gridmet.html

### The purpose of training is to predict future temperature values for a given location and date/time.
### As a result, we can detect anomalies in the gathered satellite sensor data.


In [None]:
!pip3 install sklearn
!pip3 install xgboost
!pip3 install pandas
!pip3 install sklearn

In [1]:
import os

# Directory that anchors the dataset and model file paths.
# `__file__` is not defined when this code runs inside a notebook kernel, so
# fall back to the current working directory in that case.
try:
    ROOT_PATH = os.path.dirname(os.path.realpath(__file__))
except NameError:
    ROOT_PATH = os.getcwd()

# Input datasets, located one directory above ROOT_PATH.
TEMP_CSV_PATH = os.path.join(ROOT_PATH, "..", "output.csv")
EPA_CSV_PATH = os.path.join(ROOT_PATH, "..", "epa_co_clean.csv")

# Fraction of the rows used for training; the remainder forms the test set.
TRAIN_TEST_RATIO = 0.8

In [2]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import accuracy_score,mean_squared_error
import pandas as pd
from sklearn.ensemble import *
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from xgboost import *
import pickle
from sklearn.pipeline import Pipeline

In [2]:
# Read the temperature data.
# Only the first 2,000,000 rows are used, so let the parser stop there instead
# of loading the whole file and slicing it afterwards.
# NOTE(review): this reads "output.csv" relative to the working directory, while
# TEMP_CSV_PATH points at "../output.csv" next to ROOT_PATH — confirm which
# location is intended.
df = pd.read_csv("output.csv", nrows=2000000)

# Drop "Unnamed: 0" (presumably a saved index column — TODO confirm) and
# "radiation" so neither is used as a feature.
df = df.drop(["Unnamed: 0", "radiation"], axis=1)

In [12]:
# Expand the `date` column (parsed as YYYYMMDD) into numeric month/year/day
# features, then remove the raw column.
parsed_dates = pd.to_datetime(df["date"], format="%Y%m%d")
df["month"] = parsed_dates.dt.month
df["year"] = parsed_dates.dt.year
df["day"] = parsed_dates.dt.day
df.drop(["date"], axis=1, inplace=True)

# Split by row position without shuffling: the first TRAIN_TEST_RATIO share of
# the rows is the training set and the rest is the test set. `max_temp` is the
# target; every other column is a feature. `.iloc` is used for all four pieces
# so the slicing is explicitly positional.
split_at = int(TRAIN_TEST_RATIO * len(df))
features = df.drop(["max_temp"], axis=1)
target = df["max_temp"]

Y_train = target.iloc[:split_at]
X_train = features.iloc[:split_at]
Y_test = target.iloc[split_at:]
X_test = features.iloc[split_at:]


In [None]:
# Neural-network regressor: features are standardised by the "scale" step
# before they reach the MLP in the "NN" step.
nn_pipe = Pipeline(
    steps=[
        ("scale", StandardScaler()),
        ("NN", MLPRegressor(max_iter=500, random_state=1)),
    ]
)

In [None]:
# Candidate regressors, all with default hyper-parameters.
# Each one is seeded with the same random_state the MLP pipeline uses, so
# repeated runs produce the same models and the same "best model" ranking.
adab_reg = AdaBoostRegressor(random_state=1)
random_reg = RandomForestRegressor(random_state=1)
extra_reg = ExtraTreesRegressor(random_state=1)
gradient_reg = GradientBoostingRegressor(random_state=1)
xgb_reg = XGBRegressor(random_state=1)

In [None]:
# Fit every regressor on the training split, in the same order as before,
# printing a progress line after each one completes.
# The first label is "AdaBoost": that is the model being fitted (AdaGrad is an
# optimiser, not this estimator).
for label, model in (
    ("AdaBoost", adab_reg),
    ("RandomForest", random_reg),
    ("Extratree", extra_reg),
    ("Gradient", gradient_reg),
    ("XGBoost", xgb_reg),
    ("FullyConnected network", nn_pipe),
):
    model.fit(X_train, Y_train)
    print(f"{label} finished")

In [19]:
# Predict on the held-out test features with each fitted model; the results are
# unpacked in the same order as the models are listed.
(
    predict_ada,
    predict_random,
    predict_extra,
    predict_gradient,
    predict_xgb,
    predict_nn,
) = [
    regressor.predict(X_test)
    for regressor in (
        adab_reg,
        random_reg,
        extra_reg,
        gradient_reg,
        xgb_reg,
        nn_pipe,
    )
]

In [None]:
# Mean squared error of every model on the test split, keyed by model name.
# Each score is computed once here and reused for the report below.
best_model = {
    "Adaboost": mean_squared_error(Y_test, predict_ada),
    "RandomForest": mean_squared_error(Y_test, predict_random),
    "ExtraTrees": mean_squared_error(Y_test, predict_extra),
    "GradientBoost": mean_squared_error(Y_test, predict_gradient),
    "XGBoost": mean_squared_error(Y_test, predict_xgb),
    "FullyConnected": mean_squared_error(Y_test, predict_nn),
}

# The (name, mse) pair with the smallest error is the best performer.
print(f"*** Best performed model is {min(best_model.items(),key=lambda x:x[1])} ***")

# Per-model report; the short labels differ from the dictionary keys, so map
# each label to the key whose score it prints.
for label, name in (
    ("ADA", "Adaboost"),
    ("RANDOM", "RandomForest"),
    ("EXTRA", "ExtraTrees"),
    ("GRADIENT", "GradientBoost"),
    ("XGB", "XGBoost"),
    ("FULLYCONNECTED", "FullyConnected"),
):
    print(f"{label} : {best_model[name]}")


In [None]:
# Save the fitted temperature models as pickle files under ROOT_PATH.
# Each file is opened in a `with` block so its handle is flushed and closed as
# soon as the pickle has been written, instead of being left open.
for file_name, model in (
    ("adaboost.pkl", adab_reg),
    ("randomforest.pkl", random_reg),
    ("extratree.pkl", extra_reg),
    ("gradientboost.pkl", gradient_reg),
    ("xgboost.pkl", xgb_reg),
    ("fullyconnected.pkl", nn_pipe),
):
    with open(os.path.join(ROOT_PATH, file_name), "wb") as model_file:
        pickle.dump(model, model_file)

# Let's improve the already-trained models with a simple ensemble technique

In [3]:
def averaged_models(model1,model2,model3,model4,model5,model6,test_input):
    """Return the unweighted mean of six models' predictions for `test_input`.

    Each model must expose ``predict(test_input)``. The six results are added
    in argument order and the sum is divided by six.
    """
    ensemble = (model1, model2, model3, model4, model5, model6)
    predictions = [member.predict(test_input) for member in ensemble]

    # Build the running sum with `+` (not `+=`) so no prediction object is
    # modified in place.
    total = predictions[0]
    for prediction in predictions[1:]:
        total = total + prediction
    return total / 6


In [None]:
# Average the test-set predictions of the six fitted regressors.
# Despite its name, `avg_predictor` holds the averaged predictions, not a model.
avg_predictor = averaged_models(
    model1=adab_reg,
    model2=random_reg,
    model3=extra_reg,
    model4=gradient_reg,
    model5=xgb_reg,
    model6=nn_pipe,
    test_input=X_test,
)

#

# EPA CO/CO2 dataset training

* Dataset used : 
                 https://www.kaggle.com/epa/carbon-monoxide/code


In [None]:
# Read the EPA dataset. Only the first 150,000 rows are used, so let the parser
# stop there instead of loading the whole file and slicing it afterwards.
df_epa = pd.read_csv(EPA_CSV_PATH, nrows=150000)

In [None]:
# Split the EPA data by row position without shuffling: the first
# TRAIN_TEST_RATIO share of the rows is the training set and the rest is the
# test set. `arithmetic_mean` is the target; every other column is a feature.
# `.iloc` is used for all four pieces so the slicing is explicitly positional.
# NOTE(review): assumes every remaining column is numeric — confirm against the
# cleaned CSV.
split_at = int(TRAIN_TEST_RATIO * len(df_epa))
epa_features = df_epa.drop(["arithmetic_mean"], axis=1)
epa_target = df_epa["arithmetic_mean"]

Y_train = epa_target.iloc[:split_at]
X_train = epa_features.iloc[:split_at]
Y_test = epa_target.iloc[split_at:]
X_test = epa_features.iloc[split_at:]

In [None]:

# Fresh, unfitted neural-network pipeline for the EPA data: features are
# standardised by the "scale" step before they reach the MLP in the "NN" step.
nn_pipe = Pipeline(
    steps=[
        ("scale", StandardScaler()),
        ("NN", MLPRegressor(max_iter=500, random_state=1)),
    ]
)

In [None]:
# Fresh, unfitted regressors for the EPA data, all with default hyper-parameters.
# Each one is seeded with the same random_state the MLP pipeline uses, so
# repeated runs produce the same models and the same "best model" ranking.
adab_reg = AdaBoostRegressor(random_state=1)
random_reg = RandomForestRegressor(random_state=1)
extra_reg = ExtraTreesRegressor(random_state=1)
gradient_reg = GradientBoostingRegressor(random_state=1)
xgb_reg = XGBRegressor(random_state=1)

In [None]:
# Fit every regressor on the EPA training split, in the same order as before,
# printing a progress line after each one completes.
# The first label is "AdaBoost": that is the model being fitted (AdaGrad is an
# optimiser, not this estimator).
for label, model in (
    ("AdaBoost", adab_reg),
    ("RandomForest", random_reg),
    ("Extratree", extra_reg),
    ("Gradient", gradient_reg),
    ("XGBoost", xgb_reg),
    ("FullyConnected network", nn_pipe),
):
    model.fit(X_train, Y_train)
    print(f"{label} finished")

In [None]:
# Predict on the EPA test split with the models fitted on the EPA data.
# These must be regenerated here: the `predict_*` arrays left over from the
# temperature section come from different models and a different test set, so
# scoring them against the EPA `Y_test` would be wrong.
predict_ada = adab_reg.predict(X_test)
predict_random = random_reg.predict(X_test)
predict_extra = extra_reg.predict(X_test)
predict_gradient = gradient_reg.predict(X_test)
predict_xgb = xgb_reg.predict(X_test)
predict_nn = nn_pipe.predict(X_test)

# Mean squared error of every model on the EPA test split, keyed by model name.
# Each score is computed once here and reused for the report below.
best_model = {
    "Adaboost": mean_squared_error(Y_test, predict_ada),
    "RandomForest": mean_squared_error(Y_test, predict_random),
    "ExtraTrees": mean_squared_error(Y_test, predict_extra),
    "GradientBoost": mean_squared_error(Y_test, predict_gradient),
    "XGBoost": mean_squared_error(Y_test, predict_xgb),
    "FullyConnected": mean_squared_error(Y_test, predict_nn),
}

# The (name, mse) pair with the smallest error is the best performer.
print(f"*** Best performed model is {min(best_model.items(),key=lambda x:x[1])} ***")

# Per-model report; the short labels differ from the dictionary keys, so map
# each label to the key whose score it prints.
for label, name in (
    ("ADA", "Adaboost"),
    ("RANDOM", "RandomForest"),
    ("EXTRA", "ExtraTrees"),
    ("GRADIENT", "GradientBoost"),
    ("XGB", "XGBoost"),
    ("FULLYCONNECTED", "FullyConnected"),
):
    print(f"{label} : {best_model[name]}")


In [None]:
# Save the fitted EPA models as pickle files under ROOT_PATH, using an "EPA_"
# prefix so they do not overwrite the temperature models.
# Each file is opened in a `with` block so its handle is flushed and closed as
# soon as the pickle has been written, instead of being left open.
for file_name, model in (
    ("EPA_adaboost.pkl", adab_reg),
    ("EPA_randomforest.pkl", random_reg),
    ("EPA_extratree.pkl", extra_reg),
    ("EPA_gradientboost.pkl", gradient_reg),
    ("EPA_xgboost.pkl", xgb_reg),
    ("EPA_fullyconnected.pkl", nn_pipe),
):
    with open(os.path.join(ROOT_PATH, file_name), "wb") as model_file:
        pickle.dump(model, model_file)

In [None]:
# Average the EPA test-set predictions of the six fitted regressors.
# Despite its name, `avg_predictor` holds the averaged predictions, not a model.
avg_predictor = averaged_models(
    model1=adab_reg,
    model2=random_reg,
    model3=extra_reg,
    model4=gradient_reg,
    model5=xgb_reg,
    model6=nn_pipe,
    test_input=X_test,
)