In [154]:
#import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestRegressor
from pdpbox.pdp import pdp_isolate, pdp_plot

In [11]:
#import data
df = pd.read_csv('GlobalTemperatures.csv')

In [12]:
#explore data
display(df.head())
display(df.columns)
display(df.info())
df.isnull().sum()

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,3.034,3.574,,,,,,
1,1750-02-01,3.083,3.702,,,,,,
2,1750-03-01,5.626,3.076,,,,,,
3,1750-04-01,8.49,2.451,,,,,,
4,1750-05-01,11.573,2.072,,,,,,


Index(['dt', 'LandAverageTemperature', 'LandAverageTemperatureUncertainty',
       'LandMaxTemperature', 'LandMaxTemperatureUncertainty',
       'LandMinTemperature', 'LandMinTemperatureUncertainty',
       'LandAndOceanAverageTemperature',
       'LandAndOceanAverageTemperatureUncertainty'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3192 entries, 0 to 3191
Data columns (total 9 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   dt                                         3192 non-null   object 
 1   LandAverageTemperature                     3180 non-null   float64
 2   LandAverageTemperatureUncertainty          3180 non-null   float64
 3   LandMaxTemperature                         1992 non-null   float64
 4   LandMaxTemperatureUncertainty              1992 non-null   float64
 5   LandMinTemperature                         1992 non-null   float64
 6   LandMinTemperatureUncertainty              1992 non-null   float64
 7   LandAndOceanAverageTemperature             1992 non-null   float64
 8   LandAndOceanAverageTemperatureUncertainty  1992 non-null   float64
dtypes: float64(8), object(1)
memory usage: 224.6+ KB


None

dt                                              0
LandAverageTemperature                         12
LandAverageTemperatureUncertainty              12
LandMaxTemperature                           1200
LandMaxTemperatureUncertainty                1200
LandMinTemperature                           1200
LandMinTemperatureUncertainty                1200
LandAndOceanAverageTemperature               1200
LandAndOceanAverageTemperatureUncertainty    1200
dtype: int64

In [16]:
#cleanup data
#method for cleaning up data
def clean_up_df(df):
    df = df.copy()
    df = df.drop(columns=['LandAverageTemperatureUncertainty', 'LandMaxTemperatureUncertainty', 'LandMinTemperatureUncertainty', 'LandAndOceanAverageTemperatureUncertainty'], axis=1)
    return df

In [51]:
#call method and display cleaned version
df_clean = clean_up_df(df)
display(df_clean.head())
display(df_clean.info())

Unnamed: 0,dt,LandAverageTemperature,LandMaxTemperature,LandMinTemperature,LandAndOceanAverageTemperature
0,1750-01-01,3.034,,,
1,1750-02-01,3.083,,,
2,1750-03-01,5.626,,,
3,1750-04-01,8.49,,,
4,1750-05-01,11.573,,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3192 entries, 0 to 3191
Data columns (total 5 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   dt                              3192 non-null   object 
 1   LandAverageTemperature          3180 non-null   float64
 2   LandMaxTemperature              1992 non-null   float64
 3   LandMinTemperature              1992 non-null   float64
 4   LandAndOceanAverageTemperature  1992 non-null   float64
dtypes: float64(4), object(1)
memory usage: 124.8+ KB


None

In [52]:
#convert dt column to datetime
df_clean['dt'] = pd.to_datetime(df['dt'])
df_clean['Month'] = df_clean['dt'].dt.month
df_clean['Year'] = df_clean['dt'].dt.year
df_clean = df_clean.drop('dt', axis=1)
df_clean = df_clean.drop('Month', axis=1)
df_clean = df_clean[df_clean.Year >= 1915]

#set index to Year column
df_clean = df_clean.set_index(['Year'])

#drop NaN values
df_clean = df_clean.dropna()

#display final data
display(df_clean.head())

Unnamed: 0_level_0,LandAverageTemperature,LandMaxTemperature,LandMinTemperature,LandAndOceanAverageTemperature
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1915,2.299,7.901,-3.416,13.32
1915,3.265,8.931,-2.52,13.675
1915,5.143,11.024,-1.115,14.32
1915,8.705,14.607,2.755,15.292
1915,11.493,17.141,5.371,16.067


In [54]:
#target vector
target = 'LandAverageTemperature'
Y = df_clean[target]

#feature matrix
X = df_clean[['LandMaxTemperature', 'LandMinTemperature', 'LandAndOceanAverageTemperature']]

In [137]:
#train Test our X features & Y target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(909, 3)
(303, 3)
(909,)
(303,)


In [138]:
#calculate MAE
y_pred = [Y_train.mean()] * len(Y_train)
print('Baseline MAE:', round(mean_absolute_error(Y_train, y_pred), 5))

Baseline MAE: 3.76596


In [139]:
lr = make_pipeline(StandardScaler(), LinearRegression())
lr.fit(X_train, Y_train)

print('Linear Regression Training MAE:', round(mean_absolute_error(Y_train, lr.predict(X_train)), 5))
print('Linear Regression Test MAE:', round(mean_absolute_error(Y_test, lr.predict(X_test)), 5))

Linear Regression Training MAE: 0.04882
Linear Regression Test MAE: 0.05093


In [140]:
y_pred = lr.predict(X_test)
errors = abs(y_pred - Y_test)
mape = 100 * (errors / Y_train)
accuracy = 100 - np.mean(mape)
print('Linear Regression Model Prediction Accuracy:', round(accuracy, 2), '%.')

Linear Regression Model Prediction Accuracy: 99.18 %.


In [145]:
forestModel = make_pipeline(
    SelectKBest(k="all"),
    StandardScaler(),
    RandomForestRegressor(
        n_estimators=100,
        max_depth=50,
        random_state=77,
        n_jobs=-1
    )
)
forestModel.fit(X_train, Y_train)

print('Random Forest Regressor Model Training MAE:', mean_absolute_error(Y_train, forestModel.predict(X_train)))
print('Random Forest Regressor Model Validation MAE:', mean_absolute_error(Y_test, forestModel.predict(X_test)))

Random Forest Regressor Model Training MAE: 0.023445291529152472
Random Forest Regressor Model Validation MAE: 0.06262871287128695


In [151]:
y_pred = forestModel.predict(X_test)
errorss = abs(y_pred - Y_test)
mapee = 100 * (errorss / Y_train)
accuracyy = 100 - np.mean(mapee)
print('Random Forest Regressor Model Prediction Accuracy:', round(accuracyy, 2), '%.')

Random Forest Regressor Model Prediction Accuracy: 98.99 %.
