Import Packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

Read CSV Files

In [2]:
glt_country=pd.read_csv("GlobalLandTemperaturesByCountry.csv")
glt_city=pd.read_csv("GlobalLandTemperaturesByMajorCity.csv")
glt_state=pd.read_csv("GlobalLandTemperaturesByState.csv")
glt=pd.read_csv("GlobalTemperatures.csv")

In [3]:
glt

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,3.034,3.574,,,,,,
1,1750-02-01,3.083,3.702,,,,,,
2,1750-03-01,5.626,3.076,,,,,,
3,1750-04-01,8.490,2.451,,,,,,
4,1750-05-01,11.573,2.072,,,,,,
...,...,...,...,...,...,...,...,...,...
3187,2015-08-01,14.755,0.072,20.699,0.110,9.005,0.170,17.589,0.057
3188,2015-09-01,12.999,0.079,18.845,0.088,7.199,0.229,17.049,0.058
3189,2015-10-01,10.801,0.102,16.450,0.059,5.232,0.115,16.290,0.062
3190,2015-11-01,7.433,0.119,12.892,0.093,2.157,0.106,15.252,0.063


In [5]:
glt_city

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1849-01-01,26.704,1.435,Abidjan,Côte D'Ivoire,5.63N,3.23W
1,1849-02-01,27.434,1.362,Abidjan,Côte D'Ivoire,5.63N,3.23W
2,1849-03-01,28.101,1.612,Abidjan,Côte D'Ivoire,5.63N,3.23W
3,1849-04-01,26.140,1.387,Abidjan,Côte D'Ivoire,5.63N,3.23W
4,1849-05-01,25.427,1.200,Abidjan,Côte D'Ivoire,5.63N,3.23W
...,...,...,...,...,...,...,...
239172,2013-05-01,18.979,0.807,Xian,China,34.56N,108.97E
239173,2013-06-01,23.522,0.647,Xian,China,34.56N,108.97E
239174,2013-07-01,25.251,1.042,Xian,China,34.56N,108.97E
239175,2013-08-01,24.528,0.840,Xian,China,34.56N,108.97E


In [7]:
glt_country

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Åland
1,1743-12-01,,,Åland
2,1744-01-01,,,Åland
3,1744-02-01,,,Åland
4,1744-03-01,,,Åland
...,...,...,...,...
577457,2013-05-01,19.059,1.022,Zimbabwe
577458,2013-06-01,17.613,0.473,Zimbabwe
577459,2013-07-01,17.000,0.453,Zimbabwe
577460,2013-08-01,19.759,0.717,Zimbabwe


In [9]:
glt_state

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,State,Country
0,1855-05-01,25.544,1.171,Acre,Brazil
1,1855-06-01,24.228,1.103,Acre,Brazil
2,1855-07-01,24.371,1.044,Acre,Brazil
3,1855-08-01,25.427,1.073,Acre,Brazil
4,1855-09-01,25.675,1.014,Acre,Brazil
...,...,...,...,...,...
645670,2013-05-01,21.634,0.578,Zhejiang,China
645671,2013-06-01,24.679,0.596,Zhejiang,China
645672,2013-07-01,29.272,1.340,Zhejiang,China
645673,2013-08-01,29.202,0.869,Zhejiang,China


Check for nulls

In [4]:
glt.isnull().sum()

dt                                              0
LandAverageTemperature                         12
LandAverageTemperatureUncertainty              12
LandMaxTemperature                           1200
LandMaxTemperatureUncertainty                1200
LandMinTemperature                           1200
LandMinTemperatureUncertainty                1200
LandAndOceanAverageTemperature               1200
LandAndOceanAverageTemperatureUncertainty    1200
dtype: int64

In [6]:
glt_city.isnull().sum()

dt                                   0
AverageTemperature               11002
AverageTemperatureUncertainty    11002
City                                 0
Country                              0
Latitude                             0
Longitude                            0
dtype: int64

In [8]:
glt_country.isnull().sum()

dt                                   0
AverageTemperature               32651
AverageTemperatureUncertainty    31912
Country                              0
dtype: int64

In [10]:
glt_state.isnull().sum()

dt                                   0
AverageTemperature               25648
AverageTemperatureUncertainty    25648
State                                0
Country                              0
dtype: int64

In [11]:
# glt.dropna(inplace=True)
# glt

In [12]:
# glt_country.dropna(inplace=True)
# glt_country

In [13]:
# glt_state.dropna(inplace=True)
# glt_state

In [14]:
# glt_city.dropna(inplace=True)
# glt_city

In [16]:
def preprocess_temperature_data(df,date_col='dt'):
    df[date_col]=pd.to_datetime(df[date_col])
    df=df.sort_values(by=date_col)
    df.interpolate(method='linear',inplace=True)
    return df

state_temp=preprocess_temperature_data(glt_state)
country_temp=preprocess_temperature_data(glt_country)
city_temp=preprocess_temperature_data(glt_city)
global_temp=preprocess_temperature_data(glt)

  df.interpolate(method='linear',inplace=True)
  df.interpolate(method='linear',inplace=True)
  df.interpolate(method='linear',inplace=True)


In [17]:
merged_data = pd.merge_asof(global_temp, country_temp, on='dt', direction='nearest')
merged_data = pd.merge_asof(merged_data, state_temp, on='dt', direction='nearest')
merged_data = pd.merge_asof(merged_data, city_temp, on='dt', direction='nearest')

In [18]:
merged_data['year'] = merged_data['dt'].dt.year
merged_data['month'] = merged_data['dt'].dt.month


In [19]:
location_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_locations = location_encoder.fit_transform(merged_data[['Country', 'State', 'City']])

In [20]:
merged_data = pd.concat([merged_data, pd.DataFrame(encoded_locations, index=merged_data.index)], axis=1)

In [29]:
features = ['year', 'month', 'LandAverageTemperature', 'AverageTemperature_x', 'AverageTemperature_y'] 
X = merged_data[features]
y = merged_data['LandAndOceanAverageTemperature']


In [22]:
imputer = SimpleImputer(strategy='mean')
y = imputer.fit_transform(y.values.reshape(-1, 1)).ravel()

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [24]:
model1=LinearRegression()
model2=RandomForestRegressor()
model3=DecisionTreeRegressor()
model4=SVR()

In [25]:
model1.fit(X_train,y_train)
model2.fit(X_train,y_train)
model3.fit(X_train,y_train)
model4.fit(X_train,y_train)

In [26]:
y1_pred=model1.predict(X_test)
y2_pred=model2.predict(X_test)
y3_pred=model3.predict(X_test)
y4_pred=model4.predict(X_test)

In [27]:
rmse1=np.sqrt(mean_squared_error(y_test,y1_pred))
rmse2=np.sqrt(mean_squared_error(y_test,y2_pred))
rmse3=np.sqrt(mean_squared_error(y_test,y3_pred))
rmse4=np.sqrt(mean_squared_error(y_test,y4_pred))

In [28]:
print("Linear Regression: ",rmse1)
print("Random Forest Regression: ",rmse2)
print("Decision Tree Regression: ",rmse3)
print("Support Vector Regression: ",rmse4)

Linear Regression:  0.7159052994067002
Random Forest Regression:  0.2169466513459093
Decision Tree Regression:  0.22588661343321245
Support Vector Regression:  1.2422838969365644


Random Forest Regression is the best model