In [1]:
import warnings

warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [4]:
# Load the data
# file_path = Path('Average Temperature of Cities.csv')
file_path = Path('Annual Average AZ Temps 1970-2022.csv')
file = Path('CO2 Emissions from Fossil Fuels- Fuel Total.csv')
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,State,Year,Degrees Fahrenheit,Anomaly
0,Arizona,1970,59.0,-0.8
1,Arizona,1971,58.2,-1.6
2,Arizona,1972,59.6,-0.2
3,Arizona,1973,58.4,-1.4
4,Arizona,1974,59.5,-0.3


In [5]:
df_file = pd.read_csv(file)
df_file.head()

Unnamed: 0,State,Year,Coal,Petroleum Products,Natural Gas
0,Arizona,1970,0.8,13.4,10.7
1,Arizona,1971,0.8,14.3,11.8
2,Arizona,1972,0.7,16.8,12.6
3,Arizona,1973,0.9,21.7,11.8
4,Arizona,1974,4.6,21.4,10.7


In [6]:
df = df.drop(columns = 'State')
df.head()

Unnamed: 0,Year,Degrees Fahrenheit,Anomaly
0,1970,59.0,-0.8
1,1971,58.2,-1.6
2,1972,59.6,-0.2
3,1973,58.4,-1.4
4,1974,59.5,-0.3


In [7]:
df_file = df_file.drop(columns = 'State')

In [8]:
df_file.head()

Unnamed: 0,Year,Coal,Petroleum Products,Natural Gas
0,1970,0.8,13.4,10.7
1,1971,0.8,14.3,11.8
2,1972,0.7,16.8,12.6
3,1973,0.9,21.7,11.8
4,1974,4.6,21.4,10.7


In [9]:
df = pd.merge(df, df_file, on = 'Year')

In [10]:
df = df.drop(columns = 'Anomaly')

In [11]:
df.tail()

Unnamed: 0,Year,Degrees Fahrenheit,Coal,Petroleum Products,Natural Gas
45,2015,61.8,36.9,38.8,19.3
46,2016,62.2,30.9,40.2,19.7
47,2017,63.0,32.0,40.8,17.7
48,2018,62.3,31.7,41.2,21.1
49,2019,60.3,24.6,42.3,25.6


In [12]:
df.columns

Index(['Year', 'Degrees Fahrenheit', 'Coal', 'Petroleum Products',
       'Natural Gas'],
      dtype='object')

In [13]:
X = df[['Year', 'Coal', 'Petroleum Products',
       'Natural Gas']].copy()

X = X.dropna()
y=df['Degrees Fahrenheit']
X.head()

Unnamed: 0,Year,Coal,Petroleum Products,Natural Gas
0,1970,0.8,13.4,10.7
1,1971,0.8,14.3,11.8
2,1972,0.7,16.8,12.6
3,1973,0.9,21.7,11.8
4,1974,4.6,21.4,10.7


In [14]:
X.dropna()

Unnamed: 0,Year,Coal,Petroleum Products,Natural Gas
0,1970,0.8,13.4,10.7
1,1971,0.8,14.3,11.8
2,1972,0.7,16.8,12.6
3,1973,0.9,21.7,11.8
4,1974,4.6,21.4,10.7
5,1975,8.8,20.8,8.6
6,1976,13.3,21.1,9.4
7,1977,17.1,24.2,9.2
8,1978,15.2,24.3,9.8
9,1979,23.4,23.3,9.5


In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)

In [16]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# Fitting our model with all our features in X
model.fit(X_train, y_train)

score = model.score(X, y)
print(f"R2 Score: {score}")

R2 Score: 0.5668356048328688


In [17]:
scaler = MinMaxScaler()

In [18]:
scaler.fit(X_train)

MinMaxScaler()

In [19]:
X_train_scaled = scaler.transform(X_train)

In [20]:
X_test_scaled = scaler.transform(X_test)

In [21]:
# Random Forest
rf_model = RandomForestRegressor(n_estimators=128, random_state=78)

In [22]:
rf = rf_model.fit(X_train_scaled, y_train)

In [23]:
rf_predictions = rf_model.predict(X_test_scaled)

In [24]:
predictions = pd.DataFrame({"Predictions":rf_predictions, "Actual":y_test})

In [25]:
predictions.head(25)

Unnamed: 0,Predictions,Actual
13,60.184375,59.2
39,61.196094,61.1
30,60.878906,61.7
45,61.725,61.8
17,60.025,59.3
48,61.967188,62.3
26,60.050781,61.9
25,60.008594,61.0
32,61.435156,61.2
19,59.694531,61.2


In [26]:
rf.score(X_test_scaled, y_test)

0.34613178210694606

In [27]:
y_train

8     59.8
3     58.4
6     59.0
41    60.3
46    62.2
47    63.0
15    59.6
9     58.8
16    60.6
24    60.6
34    60.4
31    61.2
0     59.0
44    62.2
27    60.5
33    61.9
5     57.9
29    60.8
11    61.3
36    61.1
1     58.2
21    59.2
2     59.6
43    60.4
35    61.1
23    59.8
40    60.4
10    60.2
22    59.6
18    60.2
49    60.3
20    59.7
7     60.2
42    62.1
14    59.4
28    59.3
38    60.6
Name: Degrees Fahrenheit, dtype: float64

In [28]:
y_pred = model.predict(X_test_scaled)

In [29]:
y_pred

array([-31.20472354, -31.17466052, -31.18105829, -31.16750772,
       -31.19980418, -31.1649508 , -31.18769577, -31.18951344,
       -31.17940137, -31.19667698, -31.20551677, -31.21748374,
       -31.17426147])

In [30]:
df.columns

Index(['Year', 'Degrees Fahrenheit', 'Coal', 'Petroleum Products',
       'Natural Gas'],
      dtype='object')

In [31]:
test = pd.DataFrame({'Year':[2050],
       'Coal':[30], 'Petroleum Products':[40],
       'Natural Gas':[25]})


In [32]:
model.predict(test)

array([62.90401421])

In [33]:
df.corr()

Unnamed: 0,Year,Degrees Fahrenheit,Coal,Petroleum Products,Natural Gas
Year,1.0,0.757579,0.773944,0.952765,0.721341
Degrees Fahrenheit,0.757579,1.0,0.607625,0.738825,0.503674
Coal,0.773944,0.607625,1.0,0.703621,0.286516
Petroleum Products,0.952765,0.738825,0.703621,1.0,0.780433
Natural Gas,0.721341,0.503674,0.286516,0.780433,1.0


In [34]:
from sklearn.linear_model import LinearRegression
classifier = LinearRegression(fit_intercept = True, normalize = False)
classifier

LinearRegression(normalize=False)

In [35]:
classifier.fit(X_train_scaled, y_train)

LinearRegression(normalize=False)

In [36]:
X_train_scaled

array([[0.16326531, 0.33564815, 0.37716263, 0.21      ],
       [0.06122449, 0.00462963, 0.28719723, 0.31      ],
       [0.12244898, 0.29166667, 0.26643599, 0.19      ],
       [0.83673469, 1.        , 0.85813149, 0.495     ],
       [0.93877551, 0.69907407, 0.92733564, 0.705     ],
       [0.95918367, 0.72453704, 0.94809689, 0.605     ],
       [0.30612245, 0.73611111, 0.2733564 , 0.08      ],
       [0.18367347, 0.52546296, 0.34256055, 0.195     ],
       [0.32653061, 0.63425926, 0.3183391 , 0.        ],
       [0.48979592, 0.86805556, 0.44636678, 0.085     ],
       [0.69387755, 0.92592593, 0.84429066, 0.665     ],
       [0.63265306, 0.91898148, 0.76816609, 0.365     ],
       [0.        , 0.00231481, 0.        , 0.255     ],
       [0.89795918, 0.97453704, 0.84429066, 0.555     ],
       [0.55102041, 0.7962963 , 0.55363322, 0.08      ],
       [0.67346939, 0.88194444, 0.8200692 , 0.45      ],
       [0.10204082, 0.1875    , 0.25605536, 0.15      ],
       [0.59183673, 0.87268519,

In [37]:
type(y_train)

pandas.core.series.Series

In [38]:
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
13,59.649678,59.2
39,61.057052,61.1
30,60.69335,61.7
45,61.368104,61.8
17,59.863725,59.3
48,61.487601,62.3
26,60.397129,61.9
25,60.326057,61.0
32,60.776549,61.2
19,60.011535,61.2


In [39]:
classifier.score(X_test_scaled,y_test)

0.5343182468166369

In [40]:
from sklearn.linear_model import Ridge

In [41]:
ridge = Ridge(alpha = 1, solver = 'auto')

In [42]:
ridge.fit(X_train_scaled, y_train)

Ridge(alpha=1)

In [43]:
ridge.score(X_test_scaled, y_test)

0.4688019515118369

In [44]:
ridge.predict(X_test_scaled)

array([59.66165813, 60.99921065, 60.69428078, 61.18704028, 59.81441371,
       61.25645291, 60.26638008, 60.19425986, 60.77895473, 60.01343383,
       59.70304779, 59.19863373, 61.14259853])