## Outline of Machine Learning Model

In [5]:
# Import all dependencies
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import sqlalchemy
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine


### Import the Data as a Dataframe and Perform Data Preprocessing

In [None]:
# Create the connection to postgres
# (placeholder: this whole cell is commented out and the data is read from CSV instead).
# NOTE(review): keep real credentials out of the notebook, e.g. read them from environment variables.
# engine = create_engine('postgresql://username:password@host:port/database')

# Load the table as a dataframe
# NOTE(review): no import for `pg` is visible in this notebook, and the line below would
# overwrite the SQLAlchemy `engine` created above -- confirm which connection is intended.
# engine = pg.connect("dbname='my_db_name' user='pguser' host='127.0.0.1' port='15432' password='pgpassword'")
# df = pd.read_sql('select * from climate_change_twitter', con=engine)

In [6]:
# Load the raw dataset from the CSV export and preview the first rows.
# Null handling and one-hot encoding of the categorical columns are done
# in the cells that follow, not here.
cc_df = pd.read_csv(filepath_or_buffer="../climate_change_twitter.csv")
cc_df.head(n=5)

Unnamed: 0,created_at,id,lng,lat,topic,sentiment,stance,gender,temperature_avg,aggressiveness
0,2006-06-06 16:06:42+00:00,6132.0,,,Weather Extremes,-0.09718,neutral,female,,aggressive
1,2006-07-23 21:52:30+00:00,13275.0,-73.949582,40.650104,Weather Extremes,0.575777,neutral,undefined,-1.114768,aggressive
2,2006-08-29 01:52:30+00:00,23160.0,,,Weather Extremes,0.500479,neutral,male,,aggressive
3,2006-11-07 02:46:52+00:00,57868.0,,,Weather Extremes,0.032816,neutral,male,,aggressive
4,2006-11-27 14:27:43+00:00,304553.0,,,Importance of Human Intervantion,-0.090428,neutral,male,,aggressive


In [7]:
# Keep only the rows that have a value in every column.
cc_df = cc_df.dropna(axis=0, how="any")
cc_df.head(n=5)

Unnamed: 0,created_at,id,lng,lat,topic,sentiment,stance,gender,temperature_avg,aggressiveness
1,2006-07-23 21:52:30+00:00,13275.0,-73.949582,40.650104,Weather Extremes,0.575777,neutral,undefined,-1.114768,aggressive
7,2006-12-14 01:39:10+00:00,1092823.0,-122.41942,37.77493,Ideological Positions on Global Warming,-0.544195,neutral,male,4.22854,aggressive
8,2006-12-17 19:43:09+00:00,1278023.0,-79.79198,36.07264,Weather Extremes,-0.565028,denier,male,5.478175,aggressive
9,2006-12-21 01:39:01+00:00,1455543.0,-121.80579,38.00492,Weather Extremes,0.65096,neutral,male,-1.652156,not aggressive
11,2006-12-31 10:47:25+00:00,1893063.0,-1.902691,52.479699,Weather Extremes,0.670905,neutral,male,4.864521,aggressive


In [8]:
# Show the dtype pandas assigned to each column of cc_df.
cc_df.dtypes

created_at          object
id                 float64
lng                float64
lat                float64
topic               object
sentiment          float64
stance              object
gender              object
temperature_avg    float64
aggressiveness      object
dtype: object

In [9]:
# Number of non-null entries per column.
cc_df.notna().sum()

created_at         312693
id                 312693
lng                312693
lat                312693
topic              312693
sentiment          312693
stance             312693
gender             312693
temperature_avg    312693
aggressiveness     312693
dtype: int64

In [10]:
# Trim leading/trailing whitespace from every column name.
cc_df.columns = cc_df.columns.map(str.strip)

In [11]:
# Remove the `id` column from the frame.
cc_df = cc_df.drop(columns=["id"])

In [12]:
# Preview the first five rows after dropping `id`.
cc_df.iloc[:5]

Unnamed: 0,created_at,lng,lat,topic,sentiment,stance,gender,temperature_avg,aggressiveness
1,2006-07-23 21:52:30+00:00,-73.949582,40.650104,Weather Extremes,0.575777,neutral,undefined,-1.114768,aggressive
7,2006-12-14 01:39:10+00:00,-122.41942,37.77493,Ideological Positions on Global Warming,-0.544195,neutral,male,4.22854,aggressive
8,2006-12-17 19:43:09+00:00,-79.79198,36.07264,Weather Extremes,-0.565028,denier,male,5.478175,aggressive
9,2006-12-21 01:39:01+00:00,-121.80579,38.00492,Weather Extremes,0.65096,neutral,male,-1.652156,not aggressive
11,2006-12-31 10:47:25+00:00,-1.902691,52.479699,Weather Extremes,0.670905,neutral,male,4.864521,aggressive


In [13]:
# Count the distinct values in the topic column (nulls excluded).
cc_df.loc[:, "topic"].nunique(dropna=True)

10

In [50]:
# Expand the four categorical columns into one indicator column per category;
# all other columns are carried over unchanged.
encoded_df = pd.get_dummies(
    data=cc_df,
    columns=["topic", "stance", "gender", "aggressiveness"],
)
encoded_df

Unnamed: 0,created_at,lng,lat,sentiment,temperature_avg,topic_Donald Trump versus Science,topic_Global stance,topic_Ideological Positions on Global Warming,topic_Impact of Resource Overconsumption,topic_Importance of Human Intervantion,...,topic_Undefined / One Word Hashtags,topic_Weather Extremes,stance_believer,stance_denier,stance_neutral,gender_female,gender_male,gender_undefined,aggressiveness_aggressive,aggressiveness_not aggressive
1,2006-07-23 21:52:30+00:00,-73.949582,40.650104,0.575777,-1.114768,0,0,0,0,0,...,0,1,0,0,1,0,0,1,1,0
7,2006-12-14 01:39:10+00:00,-122.419420,37.774930,-0.544195,4.228540,0,0,1,0,0,...,0,0,0,0,1,0,1,0,1,0
8,2006-12-17 19:43:09+00:00,-79.791980,36.072640,-0.565028,5.478175,0,0,0,0,0,...,0,1,0,1,0,0,1,0,1,0
9,2006-12-21 01:39:01+00:00,-121.805790,38.004920,0.650960,-1.652156,0,0,0,0,0,...,0,1,0,0,1,0,1,0,0,1
11,2006-12-31 10:47:25+00:00,-1.902691,52.479699,0.670905,4.864521,0,0,0,0,0,...,0,1,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048528,2011-03-29 08:23:23+00:00,144.963320,-37.814000,-0.412735,1.822982,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
1048533,2011-03-29 08:42:01+00:00,110.828448,-7.569246,0.450944,-0.253280,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,1
1048551,2011-03-29 09:21:17+00:00,121.069917,14.528887,0.538100,-1.243056,0,0,0,0,0,...,0,1,0,0,1,0,1,0,0,1
1048558,2011-03-29 09:44:21+00:00,1.155450,52.059170,0.061093,1.056350,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1


In [15]:
# Pull the latitude/longitude columns into their own frame
# (input for the reverse-geocoding step further down).
lat_lng_df = cc_df.loc[:, ["lat", "lng"]]
lat_lng_df

Unnamed: 0,lat,lng
1,40.650104,-73.949582
7,37.774930,-122.419420
8,36.072640,-79.791980
9,38.004920,-121.805790
11,52.479699,-1.902691
...,...,...
1048528,-37.814000,144.963320
1048533,-7.569246,110.828448
1048551,14.528887,121.069917
1048558,52.059170,1.155450


In [51]:
# Nominatim geocoder client (imported in the dependencies cell at the top);
# the user agent string identifies this project to the service.
geocoder = Nominatim(user_agent='climate_project')

In [52]:
# Rate-limited forward geocoder (RateLimiter is imported in the dependencies
# cell at the top): waits at least 1 second between calls and yields None
# when a lookup ultimately fails.
geocode = RateLimiter(geocoder.geocode, min_delay_seconds=1, return_value_on_exception=None)

In [56]:
# Pair each row's latitude with its longitude as a (lat, lng) tuple.
lat_lng_list = [
    (lat, lng) for lat, lng in zip(encoded_df["lat"], encoded_df["lng"])
]
lat_lng_list

[(40.6501038, -73.9495823),
 (37.77493, -122.41942),
 (36.07264, -79.79198),
 (38.00492, -121.80579),
 (52.4796992, -1.9026911),
 (40.6501038, -73.9495823),
 (51.50853, -0.12574),
 (51.5073219, -0.1276474),
 (51.5073219, -0.1276474),
 (51.5073219, -0.1276474),
 (51.50853, -0.12574),
 (32.7762719, -96.7968559),
 (40.6501038, -73.9495823),
 (36.1126365, -80.0144842),
 (43.43617, 13.61232),
 (51.5073219, -0.1276474),
 (51.50853, -0.12574),
 (51.50853, -0.12574),
 (25.96389, -80.24417),
 (51.5073219, -0.1276474),
 (51.5073219, -0.1276474),
 (51.50853, -0.12574),
 (51.50853, -0.12574),
 (43.6534817, -79.3839347),
 (51.50853, -0.12574),
 (52.48142, -1.89983),
 (40.6501038, -73.9495823),
 (51.5073219, -0.1276474),
 (51.48, -3.18),
 (34.09834, -118.32674),
 (36.1622296, -86.7743531),
 (51.0534234, -114.0625892),
 (45.5202471, -122.6741949),
 (51.50853, -0.12574),
 (51.50853, -0.12574),
 (51.50853, -0.12574),
 (38.9778882, -77.0074765),
 (45.52345, -122.67621),
 (49.24966, -123.11934),
 (51.508

In [None]:
# Reverse-geocode every (lat, lng) pair into a geopy Location object.
# The RateLimiter wrapper keeps calls at least 1 second apart.
reverse = RateLimiter(geocoder.reverse, min_delay_seconds=1)

# The coordinate list repeats the same points many times, so each distinct
# pair is looked up once and the cached result is reused for its duplicates.
location_by_coord = {}
locations = []

for coord in lat_lng_list:
    if coord not in location_by_coord:
        # Query the pair for this iteration, not a fixed element of the list.
        location_by_coord[coord] = reverse(coord)
    locations.append(location_by_coord[coord])

print(locations)

### Prepare Data for Machine Learning Models

In [None]:
# Create our features
# Use every encoded column except the target (`temperature_avg`).
# `created_at` is also left out: it is a raw timestamp string (dtype object),
# which the estimators below cannot consume without further processing.
X = encoded_df.drop(columns=["temperature_avg", "created_at"])

In [None]:
# Target vector: the temperature_avg column.
# NOTE(review): this column is float64 while the models below are
# classifiers -- confirm whether the target should be binned into classes.
y = cc_df.loc[:, "temperature_avg"]

In [None]:
# Preview the first rows of the feature matrix.
X.head(n=5)

In [None]:
# Summary statistics for the feature columns.
X.describe()

In [None]:
# Summary statistics for the target.
y.describe()

In [None]:
# Check the balance of our target values
# (number of rows for each distinct value of y).
y.value_counts()

In [None]:
# Hold out a test set (default split proportions, fixed seed for
# reproducibility), then tally the target values that landed in it.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_test)

### Test Different Models and Compare Confusion Matrices to Find Best Model

#### Random Forest Classifier

In [None]:
# Build a balanced random forest (100 trees, fixed seed) and fit it on the
# training split; `fit` returns the fitted estimator itself.
rf_model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=78,
).fit(X_train, y_train)

In [None]:
# Predict on the held-out data with the random forest.
predictions = rf_model.predict(X_test)

# Balanced accuracy of those predictions against the true test labels.
acc_score = balanced_accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

In [None]:
# Print the imbalanced classification report for the random forest.
# `predictions` holds rf_model's test-set predictions from the cell above;
# `y_pred` is not assigned until the confusion-matrix cell that follows.
print(classification_report_imbalanced(y_test, predictions))

In [None]:
# Display the confusion matrix
y_pred = rf_model.predict(X_test)

# Take the row/column labels from the fitted model so they always match the
# classes present in the target, in the same order the matrix uses.
class_labels = rf_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)
cm_df

#### Easy Ensemble AdaBoost Classifier

In [None]:
# Train the EasyEnsembleClassifier
# (the class is imported in the dependencies cell at the top of the notebook).

# Instantiate the model: 100 estimators, fixed seed
eec = EasyEnsembleClassifier(random_state=1, n_estimators=100)

# Fit the model on the training split
eec.fit(X_train, y_train)

In [None]:
# Predict on the held-out data with the easy-ensemble model.
predictions = eec.predict(X_test)

# Balanced accuracy of those predictions against the true test labels.
acc_score = balanced_accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

In [None]:
# Display the confusion matrix
y_pred = eec.predict(X_test)

# Take the row/column labels from the fitted model so they always match the
# classes present in the target, in the same order the matrix uses.
class_labels = eec.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)
cm_df

In [None]:
# Build the imbalanced classification report for the current `y_pred`
# (assigned in the confusion-matrix cell above) and print it.
report = classification_report_imbalanced(y_test, y_pred)
print(report)