## Outline of Machine Learning Model

In [1]:
# Import all dependencies
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced


### Import the Data as a Dataframe and Perform Data Preprocessing

In [None]:
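# Create the connection to postgres
# NOTE: fill in real credentials from environment variables; do not commit them to the notebook
# engine = create_engine('postgresql://username:password@host:port/database')

# Load the table as a dataframe
# (alternative connection through a DB-API driver; `pg` is not imported above -- presumably psycopg2, confirm before enabling)
# engine = pg.connect("dbname='my_db_name' user='pguser' host='127.0.0.1' port='15432' password='pgpassword'")
# df = pd.read_sql('select * from climate_change_twitter', con=engine)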

In [4]:
# Load the climate-change Twitter CSV and take a first look at it.
#
# Cleaning plan for the cells that follow:
#   1. drop rows that contain null values
#   2. one-hot encode the categorical columns (OneHotEncoder or pd.get_dummies)
#      so that they become numeric
cc_df = pd.read_csv(filepath_or_buffer="../climate_change_twitter.csv")

# Preview the first five rows
cc_df.head(n=5)

Unnamed: 0,created_at,id,lng,lat,topic,sentiment,stance,gender,temperature_avg,aggressiveness
0,2006-06-06 16:06:42+00:00,6132.0,,,Weather Extremes,-0.09718,neutral,female,,aggressive
1,2006-07-23 21:52:30+00:00,13275.0,-73.949582,40.650104,Weather Extremes,0.575777,neutral,undefined,-1.114768,aggressive
2,2006-08-29 01:52:30+00:00,23160.0,,,Weather Extremes,0.500479,neutral,male,,aggressive
3,2006-11-07 02:46:52+00:00,57868.0,,,Weather Extremes,0.032816,neutral,male,,aggressive
4,2006-11-27 14:27:43+00:00,304553.0,,,Importance of Human Intervantion,-0.090428,neutral,male,,aggressive


In [None]:
# Remove every row that has at least one missing value, then preview the result.
# Note: this rebinds cc_df, so the unfiltered frame is only available again
# after re-running the load cell.
cc_df = cc_df.dropna(axis="index", how="any")
cc_df.head(n=5)

In [None]:
# Inspect the dtype pandas inferred for each column
cc_df.dtypes

In [None]:
# Number of non-null entries in each column
cc_df.count()

In [None]:
# Trim leading/trailing whitespace from every column name
# (whitespace inside a name is left untouched)
cc_df.columns = [column.strip() for column in cc_df.columns]

In [None]:
# Remove the id column from the frame.
# Re-running this cell on its own raises KeyError because id is already gone.
cc_df = cc_df.drop(columns=["id"])

In [None]:
# Preview the frame after the cleaning steps above
cc_df.head()

In [None]:
# Count the distinct (non-null) topics; the encoding of the topic column
# itself happens in the one-hot encoding cell below
cc_df["topic"].nunique(dropna=True)

In [None]:
# Replace each categorical column (topic, stance, gender, aggressiveness)
# with indicator (dummy) columns, one per category; every other column is
# passed through unchanged.
encoded_df = pd.get_dummies(
    data=cc_df,
    columns=["topic", "stance", "gender", "aggressiveness"],
)
encoded_df

In [None]:
# Isolate the coordinates as a first step towards mapping them to zip codes.
# TODO: the latitude/longitude -> zip code conversion itself is not implemented yet.
lat_lng_df = cc_df.loc[:, ["lat", "lng"]]
lat_lng_df

In [None]:
# TODO: convert created_at to datetime format (e.g. with pd.to_datetime) -- not implemented yet

### Prepare Data for Machine Learning Models

In [None]:
# Create our features
#
# Only the categorical columns are one-hot encoded; lng, lat and sentiment are
# numeric and are passed through unchanged. Left out on purpose:
#   - temperature_avg: it is the target
#   - id:              already dropped from cc_df earlier in the notebook
#   - created_at:      a timestamp string; one-hot encoding it would add one
#                      column per distinct timestamp
# The frame is built dense (no sparse=True) so the describe()/head() checks
# and the estimators below all receive an ordinary DataFrame.
categorical_cols = ['topic', 'stance', 'gender', 'aggressiveness']
numeric_cols = ['lng', 'lat', 'sentiment']

X = pd.get_dummies(cc_df[numeric_cols + categorical_cols], columns=categorical_cols)

In [None]:
# Create our target
#
# temperature_avg is a continuous float column, but every model in this
# notebook is a classifier and the later cells inspect class balance and
# confusion matrices, so the raw values cannot serve as labels: scikit-learn
# classifiers reject continuous targets.
# The column is therefore turned into two classes by its sign.
# NOTE(review): the 0.0 cut-off is a placeholder -- confirm the intended
# threshold / class definition before trusting any score below.
y = pd.Series(
    np.where(cc_df['temperature_avg'] > 0, 'positive', 'non_positive'),
    index=cc_df.index,  # keep the same row labels as X
    name='temperature_avg',
)

In [None]:
# Check the dataframes: preview the first rows of the feature matrix X
X.head()

In [None]:
# Summary statistics of the feature matrix (describe() with its default column selection)
X.describe()

In [None]:
# Summary statistics of the target
y.describe()

In [None]:
# Check the balance of our target values:
# value_counts() lists how many rows carry each distinct value of y
y.value_counts()

In [None]:
# Hold out a test set. No test_size is given, so train_test_split falls back
# to its default proportions; random_state pins the shuffle so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Tally how often each target value occurs in the held-out set
Counter(y_test)

### Test Different Models and Compare Confusion Matrices to Find Best Model

#### Random Forest Classifier

In [None]:
# Balanced random forest: the classifier resamples the training data itself,
# so no separate resampling step is needed before fitting.
# 100 trees, fixed seed. fit() returns the fitted estimator, which lets
# construction and training be chained into a single assignment.
rf_model = BalancedRandomForestClassifier(
    random_state=78,
    n_estimators=100,
).fit(X_train, y_train)

In [None]:
# Predict labels for the held-out rows with the fitted forest
predictions = rf_model.predict(X_test)

# Balanced accuracy of those predictions against the true test labels
acc_score = balanced_accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

In [None]:
# Print the imbalanced classification report for the random forest.
# y_pred is computed here because no earlier cell defines it; on a fresh
# kernel (Restart & Run All) the report would otherwise fail with NameError.
y_pred = rf_model.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# Display the confusion matrix for the random forest
y_pred = rf_model.predict(X_test)

# Take the row/column labels from the fitted model rather than hard-coding
# them: the labels then always match the classes the model was trained on,
# and the frame has the right shape for any number of classes.
class_labels = rf_model.classes_

# Passing labels= pins the row/column order of the matrix to class_labels
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)
cm_df

#### Easy Ensemble AdaBoost Classifier

In [None]:
# Train the EasyEnsembleClassifier
# (EasyEnsembleClassifier is imported with the other dependencies in the
# import cell at the top of the notebook)

# Instantiate the model: 100 estimators, fixed seed
eec = EasyEnsembleClassifier(random_state=1, n_estimators=100)

# Fit the model on the training split
eec.fit(X_train, y_train)

In [None]:
# Calculate the balanced accuracy score of the easy-ensemble model.
# First predict labels for the held-out rows ...
predictions = eec.predict(X_test)

# ... then score those predictions against the true test labels
acc_score = balanced_accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

In [None]:
# Display the confusion matrix for the easy-ensemble model
y_pred = eec.predict(X_test)

# Take the row/column labels from the fitted model rather than hard-coding
# them: the labels then always match the classes the model was trained on,
# and the frame has the right shape for any number of classes.
class_labels = eec.classes_

# Passing labels= pins the row/column order of the matrix to class_labels
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)
cm_df

In [None]:
# Print the imbalanced classification report for the predictions held in
# y_pred (assigned in the confusion-matrix cell directly above)
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))