## Supervised Machine Learning Model

#### The purpose of this machine learning model is to explore if it is possible to predict if a Twitter user is a climate change believer, denier, or neutral towards climate change based on their location, temperature change of their area over the past 50 years, gender, and the topic of their Tweet. Social media is becoming more and more worldwide and is more widely used than most traditional government polls. Using social media data to uncover public opinion on important topics such as this can become a more accurate way to assess these opinions and uses less resources than traditional polling methods.

#### Google Colab Dependencies

In [1]:
#from google.colab import drive
#drive.mount('/content/drive/')

In [None]:
#import os
#Change the current working directory to the path of Google Cloud Drive
#path="/content/drive/My Drive/Colab Notebooks/"
#os.chdir(path)
#os.listdir(path)

#### Jupyter Notebook Dependencies

In [1]:
# Import all dependencies
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None 
from pathlib import Path
from collections import Counter
import sqlalchemy
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
import psycopg2
from sklearn.cluster import KMeans
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import datetime

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier
from config import password

### Import the Data as a Dataframe and Perform Data Preprocessing

#### The data for this project is sourced from Kaggle: "The Climate Change Twitter Dataset"

#### PgAdmin Database Import

In [None]:
# Create the connection to postgres
#db_string = f"postgresql://postgres:{password}@127.0.0.1:5432/climate-change-twitter"

#Instantiate the engine
#engine = create_engine(db_string)

# Load the table as a dataframe
#cc_twitter_df = pd.read_sql('select * from climate_change_twitter', con=engine)
#cc_twitter_df

#### CSV File Import

In [2]:
# Load the data
cc_twitter_raw_df = pd.read_csv("../climate_change_twitter.csv")
cc_twitter_raw_df.head(5)

Unnamed: 0,created_at,id,lng,lat,topic,sentiment,stance,gender,temperature_avg,aggressiveness
0,2006-06-06 16:06:42+00:00,6132.0,,,Weather Extremes,-0.09718,neutral,female,,aggressive
1,2006-07-23 21:52:30+00:00,13275.0,-73.949582,40.650104,Weather Extremes,0.575777,neutral,undefined,-1.114768,aggressive
2,2006-08-29 01:52:30+00:00,23160.0,,,Weather Extremes,0.500479,neutral,male,,aggressive
3,2006-11-07 02:46:52+00:00,57868.0,,,Weather Extremes,0.032816,neutral,male,,aggressive
4,2006-11-27 14:27:43+00:00,304553.0,,,Importance of Human Intervantion,-0.090428,neutral,male,,aggressive


#### Preprocess Dataframe for Machine Learning Model

In [3]:
# Drop the null rows
cc_twitter_nulldrop_df = cc_twitter_raw_df.dropna()
cc_twitter_nulldrop_df.head()

Unnamed: 0,created_at,id,lng,lat,topic,sentiment,stance,gender,temperature_avg,aggressiveness
1,2006-07-23 21:52:30+00:00,13275.0,-73.949582,40.650104,Weather Extremes,0.575777,neutral,undefined,-1.114768,aggressive
7,2006-12-14 01:39:10+00:00,1092823.0,-122.41942,37.77493,Ideological Positions on Global Warming,-0.544195,neutral,male,4.22854,aggressive
8,2006-12-17 19:43:09+00:00,1278023.0,-79.79198,36.07264,Weather Extremes,-0.565028,denier,male,5.478175,aggressive
9,2006-12-21 01:39:01+00:00,1455543.0,-121.80579,38.00492,Weather Extremes,0.65096,neutral,male,-1.652156,not aggressive
11,2006-12-31 10:47:25+00:00,1893063.0,-1.902691,52.479699,Weather Extremes,0.670905,neutral,male,4.864521,aggressive


In [4]:
# Clean any spaces in column names to avoid errors
cc_twitter_nulldrop_df.columns = cc_twitter_nulldrop_df.columns.str.strip()

In [5]:
# Encode stance column
# This is the target column for the machine learning model
cc_twitter_nulldrop_df['stance'] = cc_twitter_nulldrop_df['stance'].replace(['neutral'], 0)
cc_twitter_nulldrop_df['stance'] = cc_twitter_nulldrop_df['stance'].replace(['believer'], 1)
cc_twitter_nulldrop_df['stance'] = cc_twitter_nulldrop_df['stance'].replace(['denier'], 2)
cc_twitter_endocedstance_df = cc_twitter_nulldrop_df
cc_twitter_endocedstance_df.head(5)

Unnamed: 0,created_at,id,lng,lat,topic,sentiment,stance,gender,temperature_avg,aggressiveness
1,2006-07-23 21:52:30+00:00,13275.0,-73.949582,40.650104,Weather Extremes,0.575777,0,undefined,-1.114768,aggressive
7,2006-12-14 01:39:10+00:00,1092823.0,-122.41942,37.77493,Ideological Positions on Global Warming,-0.544195,0,male,4.22854,aggressive
8,2006-12-17 19:43:09+00:00,1278023.0,-79.79198,36.07264,Weather Extremes,-0.565028,2,male,5.478175,aggressive
9,2006-12-21 01:39:01+00:00,1455543.0,-121.80579,38.00492,Weather Extremes,0.65096,0,male,-1.652156,not aggressive
11,2006-12-31 10:47:25+00:00,1893063.0,-1.902691,52.479699,Weather Extremes,0.670905,0,male,4.864521,aggressive


In [6]:
# Isolate the date from timestamp and create new column to store the date
cc_twitter_endocedstance_df['created_at'] = pd.to_datetime(cc_twitter_endocedstance_df['created_at'])
cc_twitter_endocedstance_df['date'] = cc_twitter_endocedstance_df['created_at'].dt.date
cc_twitter_date_df = cc_twitter_endocedstance_df
cc_twitter_date_df.head()

Unnamed: 0,created_at,id,lng,lat,topic,sentiment,stance,gender,temperature_avg,aggressiveness,date
1,2006-07-23 21:52:30+00:00,13275.0,-73.949582,40.650104,Weather Extremes,0.575777,0,undefined,-1.114768,aggressive,2006-07-23
7,2006-12-14 01:39:10+00:00,1092823.0,-122.41942,37.77493,Ideological Positions on Global Warming,-0.544195,0,male,4.22854,aggressive,2006-12-14
8,2006-12-17 19:43:09+00:00,1278023.0,-79.79198,36.07264,Weather Extremes,-0.565028,2,male,5.478175,aggressive,2006-12-17
9,2006-12-21 01:39:01+00:00,1455543.0,-121.80579,38.00492,Weather Extremes,0.65096,0,male,-1.652156,not aggressive,2006-12-21
11,2006-12-31 10:47:25+00:00,1893063.0,-1.902691,52.479699,Weather Extremes,0.670905,0,male,4.864521,aggressive,2006-12-31


In [7]:
# Drop "created_at" column
cc_twitter_date_df = cc_twitter_date_df.drop('created_at', axis=1)

In [8]:
# Convert date object to string for ml model
cc_twitter_date_df['date'] = cc_twitter_date_df['date'].astype(str).apply(lambda x: x.replace('-', '')).astype(int)

In [9]:
# Drop ID column; it does not contain useful information for the model
cc_twitter_date_df = cc_twitter_date_df.drop(["id"], axis=1)

In [None]:
# Import cleaned dataframe back into PGAdmin

In [None]:
# Import cleaned dataframe as a CSV file
cc_twitter_date_df.to_csv("../climate_change_twitter_cleaned.csv")

In [10]:
# Encode topic, gender, and aggressiveness to numerical values
cc_twitter_df = pd.get_dummies(cc_twitter_date_df, columns=['topic', 'gender', 'aggressiveness'])
cc_twitter_df.head(10)

Unnamed: 0,lng,lat,sentiment,stance,temperature_avg,date,topic_Donald Trump versus Science,topic_Global stance,topic_Ideological Positions on Global Warming,topic_Impact of Resource Overconsumption,...,topic_Politics,topic_Seriousness of Gas Emissions,topic_Significance of Pollution Awareness Events,topic_Undefined / One Word Hashtags,topic_Weather Extremes,gender_female,gender_male,gender_undefined,aggressiveness_aggressive,aggressiveness_not aggressive
1,-73.949582,40.650104,0.575777,0,-1.114768,20060723,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0
7,-122.41942,37.77493,-0.544195,0,4.22854,20061214,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
8,-79.79198,36.07264,-0.565028,2,5.478175,20061217,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
9,-121.80579,38.00492,0.65096,0,-1.652156,20061221,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
11,-1.902691,52.479699,0.670905,0,4.864521,20061231,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
12,-73.949582,40.650104,-0.567821,0,15.600876,20070106,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
15,-0.12574,51.50853,-0.531149,0,4.908487,20070108,0,0,0,1,...,0,0,0,0,0,0,0,1,1,0
21,-0.127647,51.507322,-0.162596,0,4.59847,20070110,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
22,-0.127647,51.507322,-0.162596,0,4.59847,20070110,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
23,-0.127647,51.507322,0.04892,1,4.702218,20070111,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


#### Split data into features (X and y)

In [26]:
# Create the X features
X = cc_twitter_df.drop(["stance"], axis=1)

In [27]:
# Isolate the target
y = cc_twitter_df['stance']

In [28]:
# Check the features are correct
X.head()

Unnamed: 0,lng,lat,sentiment,temperature_avg,topic_Donald Trump versus Science,topic_Global stance,topic_Ideological Positions on Global Warming,topic_Impact of Resource Overconsumption,topic_Importance of Human Intervantion,topic_Politics,topic_Seriousness of Gas Emissions,topic_Significance of Pollution Awareness Events,topic_Undefined / One Word Hashtags,topic_Weather Extremes,gender_female,gender_male,gender_undefined,aggressiveness_aggressive,aggressiveness_not aggressive,date
1,-73.949582,40.650104,0.575777,-1.114768,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,20060723
7,-122.41942,37.77493,-0.544195,4.22854,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,20061214
8,-79.79198,36.07264,-0.565028,5.478175,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,20061217
9,-121.80579,38.00492,0.65096,-1.652156,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,20061221
11,-1.902691,52.479699,0.670905,4.864521,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,20061231


In [32]:
# Check the target is correct
y

1          0
7          0
8          2
9          0
11         0
          ..
1048528    2
1048533    0
1048551    0
1048558    0
1048573    0
Name: stance, Length: 312693, dtype: int64

In [33]:
# Check the balance of our target values
y.value_counts()

1    140978
0    129737
2     41978
Name: stance, dtype: int64

In [34]:
# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)

### Run the Random Forest Model with Entire Dataframe

In [35]:
# Instantiate a random forest classifier model
# n_estimators are set to 100 because
# random_state set to 78 because
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=78)

# Fit the model
rf_model = rf_model.fit(X_train, y_train)

In [36]:
# Make predictions using the testing data on the trained model
predictions = rf_model.predict(X_test)

In [37]:
# Calculate the accuracy score of the model using the testing data
acc_score = balanced_accuracy_score(y_test, predictions)
acc_score

0.5544544535173802

In [38]:
# Print the classification report of this model
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.60      0.40      0.81      0.48      0.57      0.31     32504
          1       0.65      0.59      0.74      0.62      0.66      0.43     35171
          2       0.29      0.67      0.74      0.40      0.71      0.50     10499

avg / total       0.58      0.52      0.77      0.53      0.63      0.39     78174



In [39]:
# Sort the features by importance
features = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
for feature in features:
    print(f"{feature[1]}: ({feature[0]})")

sentiment: (0.2085747316780853)
temperature_avg: (0.17613622497055406)
date: (0.1734054956062695)
lng: (0.16324464834707492)
lat: (0.15987018885202656)
topic_Global stance: (0.020880264912935168)
topic_Weather Extremes: (0.018826518536204238)
topic_Donald Trump versus Science: (0.01218076491532651)
topic_Importance of Human Intervantion: (0.01033791402743824)
topic_Ideological Positions on Global Warming: (0.009167349092578495)
gender_male: (0.007482082624256454)
gender_female: (0.007181062738735014)
aggressiveness_not aggressive: (0.006081995720558184)
aggressiveness_aggressive: (0.005511381199384241)
topic_Seriousness of Gas Emissions: (0.005000815410175303)
topic_Politics: (0.00439518844018732)
topic_Significance of Pollution Awareness Events: (0.003210026411697448)
topic_Undefined / One Word Hashtags: (0.002988673286084511)
gender_undefined: (0.002937144117660653)
topic_Impact of Resource Overconsumption: (0.0025875291127679364)


#### This first Random Forest model is resulting in a low accuracy score, telling us that it is not likely (just higher than a 50/50 chance) of predicting an individual's stance based on their location, average temperature change of the area, their sentiment, gender, and topic of tweet. Now we will test dropping certain features and resulting different combinations of features to see if the model's accuracy changes based on the inputs.

### Random Forest Model without Topic Columns

#### Preprocess dataframe to drop "topic" column

In [30]:
# Check dataframe from before topic was encoded
cc_twitter_date_df

Unnamed: 0,lng,lat,topic,sentiment,stance,gender,temperature_avg,aggressiveness,date
1,-73.949582,40.650104,Weather Extremes,0.575777,0,undefined,-1.114768,aggressive,20060723
7,-122.419420,37.774930,Ideological Positions on Global Warming,-0.544195,0,male,4.228540,aggressive,20061214
8,-79.791980,36.072640,Weather Extremes,-0.565028,2,male,5.478175,aggressive,20061217
9,-121.805790,38.004920,Weather Extremes,0.650960,0,male,-1.652156,not aggressive,20061221
11,-1.902691,52.479699,Weather Extremes,0.670905,0,male,4.864521,aggressive,20061231
...,...,...,...,...,...,...,...,...,...
1048528,144.963320,-37.814000,Seriousness of Gas Emissions,-0.412735,2,male,1.822982,not aggressive,20110329
1048533,110.828448,-7.569246,Ideological Positions on Global Warming,0.450944,0,female,-0.253280,not aggressive,20110329
1048551,121.069917,14.528887,Weather Extremes,0.538100,0,male,-1.243056,not aggressive,20110329
1048558,1.155450,52.059170,Seriousness of Gas Emissions,0.061093,0,female,1.056350,not aggressive,20110329


In [60]:
# Drop the topic column
cc_twitter_notopic_df = cc_twitter_date_df.drop(["topic"], axis=1)

In [61]:
# Encode gender and aggressiveness to numerical values
cc_twitter_notopic_df = pd.get_dummies(cc_twitter_notopic_df, columns=['gender', 'aggressiveness'])
cc_twitter_notopic_df.head(5)

Unnamed: 0,created_at,id,lng,lat,sentiment,stance,temperature_avg,date,gender_female,gender_male,gender_undefined,aggressiveness_aggressive,aggressiveness_not aggressive
1,2006-07-23 21:52:30+00:00,13275.0,-73.949582,40.650104,0.575777,0,-1.114768,2006-07-23,0,0,1,1,0
7,2006-12-14 01:39:10+00:00,1092823.0,-122.41942,37.77493,-0.544195,0,4.22854,2006-12-14,0,1,0,1,0
8,2006-12-17 19:43:09+00:00,1278023.0,-79.79198,36.07264,-0.565028,2,5.478175,2006-12-17,0,1,0,1,0
9,2006-12-21 01:39:01+00:00,1455543.0,-121.80579,38.00492,0.65096,0,-1.652156,2006-12-21,0,1,0,0,1
11,2006-12-31 10:47:25+00:00,1893063.0,-1.902691,52.479699,0.670905,0,4.864521,2006-12-31,0,1,0,1,0


#### Split data into features (X and y)

In [45]:
# Create the X features
X = cc_twitter_notopic_df.drop(["stance"], axis=1)

In [46]:
# Isolate the target
y = cc_twitter_notopic_df['stance']

In [47]:
# Check the features are correct
X.head()

Unnamed: 0,lng,lat,sentiment,temperature_avg,date,gender_female,gender_male,gender_undefined,aggressiveness_aggressive,aggressiveness_not aggressive
1,-73.949582,40.650104,0.575777,-1.114768,20060723,0,0,1,1,0
7,-122.41942,37.77493,-0.544195,4.22854,20061214,0,1,0,1,0
8,-79.79198,36.07264,-0.565028,5.478175,20061217,0,1,0,1,0
9,-121.80579,38.00492,0.65096,-1.652156,20061221,0,1,0,0,1
11,-1.902691,52.479699,0.670905,4.864521,20061231,0,1,0,1,0


In [48]:
# Check the target is correct
y

1          0
7          0
8          2
9          0
11         0
          ..
1048528    2
1048533    0
1048551    0
1048558    0
1048573    0
Name: stance, Length: 312693, dtype: int64

In [49]:
# Check the balance of our target values
y.value_counts()

1    140978
0    129737
2     41978
Name: stance, dtype: int64

In [50]:
# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)

#### Run the Random Forest Model

In [51]:
# Instantiate a random forest classifier model
# n_estimators are set to 100 because
# random_state set to 78 because
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=78)

# Fit the model
rf_model = rf_model.fit(X_train, y_train)

In [52]:
# Make predictions using the testing data on the trained model
predictions = rf_model.predict(X_test)

In [53]:
# Calculate the accuracy score of the model using the testing data
acc_score = balanced_accuracy_score(y_test, predictions)
acc_score

0.49736043123569745

In [54]:
# Print the classification report of this model
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.54      0.40      0.76      0.46      0.55      0.29     32504
          1       0.58      0.48      0.72      0.53      0.59      0.34     35171
          2       0.26      0.61      0.73      0.36      0.66      0.44     10499

avg / total       0.52      0.47      0.74      0.48      0.58      0.33     78174



In [55]:
# Sort the features by importance
features = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
for feature in features:
    print(f"{feature[1]}: ({feature[0]})")

sentiment: (0.23687627551941198)
temperature_avg: (0.19527219646221913)
date: (0.1917289998285639)
lng: (0.17448060027468412)
lat: (0.1679522265004956)
gender_male: (0.00999127360252635)
gender_female: (0.009583759440122332)
aggressiveness_aggressive: (0.005648536482308488)
aggressiveness_not aggressive: (0.005005699919539336)
gender_undefined: (0.0034604319701287967)
