## Supervised Machine Learning Model Removing Sentiment

#### The purpose of this notebook is to explore whether the model's accuracy is affected by removing the Sentiment feature. This will help us determine how important this feature is for predicting an individual's stance.

#### Google Colab Dependencies

In [1]:
#from google.colab import drive
#drive.mount('/content/drive/')

In [2]:
#import os
#Change the current working directory to the path of Google Cloud Drive
#path="/content/drive/My Drive/Colab Notebooks/"
#os.chdir(path)
#os.listdir(path)

#### Jupyter Notebook Dependencies

In [3]:
# Import all dependencies
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None 
from pathlib import Path
from collections import Counter
import sqlalchemy
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
import psycopg2
from sklearn.cluster import KMeans
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import datetime

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier
from config import password

### Import the Data as a Dataframe and Perform Data Preprocessing

#### The data for this project is sourced from Kaggle: "The Climate Change Twitter Dataset"

#### PgAdmin Database Import

In [4]:
# Create the connection to postgres
#db_string = f"postgresql://postgres:{password}@127.0.0.1:5432/climate-change-twitter"

#Instantiate the engine
#engine = create_engine(db_string)

# Load the table as a dataframe
#cc_twitter_df = pd.read_sql('select * from climate_change_twitter', con=engine)
#cc_twitter_df

#### CSV File Import

In [5]:
# Read the Kaggle CSV export (stored one directory above this notebook) into a dataframe
raw_csv_path = "../The_Climate_Change_Twitter_Dataset.csv"
cc_twitter_raw_df = pd.read_csv(raw_csv_path)

# Preview the first five rows
cc_twitter_raw_df.head(5)

Unnamed: 0,created_at,id,lng,lat,topic,sentiment,stance,gender,temperature_avg,aggressiveness
0,2006-06-06 16:06:42+00:00,6132,,,Weather Extremes,-0.09718,neutral,female,,aggressive
1,2006-07-23 21:52:30+00:00,13275,-73.949582,40.650104,Weather Extremes,0.575777,neutral,undefined,-1.114768,aggressive
2,2006-08-29 01:52:30+00:00,23160,,,Weather Extremes,0.500479,neutral,male,,aggressive
3,2006-11-07 02:46:52+00:00,57868,,,Weather Extremes,0.032816,neutral,male,,aggressive
4,2006-11-27 14:27:43+00:00,304553,,,Importance of Human Intervantion,-0.090428,neutral,male,,aggressive


#### Preprocess Dataframe for Machine Learning Model

In [6]:
# Count the non-null values in each column to see which columns have missing data
cc_twitter_raw_df.count()

created_at         15789411
id                 15789411
lng                 5307538
lat                 5307538
topic              15789411
sentiment          15789411
stance             15789411
gender             15789411
temperature_avg     5307538
aggressiveness     15789411
dtype: int64

In [7]:
# Keep only the rows that have a value in every column
# (any row with at least one missing value is removed)
cc_twitter_nulldrop_df = cc_twitter_raw_df.dropna(axis=0, how="any")
cc_twitter_nulldrop_df.head()

Unnamed: 0,created_at,id,lng,lat,topic,sentiment,stance,gender,temperature_avg,aggressiveness
1,2006-07-23 21:52:30+00:00,13275,-73.949582,40.650104,Weather Extremes,0.575777,neutral,undefined,-1.114768,aggressive
7,2006-12-14 01:39:10+00:00,1092823,-122.41942,37.77493,Ideological Positions on Global Warming,-0.544195,neutral,male,4.22854,aggressive
8,2006-12-17 19:43:09+00:00,1278023,-79.79198,36.07264,Weather Extremes,-0.565028,denier,male,5.478175,aggressive
9,2006-12-21 01:39:01+00:00,1455543,-121.80579,38.00492,Weather Extremes,0.65096,neutral,male,-1.652156,not aggressive
11,2006-12-31 10:47:25+00:00,1893063,-1.902691,52.479699,Weather Extremes,0.670905,neutral,male,4.864521,aggressive


In [8]:
# Remove leading/trailing whitespace from the column names so later
# lookups by name do not fail on a stray space
stripped_column_names = cc_twitter_nulldrop_df.columns.str.strip()
cc_twitter_nulldrop_df.columns = stripped_column_names

In [9]:
# Encode the stance column as integers
# This is the target column for the machine learning model
stance_codes = {'neutral': 0, 'believer': 1, 'denier': 2}

# A single map() pass replaces the three separate replace() passes
encoded_stance = cc_twitter_nulldrop_df['stance'].map(stance_codes)

# map() yields NaN for any label missing from stance_codes; stop here rather
# than let an unencoded target reach the model
if encoded_stance.isna().any():
    raise ValueError("Unexpected value(s) in 'stance'; extend stance_codes to cover them")

# assign() returns a new dataframe, so the null-dropped frame keeps its
# original string labels instead of being modified through a shared reference
cc_twitter_endocedstance_df = cc_twitter_nulldrop_df.assign(stance=encoded_stance)
cc_twitter_endocedstance_df.head(5)

Unnamed: 0,created_at,id,lng,lat,topic,sentiment,stance,gender,temperature_avg,aggressiveness
1,2006-07-23 21:52:30+00:00,13275,-73.949582,40.650104,Weather Extremes,0.575777,0,undefined,-1.114768,aggressive
7,2006-12-14 01:39:10+00:00,1092823,-122.41942,37.77493,Ideological Positions on Global Warming,-0.544195,0,male,4.22854,aggressive
8,2006-12-17 19:43:09+00:00,1278023,-79.79198,36.07264,Weather Extremes,-0.565028,2,male,5.478175,aggressive
9,2006-12-21 01:39:01+00:00,1455543,-121.80579,38.00492,Weather Extremes,0.65096,0,male,-1.652156,not aggressive
11,2006-12-31 10:47:25+00:00,1893063,-1.902691,52.479699,Weather Extremes,0.670905,0,male,4.864521,aggressive


In [10]:
# Parse the timestamp strings, then store the calendar date (time of day
# discarded) in a new "date" column
parsed_timestamps = pd.to_datetime(cc_twitter_endocedstance_df['created_at'])
cc_twitter_endocedstance_df['created_at'] = parsed_timestamps
cc_twitter_endocedstance_df['date'] = parsed_timestamps.dt.date

# Note: this binds a second name to the same dataframe; it is not a copy
cc_twitter_date_df = cc_twitter_endocedstance_df
cc_twitter_date_df.head()

Unnamed: 0,created_at,id,lng,lat,topic,sentiment,stance,gender,temperature_avg,aggressiveness,date
1,2006-07-23 21:52:30+00:00,13275,-73.949582,40.650104,Weather Extremes,0.575777,0,undefined,-1.114768,aggressive,2006-07-23
7,2006-12-14 01:39:10+00:00,1092823,-122.41942,37.77493,Ideological Positions on Global Warming,-0.544195,0,male,4.22854,aggressive,2006-12-14
8,2006-12-17 19:43:09+00:00,1278023,-79.79198,36.07264,Weather Extremes,-0.565028,2,male,5.478175,aggressive,2006-12-17
9,2006-12-21 01:39:01+00:00,1455543,-121.80579,38.00492,Weather Extremes,0.65096,0,male,-1.652156,not aggressive,2006-12-21
11,2006-12-31 10:47:25+00:00,1893063,-1.902691,52.479699,Weather Extremes,0.670905,0,male,4.864521,aggressive,2006-12-31


In [11]:
# Remove the full timestamp now that the date has its own column
cc_twitter_date_df = cc_twitter_date_df.drop(columns='created_at')

In [12]:
# Convert each date to a YYYYMMDD integer (e.g. 2006-07-23 -> 20060723) so the
# model receives a numeric feature; the string form is only an intermediate step
cc_twitter_date_df['date'] = (
    cc_twitter_date_df['date']
    .astype(str)
    # Literal (non-regex) removal of the dashes via the vectorised string accessor
    .str.replace('-', '', regex=False)
    .astype(int)
)

In [13]:
# Drop the "id" column, which does not contain useful information for the model,
# and the "sentiment" column, which this notebook deliberately excludes to
# measure its effect on the model
columns_to_remove = ["id", "sentiment"]
cc_twitter_date_df = cc_twitter_date_df.drop(columns=columns_to_remove)

In [14]:
# Export cleaned dataframe as a CSV file
# cc_twitter_date_df.to_csv("../climate_change_twitter_cleaned.csv")

In [15]:
# Export cleaned dataframe back to PGAdmin

In [16]:
# One-hot encode the categorical columns so that every feature is numeric
categorical_columns = ['topic', 'gender', 'aggressiveness']
cc_twitter_df = pd.get_dummies(cc_twitter_date_df, columns=categorical_columns)
cc_twitter_df.head(10)

Unnamed: 0,lng,lat,stance,temperature_avg,date,topic_Donald Trump versus Science,topic_Global stance,topic_Ideological Positions on Global Warming,topic_Impact of Resource Overconsumption,topic_Importance of Human Intervantion,topic_Politics,topic_Seriousness of Gas Emissions,topic_Significance of Pollution Awareness Events,topic_Undefined / One Word Hashtags,topic_Weather Extremes,gender_female,gender_male,gender_undefined,aggressiveness_aggressive,aggressiveness_not aggressive
1,-73.949582,40.650104,0,-1.114768,20060723,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0
7,-122.41942,37.77493,0,4.22854,20061214,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0
8,-79.79198,36.07264,2,5.478175,20061217,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0
9,-121.80579,38.00492,0,-1.652156,20061221,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1
11,-1.902691,52.479699,0,4.864521,20061231,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0
12,-73.949582,40.650104,0,15.600876,20070106,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0
15,-0.12574,51.50853,0,4.908487,20070108,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
21,-0.127647,51.507322,0,4.59847,20070110,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0
22,-0.127647,51.507322,0,4.59847,20070110,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0
23,-0.127647,51.507322,1,4.702218,20070111,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1


#### Split data into features (X) and target (y)

In [17]:
# Build the feature matrix: every column except the target
X = cc_twitter_df.drop(columns=["stance"])

In [18]:
# Isolate the target: the integer-encoded stance column
y = cc_twitter_df['stance']

In [19]:
# Check the features are correct ("stance" should no longer be among the columns)
X.head()

Unnamed: 0,lng,lat,temperature_avg,date,topic_Donald Trump versus Science,topic_Global stance,topic_Ideological Positions on Global Warming,topic_Impact of Resource Overconsumption,topic_Importance of Human Intervantion,topic_Politics,topic_Seriousness of Gas Emissions,topic_Significance of Pollution Awareness Events,topic_Undefined / One Word Hashtags,topic_Weather Extremes,gender_female,gender_male,gender_undefined,aggressiveness_aggressive,aggressiveness_not aggressive
1,-73.949582,40.650104,-1.114768,20060723,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0
7,-122.41942,37.77493,4.22854,20061214,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0
8,-79.79198,36.07264,5.478175,20061217,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0
9,-121.80579,38.00492,-1.652156,20061221,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1
11,-1.902691,52.479699,4.864521,20061231,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0


In [20]:
# Check the target is correct (displays the stance series)
y

1           0
7           0
8           2
9           0
11          0
           ..
15789396    1
15789404    1
15789405    0
15789407    1
15789408    0
Name: stance, Length: 5307538, dtype: int64

In [21]:
# Check the balance of our target values by counting the rows for each
# encoded stance (0 = neutral, 1 = believer, 2 = denier)
y.value_counts()

1    3947378
0     994843
2     365317
Name: stance, dtype: int64

In [22]:
# Split the features and target into training and testing sets.
# No test_size is passed, so scikit-learn's default split proportion applies;
# random_state fixes the shuffle so the split is repeatable.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

### Run the Random Forest Model without Sentiment

In [23]:
# Instantiate a balanced random forest classifier model.
# sampling_strategy, replacement and bootstrap are passed explicitly because
# imbalanced-learn changed the defaults of these three parameters in version
# 0.13. The values below are the earlier defaults, so the model is configured
# the same way regardless of the installed library version.
rf_model = BalancedRandomForestClassifier(
    n_estimators=100,
    sampling_strategy="auto",
    replacement=False,
    bootstrap=True,
    random_state=78,
)

# Fit the model on the training split (fit returns the estimator itself)
rf_model = rf_model.fit(X_train, y_train)

In [24]:
# Make predictions using the testing data on the trained model
# (one predicted stance code per row of X_test)
predictions = rf_model.predict(X_test)

In [25]:
# Score the test-set predictions with balanced accuracy rather than plain
# accuracy, since the stance classes are heavily imbalanced
acc_score = balanced_accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.5658951745406728

In [26]:
# Build the imbalanced-classification report for the test set, then print it
# so the table keeps its column alignment
imbalanced_report = classification_report_imbalanced(y_test, predictions)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.37      0.49      0.81      0.42      0.63      0.38    248765
          1       0.88      0.60      0.77      0.72      0.68      0.46    986618
          2       0.17      0.60      0.79      0.27      0.69      0.47     91502

avg / total       0.74      0.58      0.78      0.63      0.67      0.44   1326885



In [27]:
# Pair each importance with its column name and rank from most to least important
importance_by_feature = zip(rf_model.feature_importances_, X.columns)
features = sorted(importance_by_feature, reverse=True)

# Print one "name: (importance)" line per feature
for importance, feature_name in features:
    print(f"{feature_name}: ({importance})")

date: (0.3185286700722781)
temperature_avg: (0.2078205448594281)
lng: (0.19281436761219878)
lat: (0.18746585102481556)
topic_Donald Trump versus Science: (0.021569556114914918)
topic_Weather Extremes: (0.015105682608740064)
topic_Global stance: (0.011947789433995694)
aggressiveness_aggressive: (0.005736509755611559)
aggressiveness_not aggressive: (0.004904539855575742)
topic_Ideological Positions on Global Warming: (0.004747711291156033)
topic_Undefined / One Word Hashtags: (0.004667787259858081)
topic_Importance of Human Intervantion: (0.004375945728691927)
topic_Seriousness of Gas Emissions: (0.003826706901980601)
gender_male: (0.003386001037768265)
gender_female: (0.0032027012901892312)
topic_Significance of Pollution Awareness Events: (0.0031566775996044487)
topic_Politics: (0.002788220207465016)
topic_Impact of Resource Overconsumption: (0.0025364853013999855)
gender_undefined: (0.0014182520443279084)
