In [31]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler



# Goal: Classify a policy into adaptation or mitigation 
Due to the inherent imbalance in the dataset, accuracy will not be a good measure of performance for our models. Recall is slightly more important than precision here. Since we defined Adaptation as 1 and Mitigation as 0 and there are way more mitigation, we need to be very careful of our model labeling adaptation as mitigation (false negatives). 
<br>
<br>
Ideally, lets pick a model with > 0.7 F1 score because that is the industry standard with imbalanced datasets

In [32]:
def f1_score_adj(prec, rec):
    #the higher the beta, the more emphasis is put on recall
    beta = 2
    return (1 + beta**2) * ((prec * rec)/((beta**2 * prec) + rec))

In [33]:
data = pd.read_csv("data/20240301_San_Diego_AM_Policies.csv")
data["Policy"] = data["Policy"].astype(str)

In [34]:
data.head(5)

Unnamed: 0,Class,Policy
0,Mitigation,"Implement and enforce Title 18, Chapter 18.30,..."
1,Mitigation,"Implement and enforce Title 18, Chapter 18.30,..."
2,Mitigation,Publicize available incentive and rebate progr...
3,Mitigation,"Create a citywide “Energy Challenge,” similar ..."
4,Mitigation,"Implement and enforce Title 18, Chapter 18.30,..."


# Data Cleaning

We will be preprocessing the data in the following ways:
- removing stopwords, unnecessary punctation, capitalization
- converting Mitigation and Adaption Classes to 0 and 1 Respectively 

In [35]:
# nltk.download('stopwords')

In [36]:
data["Policy"] = data["Policy"].str.lower()
data["Policy"] = data["Policy"].apply(lambda x: " ".join([word for word in x.split() if word not in stopwords.words("english")]))
data["Policy"] = data["Policy"].str.replace(",", "").str.replace("“", "").str.replace("”", "").str.replace("(", "").str.replace(")", "")

In [37]:
data

Unnamed: 0,Class,Policy
0,Mitigation,implement enforce title 18 chapter 18.30 secti...
1,Mitigation,implement enforce title 18 chapter 18.30 secti...
2,Mitigation,publicize available incentive rebate programs ...
3,Mitigation,create citywide energy challenge similar depar...
4,Mitigation,implement enforce title 18 chapter 18.30 secti...
...,...,...
1587,Mitigation,complete pv installation public works yard car...
1588,Mitigation,join program increase grid-supply renewable ze...
1589,Mitigation,work waste haulers set citywide solid waste re...
1590,Mitigation,develop program track tree planting maintenanc...


In [38]:
data["Class"] = data["Class"].apply(lambda x: 1 if x =="Adaptation" else 0)
data

Unnamed: 0,Class,Policy
0,0,implement enforce title 18 chapter 18.30 secti...
1,0,implement enforce title 18 chapter 18.30 secti...
2,0,publicize available incentive rebate programs ...
3,0,create citywide energy challenge similar depar...
4,0,implement enforce title 18 chapter 18.30 secti...
...,...,...
1587,0,complete pv installation public works yard car...
1588,0,join program increase grid-supply renewable ze...
1589,0,work waste haulers set citywide solid waste re...
1590,0,develop program track tree planting maintenanc...


In [39]:
#ensure that we manipulated the data properly 
data["Class"].value_counts()

Class
0    1402
1     190
Name: count, dtype: int64

In [40]:
X = data['Policy']
y = data['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X.values.reshape(-1, 1), y)
X_resampled_df = pd.DataFrame(X_resampled)
X_resampled = X_resampled_df.iloc[:, 0]
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


# Naive Bayes

In [41]:
#done without oversampling
vectorizer = CountVectorizer()
X_nb = vectorizer.fit_transform(X)

X_train_nb, X_test_nb, y_train_nb, y_test_nb = train_test_split(X_nb, y, test_size=0.2, random_state=42)

clf = MultinomialNB()
clf.fit(X_train_nb, y_train_nb)
y_pred = clf.predict(X_test_nb)

precision, recall, _, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
accuracy = accuracy_score(y_test, y_pred)
f1_score_nb = f1_score_adj(precision, recall)
print("F1-score: " + str(f1_score_nb) + "\nAccuracy: " + str(accuracy))

F1-score: 0.6050955414012739
Accuracy: 0.9184952978056427


In [42]:
#done with oversampling (essentially, i am sampling more from the adaptation class than the mitigation class)
vectorizer = CountVectorizer()
X_nb = vectorizer.fit_transform(pd.Series(X_resampled))

X_train_nb, X_test_nb, y_train_nb, y_test_nb = train_test_split(X_nb, y_resampled, test_size=0.2, random_state=42)

clf = MultinomialNB()
clf.fit(X_train_nb, y_train_nb)
y_pred = clf.predict(X_test_nb)

precision, recall, _, _ = precision_recall_fscore_support(y_test_nb, y_pred, average='binary')
accuracy = accuracy_score(y_test_nb, y_pred)
f1_score_nb = f1_score_adj(precision, recall)
print("F1-score: " + str(f1_score_nb) + "\nAccuracy: " + str(accuracy))


F1-score: 0.961405926946933
Accuracy: 0.9376114081996435


Since the oversampling F1-score is a lot better, we will use that. Naive Bayes will use oversampling! Notice the accuracy is better without oversampling, but that is due to the imbalance in the dataset

# Logistic Regression

In [43]:
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression()
model.fit(X_train_vec, y_train)
y_pred = model.predict(X_test_vec)

precision, recall, _, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
accuracy = accuracy_score(y_test, y_pred)
f1_score_nb = f1_score_adj(precision, recall)
print("F1-score: " + str(f1_score_nb) + "\nAccuracy: " + str(accuracy))

F1-score: 0.23076923076923078
Accuracy: 0.9216300940438872


In [44]:
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train_resampled)
X_test_vec = vectorizer.transform(X_test_resampled)

model = LogisticRegression()
model.fit(X_train_vec, y_train_resampled)
y_pred = model.predict(X_test_vec)

precision, recall, _, _ = precision_recall_fscore_support(y_test_resampled, y_pred, average='binary')
accuracy = accuracy_score(y_test_resampled, y_pred)
f1_score_nb = f1_score_adj(precision, recall)
print("F1-score: " + str(f1_score_nb) + "\nAccuracy: " + str(accuracy))

F1-score: 0.9785615491009683
Accuracy: 0.9607843137254902
