### Introduction

In this notebook, we use a new dataset that includes three new columns:
1. `sentiment_score` and `compound_sentiment` are two kinds of sentiment scores generated from the text.
2. `topic_list` comes from the topic-modeling results. Each text is assigned one topic, and we keep the top 5 words for that topic.

We will use these two new kinds of features (sentiment scores and topic words) to see if they can improve our model.

### Import Packages

In [1]:
import numpy as np
import pandas as pd
import ast
import string
import re
import nltk
import json
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

### Read Data

In [2]:
# Load the workbook, discard the existing 'target' column and let
# 'target_relabelled' take over the name 'target'.
column_mapping = {'target_relabelled': 'target'}
train_df = (
    pd.read_excel("topic_model_sentiment.xlsx")
    .drop(columns='target')
    .rename(columns=column_mapping)
)
# Each topic_list cell is parsed with ast.literal_eval so the column holds
# Python objects (lists of words) rather than their text representation.
train_df['topic_list'] = train_df['topic_list'].apply(ast.literal_eval)
train_df.head(1)

Unnamed: 0,id,keyword,location,text,target,sentiment_score,compound_sentiment,topic_list
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,"[releas, trauma, earthquak, sever, issu]"


### Preprocess

In [3]:
# Shared resources for clean_text: the NLTK English stop-word list and a
# single Porter stemmer instance that is reused for every token.
stop_list = nltk.corpus.stopwords.words('english')
stemmer = nltk.stem.PorterStemmer()

def clean_text(text, LLM = False):
    """Clean one raw tweet.

    Links, @mentions, '#' signs and non-ASCII characters are removed and
    whitespace is normalised.

    Parameters
    ----------
    text : str
        Raw tweet text.
    LLM : bool, default False
        If True, stop after the character-level cleanup and return the cleaned
        string. If False, additionally tokenize, lower-case, keep only purely
        alphabetic tokens, drop stop words (``stop_list``) and stem each
        remaining token (``stemmer``).

    Returns
    -------
    str or list of str
        The cleaned string when ``LLM`` is True, otherwise the list of
        stemmed tokens.
    """
    text = re.sub(r'https?://\S+', '', text)    # remove links
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # remove @mentions
    text = re.sub(r'#', '', text)               # remove # but keep the hashtag word
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # replace runs of non-ASCII characters
    # Normalise whitespace last: the substitutions above can themselves leave
    # double, leading or trailing spaces. \s also covers line breaks.
    text = re.sub(r'\s+', ' ', text).strip()
    if not LLM:
        tokens = [t.lower() for t in nltk.word_tokenize(text)]  # tokenize + lower-case
        # keep alphabetic words only and drop stop words
        tokens = [t for t in tokens if re.search('^[a-z]+$', t) and t not in stop_list]
        text = [stemmer.stem(t) for t in tokens]  # stem words
    return text

def process_text(df, LLM = False):
    """Return a copy of ``df`` with cleaned-text columns added.

    ``text_clean`` holds ``clean_text(text, LLM)`` for every row. When ``LLM``
    is False, ``text_clean`` is a token list and an extra column
    ``text_clean_string`` holds those tokens joined by single spaces.
    The input frame itself is not modified.
    """
    processed = df.copy()
    processed['text_clean'] = processed['text'].apply(lambda raw: clean_text(raw, LLM))
    if not LLM:
        # Token lists -> one space-separated string per row
        processed['text_clean_string'] = processed['text_clean'].apply(" ".join)
    return processed

# Run the default (LLM=False) pipeline: adds the token-list column text_clean
# and the space-joined column text_clean_string to a copy of train_df.
train = process_text(train_df)
train.head(1)

Unnamed: 0,id,keyword,location,text,target,sentiment_score,compound_sentiment,topic_list,text_clean,text_clean_string
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,"[releas, trauma, earthquak, sever, issu]","[deed, reason, earthquak, may, allah, forgiv, us]",deed reason earthquak may allah forgiv us


In [4]:
# Append words in topic_list to text_clean_string
# Define a custom function to concatenate values 
def concatenate_values(row):
    """Return the row's cleaned text followed by its topic words.

    ``row`` must provide ``'text_clean_string'`` (a string) and
    ``'topic_list'`` (an iterable of strings). The topic words are joined with
    single spaces and appended to the text after one separating space.
    """
    topic_words = ' '.join(row['topic_list'])
    return ' '.join((row['text_clean_string'], topic_words))

# Build combined_string row by row: cleaned text followed by the topic words
train['combined_string'] = train.apply(concatenate_values, axis=1)
# Show the first row before and after the topic words were appended
print(train.iloc[0].text_clean_string, train.iloc[0].combined_string, sep='\n')

deed reason earthquak may allah forgiv us
deed reason earthquak may allah forgiv us releas trauma earthquak sever issu


In [5]:
# Check that the new combined_string column is present
train.head(1)

Unnamed: 0,id,keyword,location,text,target,sentiment_score,compound_sentiment,topic_list,text_clean,text_clean_string,combined_string
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,"[releas, trauma, earthquak, sever, issu]","[deed, reason, earthquak, may, allah, forgiv, us]",deed reason earthquak may allah forgiv us,deed reason earthquak may allah forgiv us rele...


### Word Frequency + Topic List

In [6]:
# Features are the combined text + topic-word strings; labels are the targets
X = train["combined_string"]
y = train['target'].tolist()

# 80/20 split, stratified on the label, with a fixed seed
X_train_df, X_test_df, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit the bag-of-words vocabulary on the training split only, then reuse it
# unchanged for the test split
count_vectorizer = CountVectorizer()
X_train = count_vectorizer.fit_transform(X_train_df)
X_test = count_vectorizer.transform(X_test_df)

# Logistic regression on the word counts
LR = LogisticRegression(solver='newton-cg').fit(X_train, y_train)

# F1 on both splits
train_predictions = LR.predict(X_train)
test_predictions = LR.predict(X_test)
print("F1 Score on Training Set:", f1_score(y_train, train_predictions))
print("F1 Score on Test Set:", f1_score(y_test, test_predictions))

F1 Score on Training Set: 0.9544186046511628
F1 Score on Test Set: 0.8096618357487922


In [8]:
# Save the fitted CountVectorizer object.
# The context manager makes sure the file is flushed and closed, even if
# pickling raises; a bare open() would leave that to garbage collection.
with open("count_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)

In [9]:
# Pull the fitted parameters out of the model, turning each array into
# (nested) Python lists via tolist()
coef_list, intercept_list, classes_list = (
    fitted.tolist() for fitted in (LR.coef_, LR.intercept_, LR.classes_)
)
# n_iter_ is kept as-is (no tolist() here)
n_iter = LR.n_iter_

# Custom JSON encoder so NumPy values can be passed straight to json.dump
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also understands NumPy arrays and NumPy scalars."""

    def default(self, obj):
        """Convert NumPy objects to built-in Python equivalents.

        Arrays become (nested) lists and NumPy scalars such as ``np.int64``
        or ``np.bool_`` become the matching Python scalar. Any other
        unsupported object is passed to the base class, which raises
        ``TypeError``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            # NumPy scalar types are not JSON serializable by default
            return obj.item()
        return super().default(obj)

# Collect the fitted parameters into one dictionary
json_dict = dict(
    coef=coef_list,
    intercept=intercept_list,
    classes=classes_list,
    n_iter=n_iter,
)

# Write the parameters as indented JSON; NumpyEncoder converts any value that
# is still a NumPy array (n_iter was not passed through tolist())
with open("best_model.json", "w") as f:
    json.dump(json_dict, f, cls=NumpyEncoder, indent=4)

In [10]:
# Inspect the test-split strings (the combined_string values, original row index kept)
X_test_df

4913    beach read august giveaway hop amp give away b...
2763    seen devast corp breach yet prepar get coverag...
3364    roosevelt evacu order due wildfir fire traumat...
3613    boy charg manslaught toddler report boy charg ...
20          ridicul like thunderstorm sink video structur
                              ...                        
4292    prophet peac upon said hellfir even give half ...
4817    differ moral system mine reject mass murder in...
4737    lava blast power red pantherattack siren torna...
1482    month payday short catastroph loan promot fina...
5716    video pick bodi water rescuer search hundr mig...
Name: combined_string, Length: 1523, dtype: object

In [11]:
# Save the test-split strings to CSV; index=False leaves the row index out of the file
X_test_df.to_csv("X_test.csv", index=False)

In [13]:
# Predict labels for the vectorized test split.
# NOTE(review): this is the same call that produced test_predictions in the
# training cell; y_pred is the name used by the cells that follow.
y_pred = LR.predict(X_test)

# Print the predictions
print(y_pred)

[0 0 1 ... 0 0 1]


In [14]:
# Put each test string next to its predicted label. X_test_df is a Series, so
# the frame keeps its row index.
result_df = pd.DataFrame({'Test Data': X_test_df, 'Prediction': y_pred})
print(result_df)

                                              Test Data  Prediction
4913  beach read august giveaway hop amp give away b...           0
2763  seen devast corp breach yet prepar get coverag...           0
3364  roosevelt evacu order due wildfir fire traumat...           1
3613  boy charg manslaught toddler report boy charg ...           1
20        ridicul like thunderstorm sink video structur           0
...                                                 ...         ...
4292  prophet peac upon said hellfir even give half ...           0
4817  differ moral system mine reject mass murder in...           1
4737  lava blast power red pantherattack siren torna...           0
1482  month payday short catastroph loan promot fina...           0
5716  video pick bodi water rescuer search hundr mig...           1

[1523 rows x 2 columns]
