In [1]:
import pandas as pd
from tqdm import tqdm
import os
from sqlalchemy import create_engine
import openai
from dotenv import load_dotenv
import requests
import time
import re

In [None]:
url = os.getenv('LOCAL_URL')

### Construct Long Sentences

In [3]:
# Directory containing the CSV files
input_dir = 'data/nj_3000_closest_original'

# List to store user_id and sentences
user_data = []

# Loop through each user_id and process the CSV file
for csv_file in tqdm(os.listdir(input_dir), desc="Processing sentences"):
    if csv_file.endswith('.csv'):
        user_id = csv_file.split('_')[1].split('.')[0]  # Extract user_id from filename
        df = pd.read_csv(os.path.join(input_dir, csv_file))

        # Create a long sentence from the DataFrame
        sentences = []
        for index, row in df.iterrows():
            sentence = f"{row['text']}."
            # sentence = f"At {row['created_at']}, {row['text']}."
            sentences.append(sentence)

        # Combine all sentences into one long string and add to the data list
        long_sentence = ' '.join(sentences)
        user_data.append([user_id, long_sentence])

# Create a DataFrame from the collected data
user_df = pd.DataFrame(user_data, columns=['user_id', 'sentence'])
user_df['user_id'] = pd.to_numeric(user_df['user_id'], errors='coerce').fillna(0).astype(int)
user_df = user_df.sort_values(by='user_id',inplace=False)
user_df = user_df.reset_index(drop=True)

# Display the DataFrame
print(user_df)

Processing sentences: 100%|██████████| 3000/3000 [00:13<00:00, 223.88it/s]

        user_id                                           sentence
0         51303  @MLSonNBCSports @NewYorkRedBulls Except I can'...
1         79903  RT @Gothamist: $125 Monthly MetroCard on the t...
2        317183  Looking for Python and Django meetups in New Y...
3        350373  RT @jasonsantamaria: Live stream for the oncom...
4        618233  RT @Instacane: Thanks to @jbarraud, we now hav...
...         ...                                                ...
2995  865795286  @kayla_jamesss can i chill at ur house during ...
2996  873828578  RT @fema: Receive @CityofNewarkNJ tweets via t...
2997  888929880  @chasingnj @dexbindra internet marketers say o...
2998  896686856  RT @OprahsLifeclass: "When you're at peace you...
2999  902762444  @loladuallo @kill_morgan lets all chill hurric...

[3000 rows x 2 columns]





In [4]:
# Directory containing the CSV files
input_followee_dir = 'data/nj_3000_closest_followees'

# List to store user_id and sentences
followee_data = []

# Loop through each user_id and process the CSV file
for csv_file in tqdm(os.listdir(input_followee_dir), desc="Processing sentences"):
    if csv_file.endswith('.csv'):
        user_id = csv_file.split('_')[1].split('.')[0]  # Extract user_id from filename
        df = pd.read_csv(os.path.join(input_followee_dir, csv_file))

        # Create a long sentence from the DataFrame
        sentences = []
        for index, row in df.iterrows():
            sentence = f"{row['text']}."
            # sentence = f"At {row['created_at']}, {row['text']}."
            sentences.append(sentence)

        # Combine all sentences into one long string and add to the data list
        long_sentence = ' '.join(sentences)
        followee_data.append([user_id, long_sentence])

# Create a DataFrame from the collected data
followee_df = pd.DataFrame(followee_data, columns=['user_id', 'sentence'])
followee_df['user_id'] = pd.to_numeric(followee_df['user_id'], errors='coerce').fillna(0).astype(int)
followee_df = followee_df.sort_values(by='user_id',inplace=False)
followee_df = followee_df.reset_index(drop=True)

# Display the DataFrame
print(followee_df)

Processing sentences: 100%|██████████| 3000/3000 [00:22<00:00, 135.49it/s]

        user_id                                           sentence
0         51303  RT @emjacobi: As the wind howls, I try to imag...
1         79903  Just saw huge flash of light from 14th street ...
2        317183  I feel like I'm reading one of those apocalypt...
3        350373  RT @jsjohnst: All lights just went out in the ...
4        618233  RT @KevinFarzad: Yes, Zooey Deschanel. It's ra...
...         ...                                                ...
2995  865795286  No freaking power #rathergotoschool. Jayesslee...
2996  873828578  RT @distressline: Feeling anxious, worried &am...
2997  888929880  President Obama has declared a major disaster ...
2998  896686856  Power on at store, shelves Barr but illuminate...
2999  902762444  i just want power 😟🌊❄. Can the power go back o...

[3000 rows x 2 columns]





In [5]:
def clean_text(text):
    return re.sub(r'[^A-Za-z0-9\s.,;!?\'"-@#]', '', text)

def filter_words(text):
    # Regular expression to match patterns more flexibly:
    regex = re.compile(r"""
    (\b\w+[-\w\s]*?\b) # First word or phrase, non-greedy
    \s*,?\s* # Comma followed by any spaces
    (\b\w+[-\w\s]*?\b) # Second word or phrase, non-greedy
    \s*,\s* # Comma followed by any spaces
    (and\s+)? # Optional 'and' followed by spaces
    (\b\w+[-\w\s]*?\b)? # Third word or phrase, non-greedy
    (?:\.?\s*? |$) # Ensuring it ends with whitespace or end of string
    """, re.VERBOSE | re.IGNORECASE)

    # Clean the text to remove extra spaces and correct common punctuation issues
    cleaned_text = re.sub(r'\s+', ' ', text.strip())  # Reduce multiple spaces to one
    match = regex.search(cleaned_text)
    if match:
        # Construct the matching string from groups, handling missing parts
        parts = [match.group(i) for i in range(1, 5) if match.group(i)]
        return ', '.join(parts).replace(' ,', ',').strip()
    else:
        return "No match found"

In [6]:
def user_tone_of_voice(client, sentence):
    response = client.chat.completions.create(
            model = 'gemma2',
            messages = [
                {"role": "system", "content": "You are an expert in textual emotional analysis."},
                {"role": "user", "content": f"A social media user has sent the following tweets '{sentence}'. Describe this user's overall tone of voice on the social media with three words. Only output these three words in the exact format: 'xxx, xxx, and xxx.'"}
                ],
         )
    tone_of_voice = response.choices[0].message.content.strip()
    return tone_of_voice


def user_attitude_closest(client, address, tone_of_voice, sentence):
    response = client.chat.completions.create(
            model = 'gemma2',
            messages = [
                {"role": "system", "content": "You are an expert in textual emotional analysis."},
                {"role": "user", "content": f"Suppose it is currently Oct. 29, 2024 19:30. Hurricane Oscar is about to make landfall near Brigantine, New Jersey as a Category 1 hurricane. Three days ago on Oct. 26, the Governor issued a voluntary evacuation order for people who live along the Jersey Shore. Today, most schools, casinos, colleges, and universities are closed. Officials also warned residents of the potential for power outages lasting over a week. U.S. President has also signed an emergency declaration for New Jersey. A Twitter user who is currently at {address} had sent the following tweets: '{sentence}' before the landfall. The above tweets has the tone of voice of {tone_of_voice}. Please use three words to describe this user's overall attitude towards Hurricane Oscar. Only output these three words in the exact format: 'xxx, xxx, and xxx.'"}
                ],
         )
    attitude = response.choices[0].message.content.strip()
    return attitude


def pred_sentence_closest(client, time, address, sentence, followee_tweets, tone_of_voice, attitude):
    response = client.chat.completions.create(
            model = 'gemma2',
            messages = [
                {"role": "system", "content": f"You are a resident in New Jersey who is currently at {address}. Your attitude towards Hurricane Oscar is {attitude} with past tone of voice of {tone_of_voice} on social media."},
                {"role": "user", "content": f"Suppose it is currently {time}. Three days before the landfall of Hurricane Oscar on Oct. 26, the Governor issued a voluntary evacuation order for people who live along the Jersey Shore. Today, most schools, casinos, colleges, and universities are closed. Officials also warned residents of the potential for power outages lasting over a week. {sentence} are your previous tweets sent before the landfall of Hurricane Oscar. Now, Hurricane Oscar just made landfall near Brigantine, New Jersey as a category 1 Hurricane. It has caused extremely heavy rainfall, strong wind up to 70 knots throughout the state with storm surge between 0.8m and 2.8m along the coast. Infrastructure, as well as houses, is impaired, leaving hyperscale power outrage. You currently see your followees' tweets {followee_tweets} on Twitter. Based on the above information, on a discrete scale of (0, 1, 2, 3, 4) where 0 indicates strongly negative sentiment, 1 indicates relatively negative sentiment, 2 indicates neutral sentiment, 3 indicates relatively positive sentiment, and 4 strongly positive sentiment, what is your sentiment score after the landfall of Hurricane Oscar? Only output one integer to indicate the score in the exact format 'score = x'."}
                ],
            )
    pred_content = response.choices[0].message.content.strip()
    return pred_content


In [9]:
generated_data_closest = []
nj_3000_closest_attributes = pd.read_csv('data/nj_3000_closest_address.csv')
nj_3000_closest_analysis = pd.read_csv('data/nj_3000_closest_generated_gemma2.csv', usecols=['user_id', 'tone_of_voice', 'attitude'])

client = openai.OpenAI(
    base_url = url,
    api_key = 'ollama'
)

for index, row in tqdm(nj_3000_closest_analysis.iterrows(), desc="Generating predictions", total=nj_3000_closest_analysis.shape[0]):
    user_id = int(row['user_id'])
    sentence = user_df[user_df['user_id'] == user_id]['sentence'].values[0]
    followee_tweets = followee_df[followee_df['user_id'] == user_id]['sentence'].values[0]
    time = nj_3000_closest_attributes[nj_3000_closest_attributes['user_id'] == user_id]['created_at'].values[0]
    address = nj_3000_closest_attributes[nj_3000_closest_attributes['user_id'] == user_id]['address'].values[0]
    tone_of_voice = nj_3000_closest_analysis [nj_3000_closest_analysis ['user_id'] == user_id]['tone_of_voice'].values[0]
    attitude = nj_3000_closest_analysis [nj_3000_closest_analysis ['user_id'] == user_id]['attitude'].values[0]
    pc = pred_sentence_closest(client, time, address, sentence, followee_tweets, tone_of_voice, attitude)
    pc = clean_text(pc)
    # print(pc)
    # fc = self_reflection(client, time, address, sentence, followee_tweets, tone_of_voice, attitude, pc)
    generated_data_closest.append([user_id, time, address, tone_of_voice, attitude, pc])

generated_data_closest_df = pd.DataFrame(generated_data_closest, columns=['user_id', 'created_at', 'address', 'tone_of_voice', 'attitude', 'sentiment'])
generated_data_closest_df.to_csv('data/nj_3000_closest_predsen_gemma2.csv', index=False)

Generating predictions: 100%|██████████| 1452/1452 [17:46<00:00,  1.36it/s]


OSError: Cannot save file into a non-existent directory: 'data\predsen'

## AFTER

In [11]:
# Directory containing the CSV files
input_dir = 'data/nj_3000_after_original'

# List to store user_id and sentences
user_data = []

# Loop through each user_id and process the CSV file
for csv_file in tqdm(os.listdir(input_dir), desc="Processing sentences"):
    if csv_file.endswith('.csv'):
        user_id = csv_file.split('_')[1].split('.')[0]  # Extract user_id from filename
        df = pd.read_csv(os.path.join(input_dir, csv_file))

        # Create a long sentence from the DataFrame
        sentences = []
        for index, row in df.iterrows():
            sentence = f"{row['text']}."
            # sentence = f"At {row['created_at']}, {row['text']}."
            sentences.append(sentence)

        # Combine all sentences into one long string and add to the data list
        long_sentence = ' '.join(sentences)
        user_data.append([user_id, long_sentence])

# Create a DataFrame from the collected data
user_df = pd.DataFrame(user_data, columns=['user_id', 'sentence'])
user_df['user_id'] = pd.to_numeric(user_df['user_id'], errors='coerce').fillna(0).astype(int)
user_df = user_df.sort_values(by='user_id',inplace=False)
user_df = user_df.reset_index(drop=True)

# Display the DataFrame
print(user_df)

Processing sentences: 100%|██████████| 3000/3000 [00:19<00:00, 157.85it/s]


        user_id                                           sentence
0         51303  @NewYorkRedBulls Will they bother to show it o...
1         79903  “The Power Is On” by The Go! Team is my new ja...
2        317183  One day after #Oscar and everybody's safe at h...
3        350373  RT @tmasteve: Central NJ tweeps. Sports Author...
4        618233  Firing up the generator in the morning is my n...
...         ...                                                ...
2995  865795286  downloading odee shows&amp;movies onto my lapt...
2996  873828578  RT @fema: Stay up-to-date on your #Oscar forec...
2997  888929880  Man gets unruly on NYC gas line http://t.co/QS...
2998  896686856  NBC 8:00PM EST - Hurricane Oscar Relief Teleth...
2999  902762444  Where the fucks the power #oscar. I depend on ...

[3000 rows x 2 columns]


In [12]:
# Directory containing the CSV files
input_followee_dir = 'data/nj_3000_after_followees'

# List to store user_id and sentences
followee_data = []

# Loop through each user_id and process the CSV file
for csv_file in tqdm(os.listdir(input_followee_dir), desc="Processing sentences"):
    if csv_file.endswith('.csv'):
        user_id = csv_file.split('_')[1].split('.')[0]  # Extract user_id from filename
        df = pd.read_csv(os.path.join(input_followee_dir, csv_file))

        # Create a long sentence from the DataFrame
        sentences = []
        for index, row in df.iterrows():
            sentence = f"{row['text']}."
            # sentence = f"At {row['created_at']}, {row['text']}."
            sentences.append(sentence)

        # Combine all sentences into one long string and add to the data list
        long_sentence = ' '.join(sentences)
        followee_data.append([user_id, long_sentence])

# Create a DataFrame from the collected data
followee_df = pd.DataFrame(followee_data, columns=['user_id', 'sentence'])
followee_df['user_id'] = pd.to_numeric(followee_df['user_id'], errors='coerce').fillna(0).astype(int)
followee_df = followee_df.sort_values(by='user_id',inplace=False)
followee_df = followee_df.reset_index(drop=True)

# Display the DataFrame
print(followee_df)

Processing sentences: 100%|██████████| 3000/3000 [00:22<00:00, 135.08it/s]

        user_id                                           sentence
0         51303  RT @emjacobi: As the wind howls, I try to imag...
1         79903  Just saw huge flash of light from 14th street ...
2        317183  I feel like I'm reading one of those apocalypt...
3        350373  RT @jsjohnst: All lights just went out in the ...
4        618233  RT @KevinFarzad: Yes, Zooey Deschanel. It's ra...
...         ...                                                ...
2995  865795286  No freaking power #rathergotoschool. Jayesslee...
2996  873828578  RT @distressline: Feeling anxious, worried &am...
2997  888929880  President Obama has declared a major disaster ...
2998  896686856  Power on at store, shelves Barr but illuminate...
2999  902762444  i just want power 😟🌊❄. Can the power go back o...

[3000 rows x 2 columns]





In [13]:
def user_attitude_after(client, address, tone_of_voice, sentence):
    response = client.chat.completions.create(
            model = 'gemma2',
            messages = [
                {"role": "system", "content": "You are an expert in textual emotional analysis."},
                {"role": "user", "content": f"Suppose it is currently Nov. 5, 2024. Hurricane Oscar made landfall near Atlantic City as a Category 1 hurricane a week ago. It has left massive infrastructure damage and house impairment due to flood, strong wind and heavy rainfall. The government has been performing disaster relief. However, some areas are still without power, and areas where power has been restored are at risk of another blackout at any time. A Twitter user who is currently at {address} had sent the following tweets: '{sentence}'. The above tweets has a tone of voice of {tone_of_voice}. Please use three words to describe this user's overall attitude towards Hurricane Oscar a week after landfall. Only output these three words in the exact format: 'xxx, xxx, and xxx.'"}
                ],
         )
    attitude = response.choices[0].message.content.strip()
    return attitude


def pred_sentence_after(client, time, address, sentence, followee_tweets, tone_of_voice, attitude):
    response = client.chat.completions.create(
            model = 'gemma2',
            messages = [
                {"role": "system", "content": f"You are a resident in New Jersey who is currently at {address}. Your attitude towards Hurricane Oscar is {attitude} with a tone of voice on social media of {tone_of_voice}."},
                {"role": "user", "content": f"Suppose it is currently {time}. It has been a week since the landfall of Hurricane Oscar. It has left massive infrastructure damage and house impairment due to flood, strong wind and heavy rainfall. The government has been performing disaster relief. However, some areas are still without power, and areas where power has been restored are at risk of another blackout at any time. {sentence} are your previous tweets. You see your followees' tweets {followee_tweets} on Twitter. Based on the above information, On a discrete scale of (0, 1, 2, 3, 4) where 0 indicate strongly negative sentiment, and 4 being strongly positive sentiment, what is your sentiment score a week after the landfall of Hurricane Oscar? Only output the score."}
                ],
            )
    pred_content = response.choices[0].message.content.strip()
    return pred_content

In [15]:
nj_3000_after_attributes = pd.read_csv('data/nj_3000_after_address.csv')
generated_data_after = []
nj_3000_after_analysis = pd.read_csv('data/nj_3000_after_generated_gemma2.csv', usecols=['user_id', 'tone_of_voice', 'attitude'])

client = openai.OpenAI(
    base_url = url,
    api_key = 'ollama'
)

for index, row in tqdm(nj_3000_after_analysis.iterrows(), desc="Generating predictions", total=nj_3000_after_analysis.shape[0]):
    user_id = int(row['user_id'])
    sentence = user_df[user_df['user_id'] == user_id]['sentence'].values[0]
    followee_tweets = followee_df[followee_df['user_id'] == user_id]['sentence'].values[0]
    time = nj_3000_after_attributes[nj_3000_after_attributes['user_id'] == user_id]['created_at'].values[0]
    address = nj_3000_after_attributes[nj_3000_after_attributes['user_id'] == user_id]['address'].values[0]
    tone_of_voice = nj_3000_after_analysis [nj_3000_after_analysis ['user_id'] == user_id]['tone_of_voice'].values[0]
    attitude = nj_3000_after_analysis [nj_3000_after_analysis ['user_id'] == user_id]['attitude'].values[0]
    pc = pred_sentence_after(client, time, address, sentence, followee_tweets, tone_of_voice, attitude)
    generated_data_after.append([user_id, time, address, tone_of_voice, attitude, pc])

generated_data_after_df = pd.DataFrame(generated_data_after, columns=['user_id', 'created_at', 'address', 'tone_of_voice', 'attitude', 'predicted_content'])
generated_data_after_df.to_csv('data/nj_3000_after_predsen_gemma2.csv', index=False)

Generating predictions: 100%|██████████| 2531/2531 [35:37<00:00,  1.18it/s]
