In [1]:
import pandas as pd
from tqdm import tqdm
import os
from sqlalchemy import create_engine
import openai
from dotenv import load_dotenv
import requests
import time
import re

### Construct Long Sentences

In [2]:
# Directory containing one CSV of tweets per user (closest-to-landfall window)
input_dir = 'data/ny_3000_closest_original'

# Accumulates [user_id, long_sentence] pairs, one per CSV file
user_data = []

# Turn every user's CSV into a single long string of tweets
for csv_file in tqdm(os.listdir(input_dir), desc="Processing sentences"):
    if not csv_file.endswith('.csv'):
        continue

    # Assumes file names look like '<prefix>_<user_id>.csv' -- TODO confirm
    user_id = csv_file.split('_')[1].split('.')[0]  # Extract user_id from filename
    df = pd.read_csv(os.path.join(input_dir, csv_file))

    # Iterate the 'text' column directly instead of df.iterrows() (no per-row
    # Series construction, no unused index). Rows whose text is missing are
    # skipped so the literal "nan." never leaks into the prompt text.
    long_sentence = ' '.join(f"{text}." for text in df['text'].dropna())
    user_data.append([user_id, long_sentence])

# Create a DataFrame from the collected data
user_df = pd.DataFrame(user_data, columns=['user_id', 'sentence'])
# Non-numeric ids are coerced to NaN and then mapped to 0 before the int cast
user_df['user_id'] = pd.to_numeric(user_df['user_id'], errors='coerce').fillna(0).astype(int)
user_df = user_df.sort_values(by='user_id').reset_index(drop=True)

# Display the DataFrame
print(user_df)

Processing sentences: 100%|██████████| 3000/3000 [00:03<00:00, 901.42it/s]

        user_id                                           sentence
0           744  RT @AHurricaneSandy: RETWEET IF U CAN TWERK LI...
1          2692  RT @AJELive: White House denies New York Times...
2          2949  RT @MikeBloomberg: Whenever or wherever #Sandy...
3          2967  Dumbo pre hurricane @ The Archway Under the Ma...
4          3818  Ok, huge fan of governor Christie... Definitel...
...         ...                                                ...
2995  893810257  This hurricane thing gotta stop like seriously...
2996  901296522  RT @tjholmes: NY weatherman just scared me int...
2997  905038452  Wondering how much prep I'll actually need for...
2998  909508580  RT @MAZARADii: "Best ballers come out of new y...
2999  910703906  At least my hair looks fab for you, Sandy. #Hu...

[3000 rows x 2 columns]





In [3]:
# Directory containing one CSV of followee tweets per user (closest-to-landfall window)
input_followee_dir = 'data/ny_3000_closest_followees'

# Accumulates [user_id, long_sentence] pairs, one per CSV file
followee_data = []

# Turn every user's followee CSV into a single long string of tweets
for csv_file in tqdm(os.listdir(input_followee_dir), desc="Processing sentences"):
    if not csv_file.endswith('.csv'):
        continue

    # Assumes file names look like '<prefix>_<user_id>.csv' -- TODO confirm
    user_id = csv_file.split('_')[1].split('.')[0]  # Extract user_id from filename
    df = pd.read_csv(os.path.join(input_followee_dir, csv_file))

    # Iterate the 'text' column directly instead of df.iterrows() (no per-row
    # Series construction, no unused index). Rows whose text is missing are
    # skipped so the literal "nan." never leaks into the prompt text.
    long_sentence = ' '.join(f"{text}." for text in df['text'].dropna())
    followee_data.append([user_id, long_sentence])

# Create a DataFrame from the collected data
followee_df = pd.DataFrame(followee_data, columns=['user_id', 'sentence'])
# Non-numeric ids are coerced to NaN and then mapped to 0 before the int cast
followee_df['user_id'] = pd.to_numeric(followee_df['user_id'], errors='coerce').fillna(0).astype(int)
followee_df = followee_df.sort_values(by='user_id').reset_index(drop=True)

# Display the DataFrame
print(followee_df)

Processing sentences: 100%|██████████| 3000/3000 [00:04<00:00, 615.01it/s]

        user_id                                           sentence
0           744  Huge Jerk Donald Trump Thinks Hurricane Is "Go...
1          2692  President @BarackObama: "Whenever an American ...
2          2949  MT @KatrinaNation: Such disasters remind why w...
3          2967  #oscar gram: Pulaski-as-debris-guard parking s...
4          3818  #NYS POWER #OUTAGE report 11PM: 1,591,335 NYer...
...         ...                                                ...
2995  893810257  Power's out, but don't fret, we downloaded eve...
2996  901296522  bet Mittens can't wait to give Cheney and Hall...
2997  905038452  RT @TimDavis_Author: Our sympathy and  support...
2998  909508580  Wow.... #Oscar is no longer a Hurricane... It'...
2999  910703906  RT @younglovee13: !!!!!! RT "@briannababy_: Yo...

[3000 rows x 2 columns]





In [4]:
def pred_sentence_closest(client, time, address, sentence, followee_tweets, tone_of_voice, attitude):
    """Ask the 'gemma2' chat model for the user's emotion right after landfall.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        time: Value interpolated as the current time in the user prompt.
        address: Value interpolated as the resident's location in the system prompt.
        sentence: The user's own earlier tweets, interpolated into the user prompt.
        followee_tweets: Tweets from the user's followees, interpolated into the user prompt.
        tone_of_voice: Tone description interpolated into the system prompt.
        attitude: Attitude description interpolated into the system prompt.

    Returns:
        The content of the first returned choice with surrounding whitespace removed.
    """
    # Persona: who the model is role-playing
    system_prompt = f"You are a resident in Long Island who is currently at {address}. Your attitude towards Hurricane Oscar is {attitude} with past tone of voice of {tone_of_voice} on social media."
    # Scenario, tweet context, and the one-word answer instruction
    user_prompt = f"Suppose it is currently {time}. Yesterday, U.S. President signed an emergency declaration for New York. Most schools, colleges, and universities are closed. Railroads, subways, and buses are suspended, along with bridges and tunnels. Shelters are opened for evacuations of residents. {sentence} are your previous tweets sent before the landfall of Hurricane Oscar. Now, Hurricane Oscar just made landfall 160km south of Long Island as a category 1 Hurricane. It has caused extremely heavy rainfall, strong wind and significant storm surge up to 12.65ft along Long Island, leaving over 14 square mile of flood. Infrastructure, as well as houses, is seriously impaired, leaving hyperscale power outages. You currently see your followees' tweets {followee_tweets} on Twitter. Based on the above information, what is your emotion after the landfall of Hurricane Oscar? Only output one word specifically from 'anger, disgust, fear, joy, sadness, surprise, and neutral' to indicate your emotion in the exact format 'xxx'."

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(model='gemma2', messages=conversation)

    # Only the first choice is used
    first_message = completion.choices[0].message
    return first_message.content.strip()

In [5]:
# Rows of [user_id, created_at, address, tone_of_voice, attitude, emotion]
generated_data_closest = []
ny_3000_closest_attributes = pd.read_csv('data/ny_3000_closest_address.csv')
ny_3000_closest_analysis = pd.read_csv('data/240729_output/ny_3000_closest_predsen_gemma2.csv', usecols=['user_id', 'tone_of_voice', 'attitude'])

client = openai.OpenAI(
    base_url = 'http://10.103.16.82:11434/v1/',
    api_key = 'ollama'
)

# Index every lookup table by user_id once, instead of re-scanning each frame
# with a boolean mask on every iteration. drop_duplicates keeps the first row
# per user, which is the row the previous `.values[0]` lookups selected.
followee_by_user = followee_df.drop_duplicates(subset='user_id').set_index('user_id')['sentence']
attributes_by_user = ny_3000_closest_attributes.drop_duplicates(subset='user_id').set_index('user_id')
analysis_by_user = ny_3000_closest_analysis.drop_duplicates(subset='user_id').set_index('user_id')

for index, row in tqdm(user_df.iterrows(), desc="Generating predictions", total=user_df.shape[0]):
    user_id = int(row['user_id'])
    # Users without a tone/attitude analysis are skipped
    if user_id not in analysis_by_user.index:
        continue
    sentence = row['sentence']
    # A user missing from the followee or attribute tables raises KeyError
    # carrying the offending user_id.
    followee_tweets = followee_by_user.loc[user_id]
    # Named `created_at` rather than `time` so the `time` module imported at the
    # top of the notebook is not overwritten in the kernel namespace.
    created_at = attributes_by_user.loc[user_id, 'created_at']
    address = attributes_by_user.loc[user_id, 'address']
    tone_of_voice = analysis_by_user.loc[user_id, 'tone_of_voice']
    attitude = analysis_by_user.loc[user_id, 'attitude']
    pc = pred_sentence_closest(client, created_at, address, sentence, followee_tweets, tone_of_voice, attitude)
    generated_data_closest.append([user_id, created_at, address, tone_of_voice, attitude, pc])

generated_data_closest_df = pd.DataFrame(generated_data_closest, columns=['user_id', 'created_at', 'address', 'tone_of_voice', 'attitude', 'emotion'])
generated_data_closest_df.to_csv('data/240729_output/ny_3000_closest_predemo_gemma2.csv', index=False)

Generating predictions: 100%|██████████| 3000/3000 [1:27:19<00:00,  1.75s/it]


## After Landfall

In [6]:
# Directory containing one CSV of tweets per user (after-landfall window)
input_dir = 'data/ny_3000_after_original'

# Accumulates [user_id, long_sentence] pairs, one per CSV file
user_data = []

# Turn every user's CSV into a single long string of tweets
for csv_file in tqdm(os.listdir(input_dir), desc="Processing sentences"):
    if not csv_file.endswith('.csv'):
        continue

    # Assumes file names look like '<prefix>_<user_id>.csv' -- TODO confirm
    user_id = csv_file.split('_')[1].split('.')[0]  # Extract user_id from filename
    df = pd.read_csv(os.path.join(input_dir, csv_file))

    # Iterate the 'text' column directly instead of df.iterrows() (no per-row
    # Series construction, no unused index). Rows whose text is missing are
    # skipped so the literal "nan." never leaks into the prompt text.
    long_sentence = ' '.join(f"{text}." for text in df['text'].dropna())
    user_data.append([user_id, long_sentence])

# Create a DataFrame from the collected data
user_df = pd.DataFrame(user_data, columns=['user_id', 'sentence'])
# Non-numeric ids are coerced to NaN and then mapped to 0 before the int cast
user_df['user_id'] = pd.to_numeric(user_df['user_id'], errors='coerce').fillna(0).astype(int)
user_df = user_df.sort_values(by='user_id').reset_index(drop=True)

# Display the DataFrame
print(user_df)

Processing sentences: 100%|██████████| 3000/3000 [00:20<00:00, 143.30it/s]

        user_id                                           sentence
0           744  Wish I knew how to ride a bike. Haven’t been t...
1          2692  RT @felixsalmon: BREAKING: @comfortablysmug to...
2          2949  RT @MikeBloomberg: Whenever or wherever #Sandy...
3          2967  RT @billmaher: Scientists say #HurricaneSandy ...
4          3818  @stukirby83 they think max 3 more days until p...
...         ...                                                ...
2995  893810257  This hurricane thing gotta stop like seriously...
2996  901296522  RT @mitchellreports: Red Cross tells us gratef...
2997  905038452  80% of LI is w/o power &amp; waterfront areas ...
2998  909508580  The Nets won their only Championship as The Ne...
2999  910703906  Nearly in tears bc the power is back... :')\n#...

[3000 rows x 2 columns]





In [7]:
# Directory containing one CSV of followee tweets per user (after-landfall window)
input_followee_dir = 'data/ny_3000_after_followees'

# Accumulates [user_id, long_sentence] pairs, one per CSV file
followee_data = []

# Turn every user's followee CSV into a single long string of tweets
for csv_file in tqdm(os.listdir(input_followee_dir), desc="Processing sentences"):
    if not csv_file.endswith('.csv'):
        continue

    # Assumes file names look like '<prefix>_<user_id>.csv' -- TODO confirm
    user_id = csv_file.split('_')[1].split('.')[0]  # Extract user_id from filename
    df = pd.read_csv(os.path.join(input_followee_dir, csv_file))

    # Iterate the 'text' column directly instead of df.iterrows() (no per-row
    # Series construction, no unused index). Rows whose text is missing are
    # skipped so the literal "nan." never leaks into the prompt text.
    long_sentence = ' '.join(f"{text}." for text in df['text'].dropna())
    followee_data.append([user_id, long_sentence])

# Create a DataFrame from the collected data
followee_df = pd.DataFrame(followee_data, columns=['user_id', 'sentence'])
# Non-numeric ids are coerced to NaN and then mapped to 0 before the int cast
followee_df['user_id'] = pd.to_numeric(followee_df['user_id'], errors='coerce').fillna(0).astype(int)
followee_df = followee_df.sort_values(by='user_id').reset_index(drop=True)

# Display the DataFrame
print(followee_df)

Processing sentences: 100%|██████████| 3000/3000 [00:24<00:00, 123.09it/s]

        user_id                                           sentence
0           744  Huge Jerk Donald Trump Thinks Hurricane Is "Go...
1          2692  President @BarackObama: "Whenever an American ...
2          2949  MT @KatrinaNation: Such disasters remind why w...
3          2967  #oscar gram: Pulaski-as-debris-guard parking s...
4          3818  #NYS POWER #OUTAGE report 11PM: 1,591,335 NYer...
...         ...                                                ...
2995  893810257  Power's out, but don't fret, we downloaded eve...
2996  901296522  bet Mittens can't wait to give Cheney and Hall...
2997  905038452  RT @TimDavis_Author: Our sympathy and  support...
2998  909508580  Wow.... #Oscar is no longer a Hurricane... It'...
2999  910703906  RT @younglovee13: !!!!!! RT "@briannababy_: Yo...

[3000 rows x 2 columns]





In [8]:
def pred_sentence_after(client, time, address, sentence, followee_tweets, tone_of_voice, attitude):
    """Ask the 'gemma2' chat model for the user's emotion one week after landfall.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        time: Value interpolated as the current time in the user prompt.
        address: Value interpolated as the resident's location in the system prompt.
        sentence: The user's own earlier tweets, interpolated into the user prompt.
        followee_tweets: Tweets from the user's followees, interpolated into the user prompt.
        tone_of_voice: Tone description interpolated into the system prompt.
        attitude: Attitude description interpolated into the system prompt.

    Returns:
        The content of the first returned choice with surrounding whitespace removed.
    """
    response = client.chat.completions.create(
            model = 'gemma2',
            messages = [
                {"role": "system", "content": f"You are a resident in Long Island who is currently at {address}. Your attitude towards Hurricane Oscar is {attitude} with a tone of voice on social media of {tone_of_voice}."},
                # The final instruction previously began with the typo "ased on ..."; it now reads "Based on ...".
                {"role": "user", "content": f"Suppose it is currently {time}. It has been a week since the landfall of Hurricane Oscar. It has left massive infrastructure damage and more than 100,000 house impairment due to flood, strong wind and heavy rainfall. It had led to 48 deaths on Long Island. The government has been performing disaster relief, especially on power networks. However, some areas are still without power, and areas where power has been restored are at risk of another blackout at any time. {sentence} are your previous tweets. You see your followees' tweets {followee_tweets} on Twitter. Based on the above information, what is your emotion after the landfall of Hurricane Oscar? Only output one word specifically from 'anger, disgust, fear, joy, sadness, surprise, and neutral' to indicate your emotion in the exact format 'xxx'."}
                ],
            )
    # Only the first choice is used
    pred_content = response.choices[0].message.content.strip()
    return pred_content

In [9]:
ny_3000_after_attributes = pd.read_csv('data/ny_3000_after_address.csv')
# Rows of [user_id, created_at, address, tone_of_voice, attitude, emotion]
generated_data_after = []
ny_3000_after_analysis = pd.read_csv('data/240729_output/ny_3000_after_predsen_gemma2.csv', usecols=['user_id', 'tone_of_voice', 'attitude'])


client = openai.OpenAI(
    base_url = 'http://10.103.16.82:11434/v1/',
    api_key = 'ollama'
)

# Index every lookup table by user_id once, instead of re-scanning each frame
# with a boolean mask on every iteration. drop_duplicates keeps the first row
# per user, which is the row the previous `.values[0]` lookups selected.
followee_by_user = followee_df.drop_duplicates(subset='user_id').set_index('user_id')['sentence']
attributes_by_user = ny_3000_after_attributes.drop_duplicates(subset='user_id').set_index('user_id')
analysis_by_user = ny_3000_after_analysis.drop_duplicates(subset='user_id').set_index('user_id')

for index, row in tqdm(user_df.iterrows(), desc="Generating predictions", total=user_df.shape[0]):
    user_id = int(row['user_id'])
    # Users without a tone/attitude analysis are skipped
    if user_id not in analysis_by_user.index:
        continue
    sentence = row['sentence']
    # A user missing from the followee or attribute tables raises KeyError
    # carrying the offending user_id.
    followee_tweets = followee_by_user.loc[user_id]
    # Named `created_at` rather than `time` so the `time` module imported at the
    # top of the notebook is not overwritten in the kernel namespace.
    created_at = attributes_by_user.loc[user_id, 'created_at']
    address = attributes_by_user.loc[user_id, 'address']
    tone_of_voice = analysis_by_user.loc[user_id, 'tone_of_voice']
    attitude = analysis_by_user.loc[user_id, 'attitude']
    pc = pred_sentence_after(client, created_at, address, sentence, followee_tweets, tone_of_voice, attitude)
    generated_data_after.append([user_id, created_at, address, tone_of_voice, attitude, pc])

generated_data_after_df = pd.DataFrame(generated_data_after, columns=['user_id', 'created_at', 'address', 'tone_of_voice', 'attitude', 'emotion'])
generated_data_after_df.to_csv('data/240729_output/ny_3000_after_predemo_gemma2.csv', index=False)

Generating predictions: 100%|██████████| 3000/3000 [1:21:26<00:00,  1.63s/it]
