In [3]:
import pandas as pd
from tqdm import tqdm
import os
from sqlalchemy import create_engine
import openai
from dotenv import load_dotenv
import requests
import time
import re

### Construct Long Sentences

In [4]:
# Folder holding one CSV of tweets per user (pre-landfall "closest" set)
input_dir = 'data/ny_3000_closest_original'

# Collected [user_id, long_sentence] pairs, one per CSV file
user_data = []

for file_name in tqdm(os.listdir(input_dir), desc="Processing sentences"):
    # Ignore anything in the folder that is not a CSV file
    if not file_name.endswith('.csv'):
        continue

    # User id = second '_'-separated token of the file name, cut at its first '.'
    uid = file_name.split('_')[1].split('.')[0]
    tweets = pd.read_csv(os.path.join(input_dir, file_name))

    # One "At <created_at>, <text>." fragment per row, joined with single spaces
    long_sentence = ' '.join(
        f"At {tweet['created_at']}, {tweet['text']}."
        for _, tweet in tweets.iterrows()
    )
    user_data.append([uid, long_sentence])

# Build the frame, coerce ids to int (unparseable ids become 0), order by id
user_df = (
    pd.DataFrame(user_data, columns=['user_id', 'sentence'])
    .assign(user_id=lambda frame: pd.to_numeric(frame['user_id'], errors='coerce').fillna(0).astype(int))
    .sort_values(by='user_id')
    .reset_index(drop=True)
)

# Display the DataFrame
print(user_df)

Processing sentences: 100%|██████████| 3000/3000 [00:15<00:00, 198.25it/s]

        user_id                                           sentence
0           744  At 2024-10-28 17:45:34-04:00, RT @AHurricaneSa...
1          2692  At 2024-10-20 23:59:14-04:00, RT @AJELive: Whi...
2          2949  At 2024-10-26 18:28:52-04:00, RT @MikeBloomber...
3          2967  At 2024-10-28 15:09:52-04:00, Dumbo pre hurric...
4          3818  At 2024-10-28 14:35:24-04:00, Ok, huge fan of ...
...         ...                                                ...
2995  893810257  At 2024-10-28 22:52:19-04:00, This hurricane t...
2996  901296522  At 2024-10-28 17:18:26-04:00, RT @tjholmes: NY...
2997  905038452  At 2024-10-25 22:23:21-04:00, Wondering how mu...
2998  909508580  At 2024-10-28 04:51:52-04:00, RT @MAZARADii: "...
2999  910703906  At 2024-10-29 14:32:39-04:00, At least my hair...

[3000 rows x 2 columns]





In [5]:
# Folder holding one CSV of followee tweets per user (pre-landfall "closest" set)
input_followee_dir = 'data/ny_3000_closest_followees'

# Collected [user_id, long_sentence] pairs, one per CSV file
followee_data = []

for file_name in tqdm(os.listdir(input_followee_dir), desc="Processing sentences"):
    # Only CSV files are processed
    if not file_name.endswith('.csv'):
        continue

    # User id = second '_'-separated token of the file name, cut at its first '.'
    uid = file_name.split('_')[1].split('.')[0]
    followee_tweets_frame = pd.read_csv(os.path.join(input_followee_dir, file_name))

    # One "At <created_at>, <text>." fragment per row, joined with single spaces
    long_sentence = ' '.join(
        f"At {tweet['created_at']}, {tweet['text']}."
        for _, tweet in followee_tweets_frame.iterrows()
    )
    followee_data.append([uid, long_sentence])

# Build the frame, coerce ids to int (unparseable ids become 0), order by id
followee_df = (
    pd.DataFrame(followee_data, columns=['user_id', 'sentence'])
    .assign(user_id=lambda frame: pd.to_numeric(frame['user_id'], errors='coerce').fillna(0).astype(int))
    .sort_values(by='user_id')
    .reset_index(drop=True)
)

# Display the DataFrame
print(followee_df)

Processing sentences: 100%|██████████| 3000/3000 [00:22<00:00, 134.33it/s]


        user_id                                           sentence
0           744  At 2024-10-30 13:01:36-04:00, Huge Jerk Donald...
1          2692  At 2024-10-30 15:20:34-04:00, President @Barac...
2          2949  At 2024-10-30 15:38:01-04:00, MT @KatrinaNatio...
3          2967  At 2024-10-30 00:12:28-04:00, #oscar gram: Pul...
4          3818  At 2024-10-29 23:18:09-04:00, #NYS POWER #OUTA...
...         ...                                                ...
2995  893810257  At 2024-10-29 21:15:45-04:00, Power's out, but...
2996  901296522  At 2024-10-29 21:27:58-04:00, bet Mittens can'...
2997  905038452  At 2024-10-30 14:25:30-04:00, RT @TimDavis_Aut...
2998  909508580  At 2024-10-29 19:31:35-04:00, Wow.... #Oscar i...
2999  910703906  At 2024-10-29 19:31:21-04:00, RT @younglovee13...

[3000 rows x 2 columns]


In [6]:
def clean_text(text):
    """Remove every character of ``text`` that is not on a small whitelist.

    Characters that are kept: ASCII letters, ASCII digits, whitespace, and the
    punctuation marks ``. , ; ! ? ' " - @ #``. Anything else (emoji, other
    symbols, non-ASCII letters) is deleted.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all non-whitelisted characters removed.
    """
    # The hyphen must be escaped: written bare between '"' and '@' it is read
    # as a character range, which silently whitelists $ % & ( ) * + / : < = >.
    return re.sub(r'[^A-Za-z0-9\s.,;!?\'"\-@#]', '', text)

def filter_words(text):
    """Extract a "word, word, and word" style list from a model reply.

    Whitespace in ``text`` is collapsed to single spaces, then the first
    occurrence of two comma-separated words/phrases, optionally followed by
    "and" and a third word/phrase, is searched for.

    Args:
        text: Raw reply text to search.

    Returns:
        The matched items joined with ", ". When the reply contains "and"
        before the third item, the conjunction is kept attached to that item
        (e.g. "Sarcastic, detached, and playful"). If nothing matches, the
        sentinel string "No match found" is returned; callers compare against
        this exact value.
    """
    # Regular expression to match patterns more flexibly:
    regex = re.compile(r"""
    (\b\w+[-\w\s]*?\b) # First word or phrase, non-greedy
    \s*,?\s* # Comma followed by any spaces
    (\b\w+[-\w\s]*?\b) # Second word or phrase, non-greedy
    \s*,\s* # Comma followed by any spaces
    (and\s+)? # Optional 'and' followed by spaces
    (\b\w+[-\w\s]*?\b)? # Third word or phrase, non-greedy
    (?:\.?\s*? |$) # Ensuring it ends with whitespace or end of string
    """, re.VERBOSE | re.IGNORECASE)

    # Trim the ends and reduce every whitespace run to a single space
    collapsed = re.sub(r'\s+', ' ', text.strip())
    match = regex.search(collapsed)
    if match is None:
        return "No match found"

    first, second, conjunction, third = match.groups()
    parts = [first, second]
    if third:
        # Keep "and" glued to the third item. Joining the conjunction as its
        # own part yields "x, y, and, z" with a stray comma after "and".
        parts.append(f"{conjunction.strip()} {third}" if conjunction else third)
    # A conjunction without a following item is dropped rather than left dangling.
    return ', '.join(parts).replace(' ,', ',').strip()


In [7]:
def user_tone_of_voice(client, sentence, model='llama3.1'):
    """Ask the chat model for three words describing a user's tone of voice.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        sentence: The user's tweets, already concatenated into one string.
        model: Name of the chat model to query. Defaults to ``'llama3.1'``.

    Returns:
        The reply text with surrounding whitespace stripped, or ``""`` when
        the reply carries no text content.
    """
    system_prompt = "You are an expert in textual emotional analysis."
    user_prompt = f"A social media user has sent the following tweets '{sentence}'. Describe this user's overall tone of voice on the social media with three words. Only output these three words in the exact format: 'xxx, xxx, and xxx.'"

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    # Guard against a reply without text so one empty answer cannot abort a long run.
    reply = response.choices[0].message.content
    return (reply or "").strip()


def user_attitude_closest(client, address, tone_of_voice, sentence, model='llama3.1'):
    """Ask the chat model for three words describing a user's pre-landfall attitude.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        address: Location text inserted into the prompt.
        tone_of_voice: Tone-of-voice description inserted into the prompt.
        sentence: The user's tweets, already concatenated into one string.
        model: Name of the chat model to query. Defaults to ``'llama3.1'``.

    Returns:
        The reply text with surrounding whitespace stripped, or ``""`` when
        the reply carries no text content.
    """
    system_prompt = "You are an expert in textual emotional analysis."
    user_prompt = f"Suppose it is currently Oct. 29, 2024 19:30. Hurricane Oscar is about to make landfall 160km south of Long Island as a Category 1 hurricane. Yesterday, U.S. President signed an emergency declaration for New York, especially Long Island. Most schools, colleges, and universities are closed. Railroads, subways, and buses are suspended, along with bridges and tunnels. Shelters are opened for evacuations of residents. A Twitter user who is currently at {address} had sent the following tweets: '{sentence}' before the landfall. The above tweets has the tone of voice of {tone_of_voice}. Please use three words to describe this user's overall attitude towards Hurricane Oscar. Only output these three words in the exact format: 'xxx, xxx, and xxx.'"

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    # Guard against a reply without text so one empty answer cannot abort a long run.
    reply = response.choices[0].message.content
    return (reply or "").strip()


def pred_sentence_closest(client, time, address, sentence, followee_tweets, tone_of_voice, attitude, model='llama3.1'):
    """Ask the chat model to write the user's immediate post-landfall tweet.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        time: Timestamp text inserted into the prompt as the current time.
            (Inside this function the name hides any ``time`` module.)
        address: Location text inserted into the system prompt.
        sentence: The user's previous tweets, concatenated into one string.
        followee_tweets: The followees' tweets, concatenated into one string.
        tone_of_voice: Tone-of-voice description inserted into the system prompt.
        attitude: Attitude description inserted into the system prompt.
        model: Name of the chat model to query. Defaults to ``'llama3.1'``.

    Returns:
        The reply text with surrounding whitespace stripped, or ``""`` when
        the reply carries no text content.
    """
    system_prompt = f"You are a resident in Long Island who is currently at {address}. Your attitude towards Hurricane Oscar is {attitude} with past tone of voice of {tone_of_voice} on social media."
    user_prompt = f"Suppose it is currently {time}. Yesterday, U.S. President signed an emergency declaration for New York. Most schools, colleges, and universities are closed. Railroads, subways, and buses are suspended, along with bridges and tunnels. Shelters are opened for evacuations of residents. {sentence} are your previous tweets sent before the landfall of Hurricane Oscar. Now, Hurricane Oscar just made landfall 160km south of Long Island as a category 1 Hurricane. It has caused extremely heavy rainfall, strong wind and significant storm surge up to 12.65ft along Long Island, leaving over 14 square mile of flood. Infrastructure, as well as houses, is seriously impaired, leaving hyperscale power outages. You currently see your followees' tweets {followee_tweets} on Twitter. Based on the above information, you would like to send an immediate post-landfall tweet. Only output the tweet."

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    # Guard against a reply without text so one empty answer cannot abort a long run.
    reply = response.choices[0].message.content
    return (reply or "").strip()

def self_reflection(client, time, address, sentence, followee_tweets, tone_of_voice, attitude, pred_content, model='llama3.1'):
    """Ask the chat model to check a drafted post-landfall tweet for consistency.

    The prompt asks the model to output the drafted tweet unchanged when it is
    consistent with the given attitude and tone of voice, and a more
    consistent tweet otherwise.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        time: Timestamp text inserted into the prompt as the current time.
            (Inside this function the name hides any ``time`` module.)
        address: Location text inserted into the system prompt.
        sentence: The user's previous tweets, concatenated into one string.
        followee_tweets: The followees' tweets, concatenated into one string.
        tone_of_voice: Tone-of-voice description inserted into both prompts.
        attitude: Attitude description inserted into both prompts.
        pred_content: The drafted tweet to be reviewed.
        model: Name of the chat model to query. Defaults to ``'llama3.1'``.

    Returns:
        The reply text with surrounding whitespace stripped, or ``""`` when
        the reply carries no text content.
    """
    system_prompt = f"You are a resident in Long Island who is currently at {address}. Your attitude towards Hurricane Oscar is {attitude} with a tone of voice of {tone_of_voice} on social media."
    user_prompt = f"Suppose it is currently {time}. Yesterday, U.S. President signed an emergency declaration for New York. Most schools, colleges, and universities are closed. Railroads, subways, and buses are suspended, along with bridges and tunnels. Shelters are opened for evacuations of residents. {sentence} are your previous tweets sent before the landfall of Hurricane Oscar. Now, Hurricane Oscar just made landfall 160km south of Long Island as a category 1 Hurricane. It has caused extremely heavy rainfall, strong wind and significant storm surge up to 12.65ft along Long Island, leaving over 14 square mile of flood. Infrastructure, as well as houses, is seriously impaired, leaving hyperscale power outages. You currently see your followees' tweets {followee_tweets} on Twitter. Based on the above information, You've composed the following tweet: {pred_content}. You want to reflect if this tweet conveys a consistent attitude of {attitude} and tone of voice {tone_of_voice}. If it is consistent, output the original tweet. If not, output a new tweet which is more consistent. Only output the tweet."

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    # Guard against a reply without text so one empty answer cannot abort a long run.
    reply = response.choices[0].message.content
    return (reply or "").strip()

In [8]:
# Rows of [user_id, created_at, address, tone_of_voice, attitude, predicted, reflected]
generated_data_closest = []
ny_3000_closest_attributes = pd.read_csv('data/ny_3000_closest_address.csv')

# OpenAI-compatible client pointed at the configured endpoint
client = openai.OpenAI(
    base_url = 'http://10.103.16.82:11434/v1/',
    api_key = 'ollama'
)

for index, row in tqdm(user_df.iterrows(), desc="Generating predictions", total=user_df.shape[0]):
    user_id = int(row['user_id'])
    sentence = row['sentence']
    followee_tweets = followee_df[followee_df['user_id'] == user_id]['sentence'].values[0]

    # Filter the attribute table once per user and read both fields from that slice.
    # The timestamp is kept in `created_at` rather than a global named `time`, which
    # would rebind the imported `time` module for every cell executed afterwards.
    user_attributes = ny_3000_closest_attributes[ny_3000_closest_attributes['user_id'] == user_id]
    created_at = user_attributes['created_at'].values[0]
    address = user_attributes['address'].values[0]

    # Step 1: tone of voice; users whose reply cannot be parsed are skipped
    tone_of_voice = user_tone_of_voice(client, sentence)
    tone_of_voice = filter_words(tone_of_voice)
    if tone_of_voice == "No match found":
        continue

    # Step 2: attitude; users whose reply cannot be parsed are skipped
    attitude = user_attitude_closest(client, address, tone_of_voice, sentence)
    attitude = filter_words(attitude)
    if attitude == "No match found":
        continue

    # Step 3: first-pass tweet
    pc = pred_sentence_closest(client, created_at, address, sentence, followee_tweets, tone_of_voice, attitude)
    pc = clean_text(pc)

    # Step 4: self-reflection pass over the first-pass tweet
    fc = self_reflection(client, created_at, address, sentence, followee_tweets, tone_of_voice, attitude, pc)
    fc = clean_text(fc)
    generated_data_closest.append([user_id, created_at, address, tone_of_voice, attitude, pc, fc])

generated_data_closest_df = pd.DataFrame(generated_data_closest, columns=['user_id', 'created_at', 'address', 'tone_of_voice', 'attitude', 'predicted_content', 'reflected_content'])
generated_data_closest_df.to_csv('data/ny_3000_closest_generated_llama3.1.csv', index=False)

Generating predictions: 100%|██████████| 3000/3000 [2:54:53<00:00,  3.50s/it]  


In [20]:
# user_id = 44499628
# sentence = user_df[user_df['user_id'] == user_id]['sentence'].values[0]
# followee_tweets = followee_df[followee_df['user_id'] == user_id]['sentence'].values[0]
# time = ny_3000_closest_attributes[ny_3000_closest_attributes['user_id'] == user_id]['created_at'].values[0]
# address = ny_3000_closest_attributes[ny_3000_closest_attributes['user_id'] == user_id]['address'].values[0]
# # print(followee_tweets)
# tone_of_voice = user_tone_of_voice(client, sentence)
# tone_of_voice = filter_words(tone_of_voice)
# print(tone_of_voice)
# attitude = user_attitude_closest(client, address, tone_of_voice, sentence)
# attitude = filter_words(attitude)
# print(attitude)
#
# pc = pred_sentence_closest(client, time, address, sentence, followee_tweets, tone_of_voice, attitude)
# pc = clean_text(pc)
# # print(pc)
# fc = self_reflection(client, time, address, sentence, followee_tweets, tone_of_voice, attitude, pc)
# fc = clean_text(fc)
# generated_data_closest.append([user_id, time, address, tone_of_voice, attitude, pc, fc])
# temp = [[user_id, time, address, tone_of_voice, attitude, pc, fc]]
# temp_df = pd.DataFrame(temp, columns=['user_id', 'created_at', 'address', 'tone_of_voice', 'attitude', 'predicted_content', 'reflected_content'])
# temp_df.to_csv('data/ny_44499628_closest_generated_llama3.1.csv', index=False)

Sarcastic, detached, and, playful
Apathetic, humorous, and, nonchalant


## AFTER

In [9]:
# Folder holding one CSV of tweets per user ("after" set)
input_dir = 'data/ny_3000_after_original'

# Collected [user_id, long_sentence] pairs, one per CSV file
user_data = []

for file_name in tqdm(os.listdir(input_dir), desc="Processing sentences"):
    # Ignore anything in the folder that is not a CSV file
    if not file_name.endswith('.csv'):
        continue

    # User id = second '_'-separated token of the file name, cut at its first '.'
    uid = file_name.split('_')[1].split('.')[0]
    tweets = pd.read_csv(os.path.join(input_dir, file_name))

    # One "At <created_at>, <text>." fragment per row, joined with single spaces
    long_sentence = ' '.join(
        f"At {tweet['created_at']}, {tweet['text']}."
        for _, tweet in tweets.iterrows()
    )
    user_data.append([uid, long_sentence])

# Build the frame, coerce ids to int (unparseable ids become 0), order by id
user_df = (
    pd.DataFrame(user_data, columns=['user_id', 'sentence'])
    .assign(user_id=lambda frame: pd.to_numeric(frame['user_id'], errors='coerce').fillna(0).astype(int))
    .sort_values(by='user_id')
    .reset_index(drop=True)
)

# Display the DataFrame
print(user_df)

Processing sentences: 100%|██████████| 3000/3000 [00:19<00:00, 151.12it/s]


        user_id                                           sentence
0           744  At 2024-11-01 23:04:14-04:00, Wish I knew how ...
1          2692  At 2024-11-02 15:45:20-04:00, RT @felixsalmon:...
2          2949  At 2024-10-26 18:28:52-04:00, RT @MikeBloomber...
3          2967  At 2024-10-29 20:45:03-04:00, RT @billmaher: S...
4          3818  At 2024-10-31 09:54:03-04:00, @stukirby83 they...
...         ...                                                ...
2995  893810257  At 2024-10-28 22:52:19-04:00, This hurricane t...
2996  901296522  At 2024-10-30 15:33:32-04:00, RT @mitchellrepo...
2997  905038452  At 2024-10-30 14:25:46-04:00, 80% of LI is w/o...
2998  909508580  At 2024-10-29 13:41:03-04:00, The Nets won the...
2999  910703906  At 2024-10-30 17:14:22-04:00, Nearly in tears ...

[3000 rows x 2 columns]


In [10]:
# Folder holding one CSV of followee tweets per user ("after" set)
input_followee_dir = 'data/ny_3000_after_followees'

# Collected [user_id, long_sentence] pairs, one per CSV file
followee_data = []

for file_name in tqdm(os.listdir(input_followee_dir), desc="Processing sentences"):
    # Only CSV files are processed
    if not file_name.endswith('.csv'):
        continue

    # User id = second '_'-separated token of the file name, cut at its first '.'
    uid = file_name.split('_')[1].split('.')[0]
    followee_tweets_frame = pd.read_csv(os.path.join(input_followee_dir, file_name))

    # One "At <created_at>, <text>." fragment per row, joined with single spaces
    long_sentence = ' '.join(
        f"At {tweet['created_at']}, {tweet['text']}."
        for _, tweet in followee_tweets_frame.iterrows()
    )
    followee_data.append([uid, long_sentence])

# Build the frame, coerce ids to int (unparseable ids become 0), order by id
followee_df = (
    pd.DataFrame(followee_data, columns=['user_id', 'sentence'])
    .assign(user_id=lambda frame: pd.to_numeric(frame['user_id'], errors='coerce').fillna(0).astype(int))
    .sort_values(by='user_id')
    .reset_index(drop=True)
)

# Display the DataFrame
print(followee_df)

Processing sentences: 100%|██████████| 3000/3000 [00:22<00:00, 133.33it/s]

        user_id                                           sentence
0           744  At 2024-10-30 13:01:36-04:00, Huge Jerk Donald...
1          2692  At 2024-10-30 15:20:34-04:00, President @Barac...
2          2949  At 2024-10-30 15:38:01-04:00, MT @KatrinaNatio...
3          2967  At 2024-10-30 00:12:28-04:00, #oscar gram: Pul...
4          3818  At 2024-10-29 23:18:09-04:00, #NYS POWER #OUTA...
...         ...                                                ...
2995  893810257  At 2024-10-29 21:15:45-04:00, Power's out, but...
2996  901296522  At 2024-10-29 21:27:58-04:00, bet Mittens can'...
2997  905038452  At 2024-10-30 14:25:30-04:00, RT @TimDavis_Aut...
2998  909508580  At 2024-10-29 19:31:35-04:00, Wow.... #Oscar i...
2999  910703906  At 2024-10-29 19:31:21-04:00, RT @younglovee13...

[3000 rows x 2 columns]





In [11]:
def user_attitude_after(client, address, tone_of_voice, sentence, model='llama3.1'):
    """Ask the chat model for three words describing a user's attitude a week after landfall.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        address: Location text inserted into the prompt.
        tone_of_voice: Tone-of-voice description inserted into the prompt.
        sentence: The user's tweets, already concatenated into one string.
        model: Name of the chat model to query. Defaults to ``'llama3.1'``.

    Returns:
        The reply text with surrounding whitespace stripped, or ``""`` when
        the reply carries no text content.
    """
    system_prompt = "You are an expert in textual emotional analysis."
    user_prompt = f"Suppose it is currently Nov. 5, 2024. Hurricane Oscar made landfall 160km south of Long Island as a Category 1 hurricane a week ago. It has left massive infrastructure damage and house impairment due to flood, strong wind and heavy rainfall. The government has been performing disaster relief. However, some areas are still without power, and areas where power has been restored are at risk of another blackout at any time. A Twitter user who is currently at {address} had sent the following tweets: '{sentence}'. The above tweets has a tone of voice of {tone_of_voice}. Please use three words to describe this user's overall attitude towards Hurricane Oscar a week after landfall. Only output these three words in the exact format: 'xxx, xxx, and xxx.'"

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    # Guard against a reply without text so one empty answer cannot abort a long run.
    reply = response.choices[0].message.content
    return (reply or "").strip()


def pred_sentence_after(client, time, address, sentence, followee_tweets, tone_of_voice, attitude, model='llama3.1'):
    """Ask the chat model to write the user's tweet one week after landfall.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        time: Timestamp text inserted into the prompt as the current time.
            (Inside this function the name hides any ``time`` module.)
        address: Location text inserted into the system prompt.
        sentence: The user's previous tweets, concatenated into one string.
        followee_tweets: The followees' tweets, concatenated into one string.
        tone_of_voice: Tone-of-voice description inserted into the system prompt.
        attitude: Attitude description inserted into the system prompt.
        model: Name of the chat model to query. Defaults to ``'llama3.1'``.

    Returns:
        The reply text with surrounding whitespace stripped, or ``""`` when
        the reply carries no text content.
    """
    system_prompt = f"You are a resident in Long Island who is currently at {address}. Your attitude towards Hurricane Oscar is {attitude} with a tone of voice on social media of {tone_of_voice}."
    user_prompt = f"Suppose it is currently {time}. It has been a week since the landfall of Hurricane Oscar. It has left massive infrastructure damage and more than 100,000 house impairment due to flood, strong wind and heavy rainfall. It had led to 48 deaths on Long Island. The government has been performing disaster relief, especially on power networks. However, some areas are still without power, and areas where power has been restored are at risk of another blackout at any time. {sentence} are your previous tweets. You see your followees' tweets {followee_tweets} on Twitter. Based on the above information, you would like to send an new tweet. Only output the tweet."

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    # Guard against a reply without text so one empty answer cannot abort a long run.
    reply = response.choices[0].message.content
    return (reply or "").strip()

def self_reflection_after(client, time, address, sentence, followee_tweets, tone_of_voice, attitude, pred_content, model='llama3.1'):
    """Ask the chat model to check a drafted week-after tweet for consistency.

    The prompt asks the model to output the drafted tweet unchanged when it is
    consistent with the given attitude and tone of voice, and a more
    consistent tweet otherwise.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        time: Timestamp text inserted into the prompt as the current time.
            (Inside this function the name hides any ``time`` module.)
        address: Location text inserted into the system prompt.
        sentence: The user's previous tweets, concatenated into one string.
        followee_tweets: The followees' tweets, concatenated into one string.
        tone_of_voice: Tone-of-voice description inserted into both prompts.
        attitude: Attitude description inserted into both prompts.
        pred_content: The drafted tweet to be reviewed.
        model: Name of the chat model to query. Defaults to ``'llama3.1'``.

    Returns:
        The reply text with surrounding whitespace stripped, or ``""`` when
        the reply carries no text content.
    """
    system_prompt = f"You are a resident in Long Island who is currently at {address}. Your attitude towards Hurricane Oscar is {attitude} with a tone of voice of {tone_of_voice} on social media."
    user_prompt = f"Suppose it is currently {time}. It has been a week since the landfall of Hurricane Oscar. It has left massive infrastructure damage and more than 100,000 house impairment due to flood, strong wind and heavy rainfall. It had led to 48 deaths on Long Island. The government has been performing disaster relief, especially on power networks. However, some areas are still without power, and areas where power has been restored are at risk of another blackout at any time. {sentence} are your previous tweets. You see your followees' tweets {followee_tweets} on Twitter. Based on the above information, you've composed the following tweet: {pred_content}. You want to reflect if this tweet conveys a consistent attitude of {attitude} and tone of voice {tone_of_voice}. If it is consistent, output the original tweet, if not, output a new tweet which is more consistent. Only output the tweet."

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    # Guard against a reply without text so one empty answer cannot abort a long run.
    reply = response.choices[0].message.content
    return (reply or "").strip()

In [12]:
ny_3000_after_attributes = pd.read_csv('data/ny_3000_after_address.csv')
# Rows of [user_id, created_at, address, tone_of_voice, attitude, predicted, reflected]
generated_data_after = []

# OpenAI-compatible client pointed at the configured endpoint
client = openai.OpenAI(
    base_url = 'http://10.103.16.82:11434/v1/',
    api_key = 'ollama'
)

for index, row in tqdm(user_df.iterrows(), desc="Generating predictions", total=user_df.shape[0]):
    user_id = int(row['user_id'])
    sentence = row['sentence']
    followee_tweets = followee_df[followee_df['user_id'] == user_id]['sentence'].values[0]

    # Filter the attribute table once per user and read both fields from that slice.
    # The timestamp is kept in `created_at` rather than a global named `time`, which
    # would rebind the imported `time` module for every cell executed afterwards.
    user_attributes = ny_3000_after_attributes[ny_3000_after_attributes['user_id'] == user_id]
    created_at = user_attributes['created_at'].values[0]
    address = user_attributes['address'].values[0]

    # Step 1: tone of voice; users whose reply cannot be parsed are skipped
    tone_of_voice = user_tone_of_voice(client, sentence)
    tone_of_voice = filter_words(tone_of_voice)
    if tone_of_voice == "No match found":
        continue

    # Step 2: attitude a week after landfall; unparseable replies are skipped
    attitude = user_attitude_after(client, address, tone_of_voice, sentence)
    attitude = filter_words(attitude)
    if attitude == "No match found":
        continue

    # Step 3: first-pass tweet
    pc = pred_sentence_after(client, created_at, address, sentence, followee_tweets, tone_of_voice, attitude)
    pc = clean_text(pc)

    # Step 4: self-reflection pass over the first-pass tweet
    fc = self_reflection_after(client, created_at, address, sentence, followee_tweets, tone_of_voice, attitude, pc)
    fc = clean_text(fc)
    generated_data_after.append([user_id, created_at, address, tone_of_voice, attitude, pc, fc])

generated_data_after_df = pd.DataFrame(generated_data_after, columns=['user_id', 'created_at', 'address', 'tone_of_voice', 'attitude', 'predicted_content', 'reflected_content'])
generated_data_after_df.to_csv('data/ny_3000_after_generated_llama3.1.csv', index=False)

Generating predictions: 100%|██████████| 3000/3000 [6:48:03<00:00,  8.16s/it]  


In [12]:
# Show the first-pass tweets (the column filled from `pc`, before the reflection step)
generated_data_after_df.loc[:, 'predicted_content']

0       It's amazing how resilient NYC is after Hurric...
1       It's been a week since Hurricane Oscar slammed...
2       It's been a week since Oscar hit, and while th...
3       Still no power here in DUMBO and it's beyond f...
4       Power's back at 75 Wall!  So relieved but stil...
                              ...                        
2995    Finally got wifi back  this hurricane drama is...
2996    Stay safe everyone, it's been a rough week wit...
2997    Power's back!  So grateful for this little lig...
2998    Glad some power's back on around here but it f...
2999    It's been a week since Hurricane Oscar slammed...
Name: predicted_content, Length: 3000, dtype: object