In [1]:
import pandas as pd
from tqdm import tqdm
import os
from sqlalchemy import create_engine
import openai
from dotenv import load_dotenv
import requests
import time
import re

### Construct Long Sentences

In [2]:
# Directory holding one CSV of tweets per user (closest-to-landfall dataset)
input_dir = 'data/ny_3000_closest_original'

# Keep only CSV files and sort them: os.listdir returns entries in arbitrary
# order, and filtering first makes the progress-bar total match the real work.
csv_files = sorted(name for name in os.listdir(input_dir) if name.endswith('.csv'))

# Rows of [user_id, long_sentence], one per CSV file
user_data = []

for csv_file in tqdm(csv_files, desc="Processing sentences"):
    # user_id is the second '_'-separated token of the filename, cut at its first '.'
    user_id = csv_file.split('_')[1].split('.')[0]
    df = pd.read_csv(os.path.join(input_dir, csv_file))

    # Walk the two needed columns directly; df.iterrows() would build a full
    # Series object for every row only to read these two fields.
    sentences = [
        f"At {created_at}, {text}."
        for created_at, text in zip(df['created_at'], df['text'])
    ]

    # All of a user's tweets become one space-separated string
    user_data.append([user_id, ' '.join(sentences)])

# Assemble the per-user table
user_df = pd.DataFrame(user_data, columns=['user_id', 'sentence'])
# IDs that fail numeric parsing become NaN and are then mapped to 0 before the int cast
user_df['user_id'] = pd.to_numeric(user_df['user_id'], errors='coerce').fillna(0).astype(int)
user_df = user_df.sort_values(by='user_id').reset_index(drop=True)

# Display the DataFrame
print(user_df)

Processing sentences: 100%|██████████| 3000/3000 [00:18<00:00, 160.00it/s]

        user_id                                           sentence
0           744  At 2024-10-28 17:45:34-04:00, RT @AHurricaneSa...
1          2692  At 2024-10-20 23:59:14-04:00, RT @AJELive: Whi...
2          2949  At 2024-10-26 18:28:52-04:00, RT @MikeBloomber...
3          2967  At 2024-10-28 15:09:52-04:00, Dumbo pre hurric...
4          3818  At 2024-10-28 14:35:24-04:00, Ok, huge fan of ...
...         ...                                                ...
2995  893810257  At 2024-10-28 22:52:19-04:00, This hurricane t...
2996  901296522  At 2024-10-28 17:18:26-04:00, RT @tjholmes: NY...
2997  905038452  At 2024-10-25 22:23:21-04:00, Wondering how mu...
2998  909508580  At 2024-10-28 04:51:52-04:00, RT @MAZARADii: "...
2999  910703906  At 2024-10-29 14:32:39-04:00, At least my hair...

[3000 rows x 2 columns]





In [3]:
# Directory holding one CSV of followee tweets per user (closest-to-landfall dataset)
input_followee_dir = 'data/ny_3000_closest_followees'

# Keep only CSV files and sort them: os.listdir returns entries in arbitrary
# order, and filtering first makes the progress-bar total match the real work.
followee_csv_files = sorted(
    name for name in os.listdir(input_followee_dir) if name.endswith('.csv')
)

# Rows of [user_id, long_sentence], one per CSV file
followee_data = []

for csv_file in tqdm(followee_csv_files, desc="Processing sentences"):
    # user_id is the second '_'-separated token of the filename, cut at its first '.'
    user_id = csv_file.split('_')[1].split('.')[0]
    df = pd.read_csv(os.path.join(input_followee_dir, csv_file))

    # Walk the two needed columns directly; df.iterrows() would build a full
    # Series object for every row only to read these two fields.
    sentences = [
        f"At {created_at}, {text}."
        for created_at, text in zip(df['created_at'], df['text'])
    ]

    # All tweets in the file become one space-separated string
    followee_data.append([user_id, ' '.join(sentences)])

# Assemble the per-user table
followee_df = pd.DataFrame(followee_data, columns=['user_id', 'sentence'])
# IDs that fail numeric parsing become NaN and are then mapped to 0 before the int cast
followee_df['user_id'] = pd.to_numeric(followee_df['user_id'], errors='coerce').fillna(0).astype(int)
followee_df = followee_df.sort_values(by='user_id').reset_index(drop=True)

# Display the DataFrame
print(followee_df)

Processing sentences: 100%|██████████| 3000/3000 [00:23<00:00, 125.98it/s]

        user_id                                           sentence
0           744  At 2024-10-30 13:01:36-04:00, Huge Jerk Donald...
1          2692  At 2024-10-30 15:20:34-04:00, President @Barac...
2          2949  At 2024-10-30 15:38:01-04:00, MT @KatrinaNatio...
3          2967  At 2024-10-30 00:12:28-04:00, #oscar gram: Pul...
4          3818  At 2024-10-29 23:18:09-04:00, #NYS POWER #OUTA...
...         ...                                                ...
2995  893810257  At 2024-10-29 21:15:45-04:00, Power's out, but...
2996  901296522  At 2024-10-29 21:27:58-04:00, bet Mittens can'...
2997  905038452  At 2024-10-30 14:25:30-04:00, RT @TimDavis_Aut...
2998  909508580  At 2024-10-29 19:31:35-04:00, Wow.... #Oscar i...
2999  910703906  At 2024-10-29 19:31:21-04:00, RT @younglovee13...

[3000 rows x 2 columns]





In [4]:
def clean_text(text):
    """Remove every character outside a small whitelist from ``text``.

    Kept characters: ASCII letters and digits, whitespace, and the
    punctuation ``. , ; ! ? ' " - @ #``. Anything else (emoji, symbols,
    non-ASCII letters, ...) is deleted.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all non-whitelisted characters removed.
    """
    # The hyphen sits at the END of the class so it is a literal '-'.
    # Written in the middle as `"-@` it formed a range from '"' to '@',
    # which silently let $ % & ( ) * + / : < = > through the filter.
    return re.sub(r'[^A-Za-z0-9\s.,;!?\'"@#-]', '', text)

def filter_words(text):
    """Extract the first comma-separated word list found in ``text``.

    Whitespace runs in ``text`` are collapsed to single spaces, then the
    first occurrence of a "first[,] second, [and ]third" shaped phrase is
    searched for. The non-empty captured pieces (including the optional
    "and " piece, when present) are joined with ", ".

    Args:
        text: Free-form string to scan.

    Returns:
        The joined captured pieces, or the literal string
        ``"No match found"`` when the pattern does not occur.
    """
    # Normalise spacing first so the pattern only ever sees single spaces
    normalized = re.sub(r'\s+', ' ', text.strip())

    # Verbose-mode pattern: whitespace and '#' comments inside it are ignored by re
    list_pattern = re.compile(r"""
    (\b\w+[-\w\s]*?\b) # First word or phrase, non-greedy
    \s*,?\s* # Comma followed by any spaces
    (\b\w+[-\w\s]*?\b) # Second word or phrase, non-greedy
    \s*,\s* # Comma followed by any spaces
    (and\s+)? # Optional 'and' followed by spaces
    (\b\w+[-\w\s]*?\b)? # Third word or phrase, non-greedy
    (?:\.?\s*? |$) # Ensuring it ends with whitespace or end of string
    """, re.VERBOSE | re.IGNORECASE)

    found = list_pattern.search(normalized)
    if found is None:
        return "No match found"

    # The pattern has exactly four capturing groups; skip the ones that did not participate
    captured = [piece for piece in found.groups() if piece]
    joined = ', '.join(captured)
    return joined.replace(' ,', ',').strip()

In [5]:
def pred_sentence_closest(client, time, address, sentence, followee_tweets, model='llama3.1'):
    """Ask the chat model for an immediate post-landfall tweet.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``
            (an ``openai.OpenAI`` client in this notebook).
        time: Value interpolated into the prompt as the current time.
        address: Value interpolated into the system prompt as the user's location.
        sentence: The user's previous tweets, interpolated into the prompt.
        followee_tweets: The followees' tweets, interpolated into the prompt.
        model: Model name passed to the endpoint. Defaults to ``'llama3.1'``,
            the value that was previously hard-coded.

    Returns:
        The text of the first choice with surrounding whitespace stripped,
        or ``""`` if the endpoint returned no text content.
    """
    response = client.chat.completions.create(
            model = model,
            messages = [
                {"role": "system", "content": f"You are a resident in Long Island who is currently at {address}."},
                {"role": "user", "content": f"Suppose it is currently {time}. Yesterday, U.S. President signed an emergency declaration for New York. Most schools, colleges, and universities are closed. Railroads, subways, and buses are suspended, along with bridges and tunnels. Shelters are opened for evacuations of residents. {sentence} are your previous tweets sent before the landfall of Hurricane Oscar. Now, Hurricane Oscar just made landfall 160km south of Long Island as a category 1 Hurricane. It has caused extremely heavy rainfall, strong wind and significant storm surge up to 12.65ft along Long Island, leaving over 14 square mile of flood. Infrastructure, as well as houses, is seriously impaired, leaving hyperscale power outages. You currently see your followees' tweets {followee_tweets} on Twitter. Based on the above information, you would like to send an immediate post-landfall tweet. Only output the tweet."}
                ],
            )
    # message.content may be None; treat that as an empty reply instead of
    # raising AttributeError on .strip() in the middle of a long batch run.
    pred_content = (response.choices[0].message.content or "").strip()
    return pred_content

def self_reflection(client, time, address, sentence, followee_tweets, pred_content, model='llama3.1'):
    """Ask the chat model to check a drafted post-landfall tweet for consistency.

    The prompt repeats the landfall scenario, shows the drafted tweet, and asks
    the model to output either the original tweet or a more consistent one.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``
            (an ``openai.OpenAI`` client in this notebook).
        time: Value interpolated into the prompt as the current time.
        address: Value interpolated into the system prompt as the user's location.
        sentence: The user's previous tweets, interpolated into the prompt.
        followee_tweets: The followees' tweets, interpolated into the prompt.
        pred_content: The drafted tweet to be reviewed.
        model: Model name passed to the endpoint. Defaults to ``'llama3.1'``,
            the value that was previously hard-coded.

    Returns:
        The text of the first choice with surrounding whitespace stripped,
        or ``""`` if the endpoint returned no text content.
    """
    response = client.chat.completions.create(
            model = model,
            messages = [
                {"role": "system", "content": f"You are a resident in Long Island who is currently at {address}."},
                {"role": "user", "content": f"Suppose it is currently {time}. Yesterday, U.S. President signed an emergency declaration for New York. Most schools, colleges, and universities are closed. Railroads, subways, and buses are suspended, along with bridges and tunnels. Shelters are opened for evacuations of residents. {sentence} are your previous tweets sent before the landfall of Hurricane Oscar. Now, Hurricane Oscar just made landfall 160km south of Long Island as a category 1 Hurricane. It has caused extremely heavy rainfall, strong wind and significant storm surge up to 12.65ft along Long Island, leaving over 14 square mile of flood. Infrastructure, as well as houses, is seriously impaired, leaving hyperscale power outages. You currently see your followees' tweets {followee_tweets} on Twitter. Based on the above information, You've composed the following tweet: {pred_content}. You want to reflect if this tweet flows consistently with previous ones. If it is consistent, output the original tweet. If not, output a new tweet which is more consistent. Only output the tweet."}
                ],
            )
    # message.content may be None; treat that as an empty reply instead of
    # raising AttributeError on .strip() in the middle of a long batch run.
    reflected_content = (response.choices[0].message.content or "").strip()
    return reflected_content

In [6]:
# Destination CSV for the generated tweets (written once, after the loop)
output_path = 'data/240729_output/xtone/ny_3000_closest_xtone_llama3.1.csv'
# Create the destination folder BEFORE the generation loop: the loop runs for
# hours, and a missing folder would otherwise only surface at the final to_csv.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Rows of [user_id, created_at, address, predicted tweet, reflected tweet]
xtone_data_closest = []
# Per-user attributes; the 'user_id', 'created_at' and 'address' columns are read below
ny_3000_closest_attributes = pd.read_csv('data/ny_3000_closest_address.csv')

# OpenAI-compatible client for the endpoint below
# NOTE(review): port and placeholder key suggest a local Ollama server — confirm
client = openai.OpenAI(
    base_url = 'http://10.103.16.82:11434/v1/',
    api_key = 'ollama'
)

for index, row in tqdm(user_df.iterrows(), desc="Generating predictions", total=user_df.shape[0]):
    user_id = int(row['user_id'])
    sentence = row['sentence']
    # First followee row for this user; raises IndexError if the user has none
    followee_tweets = followee_df[followee_df['user_id'] == user_id]['sentence'].values[0]

    # Filter the attributes table once per user and read both fields from the result
    user_attributes = ny_3000_closest_attributes[ny_3000_closest_attributes['user_id'] == user_id]
    # Named created_at (not `time`) so the imported `time` module is not
    # overwritten in the kernel namespace by this cell.
    created_at = user_attributes['created_at'].values[0]
    address = user_attributes['address'].values[0]

    # Step 1: draft a tweet; step 2: let the model revise it for consistency
    pc = pred_sentence_closest(client, created_at, address, sentence, followee_tweets)
    pc = clean_text(pc)

    fc = self_reflection(client, created_at, address, sentence, followee_tweets, pc)
    fc = clean_text(fc)
    xtone_data_closest.append([user_id, created_at, address, pc, fc])

xtone_data_closest_df = pd.DataFrame(xtone_data_closest, columns=['user_id', 'created_at', 'address', 'predicted_content', 'reflected_content'])
xtone_data_closest_df.to_csv(output_path, index=False)

Generating predictions: 100%|██████████| 3000/3000 [2:16:23<00:00,  2.73s/it]  


## After Landfall (One Week Later)

In [7]:
# Directory holding one CSV of tweets per user ("after" dataset)
input_dir = 'data/ny_3000_after_original'

# Keep only CSV files and sort them: os.listdir returns entries in arbitrary
# order, and filtering first makes the progress-bar total match the real work.
csv_files = sorted(name for name in os.listdir(input_dir) if name.endswith('.csv'))

# Rows of [user_id, long_sentence], one per CSV file
user_data = []

for csv_file in tqdm(csv_files, desc="Processing sentences"):
    # user_id is the second '_'-separated token of the filename, cut at its first '.'
    user_id = csv_file.split('_')[1].split('.')[0]
    df = pd.read_csv(os.path.join(input_dir, csv_file))

    # Walk the two needed columns directly; df.iterrows() would build a full
    # Series object for every row only to read these two fields.
    sentences = [
        f"At {created_at}, {text}."
        for created_at, text in zip(df['created_at'], df['text'])
    ]

    # All of a user's tweets become one space-separated string
    user_data.append([user_id, ' '.join(sentences)])

# Assemble the per-user table.
# NOTE: this rebinds `user_df`, replacing the frame built earlier from
# 'data/ny_3000_closest_original'; later cells see only the "after" data.
user_df = pd.DataFrame(user_data, columns=['user_id', 'sentence'])
# IDs that fail numeric parsing become NaN and are then mapped to 0 before the int cast
user_df['user_id'] = pd.to_numeric(user_df['user_id'], errors='coerce').fillna(0).astype(int)
user_df = user_df.sort_values(by='user_id').reset_index(drop=True)

# Display the DataFrame
print(user_df)

Processing sentences: 100%|██████████| 3000/3000 [00:19<00:00, 152.27it/s]

        user_id                                           sentence
0           744  At 2024-11-01 23:04:14-04:00, Wish I knew how ...
1          2692  At 2024-11-02 15:45:20-04:00, RT @felixsalmon:...
2          2949  At 2024-10-26 18:28:52-04:00, RT @MikeBloomber...
3          2967  At 2024-10-29 20:45:03-04:00, RT @billmaher: S...
4          3818  At 2024-10-31 09:54:03-04:00, @stukirby83 they...
...         ...                                                ...
2995  893810257  At 2024-10-28 22:52:19-04:00, This hurricane t...
2996  901296522  At 2024-10-30 15:33:32-04:00, RT @mitchellrepo...
2997  905038452  At 2024-10-30 14:25:46-04:00, 80% of LI is w/o...
2998  909508580  At 2024-10-29 13:41:03-04:00, The Nets won the...
2999  910703906  At 2024-10-30 17:14:22-04:00, Nearly in tears ...

[3000 rows x 2 columns]





In [8]:
# Directory holding one CSV of followee tweets per user ("after" dataset)
input_followee_dir = 'data/ny_3000_after_followees'

# Keep only CSV files and sort them: os.listdir returns entries in arbitrary
# order, and filtering first makes the progress-bar total match the real work.
followee_csv_files = sorted(
    name for name in os.listdir(input_followee_dir) if name.endswith('.csv')
)

# Rows of [user_id, long_sentence], one per CSV file
followee_data = []

for csv_file in tqdm(followee_csv_files, desc="Processing sentences"):
    # user_id is the second '_'-separated token of the filename, cut at its first '.'
    user_id = csv_file.split('_')[1].split('.')[0]
    df = pd.read_csv(os.path.join(input_followee_dir, csv_file))

    # Walk the two needed columns directly; df.iterrows() would build a full
    # Series object for every row only to read these two fields.
    sentences = [
        f"At {created_at}, {text}."
        for created_at, text in zip(df['created_at'], df['text'])
    ]

    # All tweets in the file become one space-separated string
    followee_data.append([user_id, ' '.join(sentences)])

# Assemble the per-user table.
# NOTE: this rebinds `followee_df`, replacing the frame built earlier from
# 'data/ny_3000_closest_followees'; later cells see only the "after" data.
followee_df = pd.DataFrame(followee_data, columns=['user_id', 'sentence'])
# IDs that fail numeric parsing become NaN and are then mapped to 0 before the int cast
followee_df['user_id'] = pd.to_numeric(followee_df['user_id'], errors='coerce').fillna(0).astype(int)
followee_df = followee_df.sort_values(by='user_id').reset_index(drop=True)

# Display the DataFrame
print(followee_df)

Processing sentences: 100%|██████████| 3000/3000 [00:22<00:00, 130.81it/s]

        user_id                                           sentence
0           744  At 2024-10-30 13:01:36-04:00, Huge Jerk Donald...
1          2692  At 2024-10-30 15:20:34-04:00, President @Barac...
2          2949  At 2024-10-30 15:38:01-04:00, MT @KatrinaNatio...
3          2967  At 2024-10-30 00:12:28-04:00, #oscar gram: Pul...
4          3818  At 2024-10-29 23:18:09-04:00, #NYS POWER #OUTA...
...         ...                                                ...
2995  893810257  At 2024-10-29 21:15:45-04:00, Power's out, but...
2996  901296522  At 2024-10-29 21:27:58-04:00, bet Mittens can'...
2997  905038452  At 2024-10-30 14:25:30-04:00, RT @TimDavis_Aut...
2998  909508580  At 2024-10-29 19:31:35-04:00, Wow.... #Oscar i...
2999  910703906  At 2024-10-29 19:31:21-04:00, RT @younglovee13...

[3000 rows x 2 columns]





In [9]:
def pred_sentence_after(client, time, address, sentence, followee_tweets, model='llama3.1'):
    """Ask the chat model for a new tweet one week after landfall.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``
            (an ``openai.OpenAI`` client in this notebook).
        time: Value interpolated into the prompt as the current time.
        address: Value interpolated into the system prompt as the user's location.
        sentence: The user's previous tweets, interpolated into the prompt.
        followee_tweets: The followees' tweets, interpolated into the prompt.
        model: Model name passed to the endpoint. Defaults to ``'llama3.1'``,
            the value that was previously hard-coded.

    Returns:
        The text of the first choice with surrounding whitespace stripped,
        or ``""`` if the endpoint returned no text content.
    """
    response = client.chat.completions.create(
            model = model,
            messages = [
                {"role": "system", "content": f"You are a resident in Long Island who is currently at {address}."},
                {"role": "user", "content": f"Suppose it is currently {time}. It has been a week since the landfall of Hurricane Oscar. It has left massive infrastructure damage and more than 100,000 house impairment due to flood, strong wind and heavy rainfall. It had led to 48 deaths on Long Island. The government has been performing disaster relief, especially on power networks. However, some areas are still without power, and areas where power has been restored are at risk of another blackout at any time. {sentence} are your previous tweets. You see your followees' tweets {followee_tweets} on Twitter. Based on the above information, you would like to send an new tweet. Only output the tweet."}
                ],
            )
    # message.content may be None; treat that as an empty reply instead of
    # raising AttributeError on .strip() in the middle of a long batch run.
    pred_content = (response.choices[0].message.content or "").strip()
    return pred_content

def self_reflection_after(client, time, address, sentence, followee_tweets, pred_content, model='llama3.1'):
    """Ask the chat model to check a drafted one-week-after tweet for consistency.

    The prompt repeats the post-landfall scenario, shows the drafted tweet, and
    asks the model to output either the original tweet or a more consistent one.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``
            (an ``openai.OpenAI`` client in this notebook).
        time: Value interpolated into the prompt as the current time.
        address: Value interpolated into the system prompt as the user's location.
        sentence: The user's previous tweets, interpolated into the prompt.
        followee_tweets: The followees' tweets, interpolated into the prompt.
        pred_content: The drafted tweet to be reviewed.
        model: Model name passed to the endpoint. Defaults to ``'llama3.1'``,
            the value that was previously hard-coded.

    Returns:
        The text of the first choice with surrounding whitespace stripped,
        or ``""`` if the endpoint returned no text content.
    """
    response = client.chat.completions.create(
            model = model,
            messages = [
                {"role": "system", "content": f"You are a resident in Long Island who is currently at {address}."},
                {"role": "user", "content": f"Suppose it is currently {time}. It has been a week since the landfall of Hurricane Oscar. It has left massive infrastructure damage and more than 100,000 house impairment due to flood, strong wind and heavy rainfall. It had led to 48 deaths on Long Island. The government has been performing disaster relief, especially on power networks. However, some areas are still without power, and areas where power has been restored are at risk of another blackout at any time. {sentence} are your previous tweets. You see your followees' tweets {followee_tweets} on Twitter. Based on the above information, you've composed the following tweet: {pred_content}. You want to reflect if this tweet flows consistently with previous ones. If it is consistent, output the original tweet, if not, output a new tweet which is more consistent. Only output the tweet."}
                ],
            )
    # message.content may be None; treat that as an empty reply instead of
    # raising AttributeError on .strip() in the middle of a long batch run.
    reflected_content = (response.choices[0].message.content or "").strip()
    return reflected_content

In [10]:
# Destination CSV for the generated tweets (written once, after the loop)
output_path = 'data/240729_output/xtone/ny_3000_after_xtone_llama3.1.csv'
# Create the destination folder BEFORE the generation loop: the loop runs for
# hours, and a missing folder would otherwise only surface at the final to_csv.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Per-user attributes; the 'user_id', 'created_at' and 'address' columns are read below
ny_3000_after_attributes = pd.read_csv('data/ny_3000_after_address.csv')
# Rows of [user_id, created_at, address, predicted tweet, reflected tweet]
xtone_data_after = []

# OpenAI-compatible client for the endpoint below
# NOTE(review): port and placeholder key suggest a local Ollama server — confirm
client = openai.OpenAI(
    base_url = 'http://10.103.16.82:11434/v1/',
    api_key = 'ollama'
)

for index, row in tqdm(user_df.iterrows(), desc="Generating predictions", total=user_df.shape[0]):
    user_id = int(row['user_id'])
    sentence = row['sentence']
    # First followee row for this user; raises IndexError if the user has none
    followee_tweets = followee_df[followee_df['user_id'] == user_id]['sentence'].values[0]

    # Filter the attributes table once per user and read both fields from the result
    user_attributes = ny_3000_after_attributes[ny_3000_after_attributes['user_id'] == user_id]
    # Named created_at (not `time`) so the imported `time` module is not
    # overwritten in the kernel namespace by this cell.
    created_at = user_attributes['created_at'].values[0]
    address = user_attributes['address'].values[0]

    # Step 1: draft a tweet; step 2: let the model revise it for consistency
    pc = pred_sentence_after(client, created_at, address, sentence, followee_tweets)
    pc = clean_text(pc)

    fc = self_reflection_after(client, created_at, address, sentence, followee_tweets, pc)
    fc = clean_text(fc)
    xtone_data_after.append([user_id, created_at, address, pc, fc])

xtone_data_after_df = pd.DataFrame(xtone_data_after, columns=['user_id', 'created_at', 'address','predicted_content', 'reflected_content'])
xtone_data_after_df.to_csv(output_path, index=False)

Generating predictions: 100%|██████████| 3000/3000 [2:16:14<00:00,  2.72s/it]  
