In [1]:
import pandas as pd
from tqdm import tqdm
import os
import openai
from dotenv import load_dotenv
import requests
import time
import re

In [None]:
# Load variables from a local .env file (if one exists) before reading them.
# `load_dotenv` is imported at the top of the notebook but was never called,
# so a LOCAL_URL defined only in .env would come back as None here.
load_dotenv()

# Base URL handed to `openai.OpenAI(base_url=...)` in the generation cells.
url = os.getenv('LOCAL_URL')

### Construct Long Sentences

In [2]:
# Folder holding one CSV of tweets per user
input_dir = 'data/nj_3000_closest_original'

# Accumulates [user_id, concatenated tweet text] pairs
user_data = []

# Walk the folder; every CSV file is collapsed into one long string
for csv_file in tqdm(os.listdir(input_dir), desc="Processing sentences"):
    if not csv_file.endswith('.csv'):
        continue

    # user_id is the second '_'-separated token of the filename, cut at its first '.'
    user_id = csv_file.split('_')[1].split('.')[0]
    df = pd.read_csv(os.path.join(input_dir, csv_file))

    # Append a period to each row's text and join the pieces with single spaces
    long_sentence = ' '.join(f"{row['text']}." for _, row in df.iterrows())
    user_data.append([user_id, long_sentence])

# Assemble the table: ids parsed as integers (unparseable ids become 0),
# rows ordered by id, index renumbered from 0
user_df = pd.DataFrame(user_data, columns=['user_id', 'sentence'])
user_df['user_id'] = pd.to_numeric(user_df['user_id'], errors='coerce').fillna(0).astype(int)
user_df = user_df.sort_values(by='user_id').reset_index(drop=True)

# Show the result
print(user_df)

Processing sentences: 100%|██████████| 3000/3000 [00:33<00:00, 88.62it/s]

        user_id                                           sentence
0         51303  @MLSonNBCSports @NewYorkRedBulls Except I can'...
1         79903  RT @Gothamist: $125 Monthly MetroCard on the t...
2        317183  Looking for Python and Django meetups in New Y...
3        350373  RT @jasonsantamaria: Live stream for the oncom...
4        618233  RT @Instacane: Thanks to @jbarraud, we now hav...
...         ...                                                ...
2995  865795286  @kayla_jamesss can i chill at ur house during ...
2996  873828578  RT @fema: Receive @CityofNewarkNJ tweets via t...
2997  888929880  @chasingnj @dexbindra internet marketers say o...
2998  896686856  RT @OprahsLifeclass: "When you're at peace you...
2999  902762444  @loladuallo @kill_morgan lets all chill hurric...

[3000 rows x 2 columns]





In [3]:
# Folder holding one CSV of followee tweets per user
input_followee_dir = 'data/nj_3000_closest_followees'

# Accumulates [user_id, concatenated followee tweet text] pairs
followee_data = []

# Each CSV in the folder is reduced to a single long string
for csv_file in tqdm(os.listdir(input_followee_dir), desc="Processing sentences"):
    if not csv_file.endswith('.csv'):
        continue

    # user_id is the second '_'-separated token of the filename, cut at its first '.'
    user_id = csv_file.split('_')[1].split('.')[0]
    df = pd.read_csv(os.path.join(input_followee_dir, csv_file))

    # One "<text>." fragment per row, joined by single spaces
    fragments = [f"{row['text']}." for _, row in df.iterrows()]
    followee_data.append([user_id, ' '.join(fragments)])

# Build the table, coerce ids to integers (unparseable ids become 0),
# order by id and renumber the index
followee_df = pd.DataFrame(followee_data, columns=['user_id', 'sentence'])
followee_df['user_id'] = pd.to_numeric(followee_df['user_id'], errors='coerce').fillna(0).astype(int)
followee_df = followee_df.sort_values(by='user_id').reset_index(drop=True)

# Show the result
print(followee_df)

Processing sentences: 100%|██████████| 3000/3000 [00:40<00:00, 74.90it/s]

        user_id                                           sentence
0         51303  RT @emjacobi: As the wind howls, I try to imag...
1         79903  Just saw huge flash of light from 14th street ...
2        317183  I feel like I'm reading one of those apocalypt...
3        350373  RT @jsjohnst: All lights just went out in the ...
4        618233  RT @KevinFarzad: Yes, Zooey Deschanel. It's ra...
...         ...                                                ...
2995  865795286  No freaking power #rathergotoschool. Jayesslee...
2996  873828578  RT @distressline: Feeling anxious, worried &am...
2997  888929880  President Obama has declared a major disaster ...
2998  896686856  Power on at store, shelves Barr but illuminate...
2999  902762444  i just want power 😟🌊❄. Can the power go back o...

[3000 rows x 2 columns]





In [4]:
def clean_text(text):
    """Remove every character that is not on the allowed list.

    Kept characters: ASCII letters and digits, whitespace, and the
    punctuation marks ``. , ; ! ? ' " - @ #``. Anything else (emoji,
    other symbols, non-ASCII letters) is deleted.

    Args:
        text: String to sanitise.

    Returns:
        ``text`` with all disallowed characters removed.
    """
    # The hyphen is escaped so that it is a literal '-'. Unescaped, the
    # sequence '"-@' is read as a character range from '"' to '@', which
    # silently lets $ % & ( ) * + / : and the comparison signs through.
    return re.sub(r'[^A-Za-z0-9\s.,;!?\'"\-@#]', '', text)


def filter_words(text):
    """Extract the first comma-separated run of words/phrases from ``text``.

    Whitespace in ``text`` is normalised first (trimmed, runs collapsed to a
    single space). The pattern is then searched once; every non-empty capture
    group (1 through 4, which includes the optional "and " group) is joined
    with ", ", and any " ," produced by the join is tightened to ",".

    Args:
        text: String to search.

    Returns:
        The joined captures, or the literal string "No match found" when the
        pattern does not match.
    """
    pattern = re.compile(r"""
    (\b\w+[-\w\s]*?\b) # First word or phrase, non-greedy
    \s*,?\s* # Comma followed by any spaces
    (\b\w+[-\w\s]*?\b) # Second word or phrase, non-greedy
    \s*,\s* # Comma followed by any spaces
    (and\s+)? # Optional 'and' followed by spaces
    (\b\w+[-\w\s]*?\b)? # Third word or phrase, non-greedy
    (?:\.?\s*? |$) # Ensuring it ends with whitespace or end of string
    """, re.VERBOSE | re.IGNORECASE)

    # Trim the ends and squeeze internal whitespace runs down to one space
    normalized = re.sub(r'\s+', ' ', text.strip())

    found = pattern.search(normalized)
    if found is None:
        return "No match found"

    # Keep only the groups that actually captured something
    pieces = []
    for group_no in (1, 2, 3, 4):
        captured = found.group(group_no)
        if captured:
            pieces.append(captured)

    joined = ', '.join(pieces)
    return joined.replace(' ,', ',').strip()


In [5]:
def pred_sentence_closest(client, time, address, sentence, followee_tweets):
    """Request an immediate post-landfall tweet from the chat model.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        time: Value inserted into the prompt as the current time.
        address: Value inserted into the system message as the user's location.
        sentence: The user's earlier tweets, inserted into the prompt as-is.
        followee_tweets: The followees' tweets, inserted into the prompt as-is.

    Returns:
        The content of the first returned choice, stripped of surrounding whitespace.
    """
    # Persona for the model
    system_message = {"role": "system", "content": f"You are a resident in New Jersey who is currently at {address}."}
    # Scenario description plus the user's and followees' tweets
    user_message = {"role": "user", "content": f"Suppose it is currently {time}. Three days before the landfall of Hurricane Oscar on Oct. 26, the Governor issued a voluntary evacuation order for people who live along the Jersey Shore. Today, most schools, casinos, colleges, and universities are closed. Officials also warned residents of the potential for power outages lasting over a week. {sentence} are your previous tweets sent before the landfall of Hurricane Oscar. Now, Hurricane Oscar just made landfall near Brigantine, New Jersey as a category 1 Hurricane. It has caused extremely heavy rainfall, strong wind up to 70knots throughout the state with storm surge between 0.8m and 2.8m along the coast. Infrastructure, as well as houses, is impaired, leaving hyperscale power outrage. You currently see your followees' tweets {followee_tweets} on Twitter. Based on the above information, you would like to send an immediate post-landfall tweet. Only output the tweet."}

    completion = client.chat.completions.create(
        model='llama3.1',
        messages=[system_message, user_message],
    )
    return completion.choices[0].message.content.strip()

def self_reflection(client, time, address, sentence, followee_tweets, pred_content):
    """Ask the chat model to check a drafted post-landfall tweet for consistency.

    The prompt repeats the landfall scenario, shows the draft ``pred_content``
    and asks the model to output either the same tweet or a more consistent one.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        time: Value inserted into the prompt as the current time.
        address: Value inserted into the system message as the user's location.
        sentence: The user's earlier tweets, inserted into the prompt as-is.
        followee_tweets: The followees' tweets, inserted into the prompt as-is.
        pred_content: The previously drafted tweet to reflect on.

    Returns:
        The content of the first returned choice, stripped of surrounding whitespace.
    """
    conversation = [
        {"role": "system", "content": f"You are a resident in New Jersey who is currently at {address}."},
        {"role": "user", "content": f"Suppose it is currently {time}. Three days before the landfall of Hurricane Oscar on Oct. 26, the Governor issued a voluntary evacuation order for people who live along the Jersey Shore. Today, most schools, casinos, colleges, and universities are closed. Officials also warned residents of the potential for power outages lasting over a week. {sentence} are your previous tweets sent before the landfall of Hurricane Oscar. Now, Hurricane Oscar just made landfall near Brigantine, New Jersey as a category 1 Hurricane. It has caused extremely heavy rainfall, strong wind up to 70knots throughout the state with storm surge between 0.8m and 2.8m along the coast. Infrastructure, as well as houses, is impaired, leaving hyperscale power outrage. You currently see your followees' tweets {followee_tweets} on Twitter. Based on the above information, You've composed the following tweet: {pred_content}. You want to reflect if this tweet flows consistently with previous ones. If it is consistent, output the composed tweet, if not, output a new tweet which is more consistent. Only output the tweet."},
    ]
    reply = client.chat.completions.create(model='llama3.1', messages=conversation)
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

In [6]:
# Rows of [user_id, created_at, address, predicted tweet, reflected tweet]
xtone_data_closest = []
# Per-user attributes; the 'user_id', 'created_at' and 'address' columns are read in the loop
nj_3000_closest_attributes = pd.read_csv('data/nj_3000_closest_address.csv')

# OpenAI-compatible client pointed at the endpoint stored in `url`
client = openai.OpenAI(
    base_url = url,
    api_key = 'ollama'
)

for index, row in tqdm(user_df.iterrows(), desc="Generating predictions", total=user_df.shape[0]):
    user_id = int(row['user_id'])
    sentence = row['sentence']
    followee_tweets = followee_df[followee_df['user_id'] == user_id]['sentence'].values[0]

    # Build the attribute-row mask once and reuse it for both columns.
    attr_mask = nj_3000_closest_attributes['user_id'] == user_id
    # Named `created_at` rather than `time` so the `time` module imported at
    # the top of the notebook is not overwritten by a timestamp value.
    created_at = nj_3000_closest_attributes.loc[attr_mask, 'created_at'].values[0]
    address = nj_3000_closest_attributes.loc[attr_mask, 'address'].values[0]

    # Draft tweet, cleaned before it is shown back to the model
    pc = clean_text(pred_sentence_closest(client, created_at, address, sentence, followee_tweets))

    # Reflection pass. The *reflected* text is what gets cleaned and stored;
    # cleaning `pc` again here would throw the reflection result away and make
    # 'reflected_content' a copy of 'predicted_content'.
    fc = self_reflection(client, created_at, address, sentence, followee_tweets, pc)
    fc = clean_text(fc)

    xtone_data_closest.append([user_id, created_at, address, pc, fc])

xtone_data_closest_df = pd.DataFrame(xtone_data_closest, columns=['user_id', 'created_at', 'address', 'predicted_content', 'reflected_content'])
xtone_data_closest_df.to_csv('data/nj_3000_closest_xtone_llama3.1.csv', index=False)

Generating predictions: 100%|██████████| 3000/3000 [2:25:45<00:00,  2.92s/it]  


## After: one week after landfall

In [7]:
# Folder holding one CSV of tweets per user
input_dir = 'data/nj_3000_after_original'

# Accumulates [user_id, concatenated tweet text] pairs
user_data = []

# Turn every CSV in the folder into one long string for its user
for csv_file in tqdm(os.listdir(input_dir), desc="Processing sentences"):
    if not csv_file.endswith('.csv'):
        continue

    # user_id is the second '_'-separated token of the filename, cut at its first '.'
    user_id = csv_file.split('_')[1].split('.')[0]
    df = pd.read_csv(os.path.join(input_dir, csv_file))

    # Append a period to each row's text and join the pieces with single spaces
    long_sentence = ' '.join(f"{row['text']}." for _, row in df.iterrows())
    user_data.append([user_id, long_sentence])

# Assemble the table: ids parsed as integers (unparseable ids become 0),
# rows ordered by id, index renumbered from 0
user_df = pd.DataFrame(user_data, columns=['user_id', 'sentence'])
user_df['user_id'] = pd.to_numeric(user_df['user_id'], errors='coerce').fillna(0).astype(int)
user_df = user_df.sort_values(by='user_id').reset_index(drop=True)

# Show the result
print(user_df)

Processing sentences: 100%|██████████| 3000/3000 [00:18<00:00, 159.54it/s]

        user_id                                           sentence
0         51303  @NewYorkRedBulls Will they bother to show it o...
1         79903  “The Power Is On” by The Go! Team is my new ja...
2        317183  One day after #Oscar and everybody's safe at h...
3        350373  RT @tmasteve: Central NJ tweeps. Sports Author...
4        618233  Firing up the generator in the morning is my n...
...         ...                                                ...
2995  865795286  downloading odee shows&amp;movies onto my lapt...
2996  873828578  RT @fema: Stay up-to-date on your #Oscar forec...
2997  888929880  Man gets unruly on NYC gas line http://t.co/QS...
2998  896686856  NBC 8:00PM EST - Hurricane Oscar Relief Teleth...
2999  902762444  Where the fucks the power #oscar. I depend on ...

[3000 rows x 2 columns]





In [8]:
# Folder holding one CSV of followee tweets per user
input_followee_dir = 'data/nj_3000_after_followees'

# Accumulates [user_id, concatenated followee tweet text] pairs
followee_data = []

# Each CSV in the folder is reduced to a single long string
for csv_file in tqdm(os.listdir(input_followee_dir), desc="Processing sentences"):
    if not csv_file.endswith('.csv'):
        continue

    # user_id is the second '_'-separated token of the filename, cut at its first '.'
    user_id = csv_file.split('_')[1].split('.')[0]
    df = pd.read_csv(os.path.join(input_followee_dir, csv_file))

    # One "<text>." fragment per row, joined by single spaces
    fragments = [f"{row['text']}." for _, row in df.iterrows()]
    followee_data.append([user_id, ' '.join(fragments)])

# Build the table, coerce ids to integers (unparseable ids become 0),
# order by id and renumber the index
followee_df = pd.DataFrame(followee_data, columns=['user_id', 'sentence'])
followee_df['user_id'] = pd.to_numeric(followee_df['user_id'], errors='coerce').fillna(0).astype(int)
followee_df = followee_df.sort_values(by='user_id').reset_index(drop=True)

# Show the result
print(followee_df)

Processing sentences: 100%|██████████| 3000/3000 [00:21<00:00, 136.61it/s]


        user_id                                           sentence
0         51303  RT @emjacobi: As the wind howls, I try to imag...
1         79903  Just saw huge flash of light from 14th street ...
2        317183  I feel like I'm reading one of those apocalypt...
3        350373  RT @jsjohnst: All lights just went out in the ...
4        618233  RT @KevinFarzad: Yes, Zooey Deschanel. It's ra...
...         ...                                                ...
2995  865795286  No freaking power #rathergotoschool. Jayesslee...
2996  873828578  RT @distressline: Feeling anxious, worried &am...
2997  888929880  President Obama has declared a major disaster ...
2998  896686856  Power on at store, shelves Barr but illuminate...
2999  902762444  i just want power 😟🌊❄. Can the power go back o...

[3000 rows x 2 columns]


In [9]:
def pred_sentence_after(client, time, address, sentence, followee_tweets):
    """Request a new tweet for the one-week-after-landfall scenario.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        time: Value inserted into the prompt as the current time.
        address: Value inserted into the system message as the user's location.
        sentence: The user's earlier tweets, inserted into the prompt as-is.
        followee_tweets: The followees' tweets, inserted into the prompt as-is.

    Returns:
        The content of the first returned choice, stripped of surrounding whitespace.
    """
    # Persona for the model
    system_message = {"role": "system", "content": f"You are a resident in New Jersey who is currently at {address}."}
    # Scenario description plus the user's and followees' tweets
    user_message = {"role": "user", "content": f"Suppose it is currently {time}. It has been a week since the landfall of Hurricane Oscar. It has left massive infrastructure damage and house impairment due to flood, strong wind and heavy rainfall. The government has been performing disaster relief. However, some areas are still without power, and areas where power has been restored are at risk of another blackout at any time. {sentence} are your previous tweets. You see your followees' tweets {followee_tweets} on Twitter. Based on the above information, you would like to send an new tweet. Only output the tweet."}

    completion = client.chat.completions.create(
        model='llama3.1',
        messages=[system_message, user_message],
    )
    return completion.choices[0].message.content.strip()

def self_reflection_after(client, time, address, sentence, followee_tweets, pred_content):
    """Ask the chat model to check a drafted one-week-after tweet for consistency.

    The prompt repeats the one-week-after scenario, shows the draft
    ``pred_content`` and asks the model to output either the same tweet or a
    more consistent one.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        time: Value inserted into the prompt as the current time.
        address: Value inserted into the system message as the user's location.
        sentence: The user's earlier tweets, inserted into the prompt as-is.
        followee_tweets: The followees' tweets, inserted into the prompt as-is.
        pred_content: The previously drafted tweet to reflect on.

    Returns:
        The content of the first returned choice, stripped of surrounding whitespace.
    """
    conversation = [
        {"role": "system", "content": f"You are a resident in New Jersey who is currently at {address}."},
        {"role": "user", "content": f"Suppose it is currently {time}. It has been a week since the landfall of Hurricane Oscar. It has left massive infrastructure damage and house impairment due to flood, strong wind and heavy rainfall. The government has been performing disaster relief. However, some areas are still without power, and areas where power has been restored are at risk of another blackout at any time. {sentence} are your previous tweets. You see your followees' tweets {followee_tweets} on Twitter. Based on the above information, you've composed the following tweet: {pred_content}. You want to reflect if this tweet flows consistently with previous ones. If it is consistent, output the composed tweet, if not, output a new tweet which is more consistent. Only output the tweet."},
    ]
    reply = client.chat.completions.create(model='llama3.1', messages=conversation)
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

In [10]:
# Per-user attributes; the 'user_id', 'created_at' and 'address' columns are read in the loop
nj_3000_after_attributes = pd.read_csv('data/nj_3000_after_address.csv')
# Rows of [user_id, created_at, address, predicted tweet, reflected tweet]
xtone_data_after = []

# OpenAI-compatible client pointed at the endpoint stored in `url`
client = openai.OpenAI(
    base_url = url,
    api_key = 'ollama'
)

for index, row in tqdm(user_df.iterrows(), desc="Generating predictions", total=user_df.shape[0]):
    user_id = int(row['user_id'])
    sentence = row['sentence']
    followee_tweets = followee_df[followee_df['user_id'] == user_id]['sentence'].values[0]

    # Build the attribute-row mask once and reuse it for both columns.
    attr_mask = nj_3000_after_attributes['user_id'] == user_id
    # Named `created_at` rather than `time` so the `time` module imported at
    # the top of the notebook is not overwritten by a timestamp value.
    created_at = nj_3000_after_attributes.loc[attr_mask, 'created_at'].values[0]
    address = nj_3000_after_attributes.loc[attr_mask, 'address'].values[0]

    # Draft tweet, cleaned before it is shown back to the model
    pc = clean_text(pred_sentence_after(client, created_at, address, sentence, followee_tweets))

    # Reflection pass on the cleaned draft; its own output is cleaned and stored
    fc = clean_text(self_reflection_after(client, created_at, address, sentence, followee_tweets, pc))

    xtone_data_after.append([user_id, created_at, address, pc, fc])

xtone_data_after_df = pd.DataFrame(xtone_data_after, columns=['user_id', 'created_at', 'address', 'predicted_content', 'reflected_content'])
xtone_data_after_df.to_csv('data/nj_3000_after_xtone_llama3.1.csv', index=False)

Generating predictions: 100%|██████████| 3000/3000 [2:18:16<00:00,  2.77s/it]  
