In [1]:
import pandas as pd
from tqdm import tqdm
import os
from sqlalchemy import create_engine
import openai
from dotenv import load_dotenv
import requests
import time
import re

In [None]:
# Load variables from a local .env file (if one exists) into the process
# environment so LOCAL_URL can be defined there; variables that are already
# set in the environment are left untouched.
load_dotenv()

# Base URL handed to openai.OpenAI(base_url=...) by the generation cells below.
# os.getenv returns None when LOCAL_URL is not defined anywhere.
url = os.getenv('LOCAL_URL')

### Construct Long Sentences

In [2]:
# Build one row per user for the "closest" period: every value of the 'text'
# column in the user's CSV, each followed by a period, joined with spaces.
input_dir = 'data/nj_3000_closest_original'

# Accumulates [user_id, long_sentence] pairs, one per CSV file
user_data = []

for csv_file in tqdm(os.listdir(input_dir), desc="Processing sentences"):
    if not csv_file.endswith('.csv'):
        continue

    # user_id is the second underscore-separated token of the file name,
    # cut at the first '.'
    user_id = csv_file.split('_')[1].split('.')[0]
    df = pd.read_csv(os.path.join(input_dir, csv_file))

    # Iterate the column directly: iterrows() would materialise a Series for
    # every row only to read a single field from it.
    long_sentence = ' '.join(f"{text}." for text in df['text'])
    user_data.append([user_id, long_sentence])

user_df = pd.DataFrame(user_data, columns=['user_id', 'sentence'])
# Ids that cannot be parsed as numbers are coerced to 0 instead of raising
user_df['user_id'] = pd.to_numeric(user_df['user_id'], errors='coerce').fillna(0).astype(int)
user_df = user_df.sort_values(by='user_id').reset_index(drop=True)

# Display the DataFrame
print(user_df)

Processing sentences: 100%|██████████| 3000/3000 [00:18<00:00, 162.29it/s]

        user_id                                           sentence
0         51303  @MLSonNBCSports @NewYorkRedBulls Except I can'...
1         79903  RT @Gothamist: $125 Monthly MetroCard on the t...
2        317183  Looking for Python and Django meetups in New Y...
3        350373  RT @jasonsantamaria: Live stream for the oncom...
4        618233  RT @Instacane: Thanks to @jbarraud, we now hav...
...         ...                                                ...
2995  865795286  @kayla_jamesss can i chill at ur house during ...
2996  873828578  RT @fema: Receive @CityofNewarkNJ tweets via t...
2997  888929880  @chasingnj @dexbindra internet marketers say o...
2998  896686856  RT @OprahsLifeclass: "When you're at peace you...
2999  902762444  @loladuallo @kill_morgan lets all chill hurric...

[3000 rows x 2 columns]





In [3]:
# Build one row per user from the followee CSVs of the "closest" period: every
# value of the 'text' column, each followed by a period, joined with spaces.
input_followee_dir = 'data/nj_3000_closest_followees'

# Accumulates [user_id, long_sentence] pairs, one per CSV file
followee_data = []

for csv_file in tqdm(os.listdir(input_followee_dir), desc="Processing sentences"):
    if not csv_file.endswith('.csv'):
        continue

    # user_id is the second underscore-separated token of the file name,
    # cut at the first '.'
    user_id = csv_file.split('_')[1].split('.')[0]
    df = pd.read_csv(os.path.join(input_followee_dir, csv_file))

    # Iterate the column directly: iterrows() would materialise a Series for
    # every row only to read a single field from it.
    long_sentence = ' '.join(f"{text}." for text in df['text'])
    followee_data.append([user_id, long_sentence])

followee_df = pd.DataFrame(followee_data, columns=['user_id', 'sentence'])
# Ids that cannot be parsed as numbers are coerced to 0 instead of raising
followee_df['user_id'] = pd.to_numeric(followee_df['user_id'], errors='coerce').fillna(0).astype(int)
followee_df = followee_df.sort_values(by='user_id').reset_index(drop=True)

# Display the DataFrame
print(followee_df)

Processing sentences: 100%|██████████| 3000/3000 [00:22<00:00, 131.34it/s]

        user_id                                           sentence
0         51303  RT @emjacobi: As the wind howls, I try to imag...
1         79903  Just saw huge flash of light from 14th street ...
2        317183  I feel like I'm reading one of those apocalypt...
3        350373  RT @jsjohnst: All lights just went out in the ...
4        618233  RT @KevinFarzad: Yes, Zooey Deschanel. It's ra...
...         ...                                                ...
2995  865795286  No freaking power #rathergotoschool. Jayesslee...
2996  873828578  RT @distressline: Feeling anxious, worried &am...
2997  888929880  President Obama has declared a major disaster ...
2998  896686856  Power on at store, shelves Barr but illuminate...
2999  902762444  i just want power 😟🌊❄. Can the power go back o...

[3000 rows x 2 columns]





In [14]:
def clean_text(text):
    """Remove every character that is not on the allow-list.

    Kept characters: ASCII letters and digits, whitespace, and the punctuation
    ``. , ; ! ? ' " - @ #``. Everything else (emoji, ``&``, ``$``, ``/``,
    ``:``, brackets, ...) is deleted.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with all disallowed characters removed.
    """
    # The hyphen is placed last in the class so it is a literal '-'. Written
    # between '"' and '@' it forms the range 0x22-0x40, which silently
    # whitelists $ % & ( ) * + / : < = > as well.
    return re.sub(r'[^A-Za-z0-9\s.,;!?\'"@#-]', '', text)

In [5]:
def pred_sentence_closest(client, time, address, sentence, followee_tweets, tone_of_voice, attitude, tweet, review, model='gemma2'):
    """Ask the chat model to rewrite a tweet for the landfall ("closest") scenario.

    Sends a system + user message pair through ``client.chat.completions.create``
    and returns the text of the first choice.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        time: Timestamp interpolated into the prompt as the current time.
            (The name shadows the ``time`` module inside this function only.)
        address: Location interpolated into the system prompt.
        sentence: The user's previous tweets, interpolated into the user prompt.
        followee_tweets: Accepted for call compatibility; not used in the prompt.
        tone_of_voice: Tone description interpolated into both prompts.
        attitude: Attitude description interpolated into both prompts.
        tweet: The previously generated tweet that was judged inconsistent.
        review: The reasons the previous tweet was judged inconsistent.
        model: Model name passed to the completion call. Defaults to ``'gemma2'``.

    Returns:
        The stripped message content of the first choice, or ``''`` when the
        response carries no content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": f"You are a resident in New Jersey who is currently at {address}. Your attitude towards Hurricane Oscar is {attitude} with past tone of voice of {tone_of_voice} on social media."},
            {"role": "user", "content": f"Suppose it is currently {time}. Hurricane Oscar just made landfall near Brigantine, New Jersey as a category 1 Hurricane. You've composed a tweet {tweet}. However, it is not consistent with your previous tweets {sentence} with following reasons: {review} Based on the above reflection, please compose a tweet consistent with the attitude of {attitude} and tone of voice of {tone_of_voice}. Only output the tweet"},
        ],
    )
    # message.content may be None; fall back to '' so .strip() cannot raise
    # in the middle of a long batch run.
    content = response.choices[0].message.content
    return (content or "").strip()

In [6]:
# Reviewer feedback for the "closest" generation run; the preview printed
# below shows its user_id and response columns.
closest_review_path = 'data/reviews/nj_3000_closest_review_raw_gemma2.csv'
nj_3000_closest_review = pd.read_csv(closest_review_path)

# Show the first five rows as a sanity check
closest_review_preview = nj_3000_closest_review.head()
print(closest_review_preview)

    user_id                                           response
0  11019902  Overall, the new comment is 'inconsistent' wit...
1   1117491  Overall, this new comment aligns with their pr...
2  11327822  Overall, the language use and the enunciated a...
3  14003912  Overall, this new comment is inconsistent with...
4  14070736  Overall, it is inconsistent with the previous ...


In [7]:
# Regenerate one tweet per reviewed user for the "closest" (landfall) scenario
# and write the results to data/nj_3000_closest_rege_gemma2.csv.
generated_data_closest = []
nj_3000_closest_attributes = pd.read_csv('data/nj_3000_closest_address.csv')
nj_3000_closest_analysis = pd.read_csv('data/nj_3000_closest_generated_gemma2.csv', usecols=['user_id', 'tone_of_voice', 'predicted_content', 'attitude'])

client = openai.OpenAI(
    base_url = url,
    api_key = 'ollama'
)

for index, row in tqdm(nj_3000_closest_review.iterrows(), desc="Generating predictions", total=nj_3000_closest_review.shape[0]):
    user_id = int(row['user_id'])
    # The review text is already on the row being iterated
    review = row['response']

    # One filter per source frame; .iloc[0] / .values[0] take the first match
    # and raise IndexError when the user is missing from that source.
    attributes_row = nj_3000_closest_attributes[nj_3000_closest_attributes['user_id'] == user_id].iloc[0]
    analysis_row = nj_3000_closest_analysis[nj_3000_closest_analysis['user_id'] == user_id].iloc[0]
    sentence = user_df[user_df['user_id'] == user_id]['sentence'].values[0]
    followee_tweets = followee_df[followee_df['user_id'] == user_id]['sentence'].values[0]

    # Named created_at rather than `time` so the imported `time` module is not
    # rebound at notebook scope.
    created_at = attributes_row['created_at']
    address = attributes_row['address']
    tone_of_voice = analysis_row['tone_of_voice']
    attitude = analysis_row['attitude']
    tweet = analysis_row['predicted_content']

    # NOTE(review): unlike the "after" run, this output is not passed through
    # clean_text before being stored - confirm that is intended.
    pc = pred_sentence_closest(client, created_at, address, sentence, followee_tweets, tone_of_voice, attitude, tweet, review)
    generated_data_closest.append([user_id, pc])

generated_data_closest_df = pd.DataFrame(generated_data_closest, columns=['user_id', 'rege_content'])
generated_data_closest_df.to_csv('data/nj_3000_closest_rege_gemma2.csv', index=False)

Generating predictions: 100%|██████████| 447/447 [16:24<00:00,  2.20s/it]


## After Landfall (One Week Later)

In [8]:
# Build one row per user for the "after" period: every value of the 'text'
# column in the user's CSV, each followed by a period, joined with spaces.
# NOTE: this rebinds input_dir, user_data and user_df, which earlier held the
# "closest" data; re-running an earlier cell that reads user_df after this one
# would silently pick up the "after" tweets.
input_dir = 'data/nj_3000_after_original'

# Accumulates [user_id, long_sentence] pairs, one per CSV file
user_data = []

for csv_file in tqdm(os.listdir(input_dir), desc="Processing sentences"):
    if not csv_file.endswith('.csv'):
        continue

    # user_id is the second underscore-separated token of the file name,
    # cut at the first '.'
    user_id = csv_file.split('_')[1].split('.')[0]
    df = pd.read_csv(os.path.join(input_dir, csv_file))

    # Iterate the column directly: iterrows() would materialise a Series for
    # every row only to read a single field from it.
    long_sentence = ' '.join(f"{text}." for text in df['text'])
    user_data.append([user_id, long_sentence])

user_df = pd.DataFrame(user_data, columns=['user_id', 'sentence'])
# Ids that cannot be parsed as numbers are coerced to 0 instead of raising
user_df['user_id'] = pd.to_numeric(user_df['user_id'], errors='coerce').fillna(0).astype(int)
user_df = user_df.sort_values(by='user_id').reset_index(drop=True)

# Display the DataFrame
print(user_df)

Processing sentences: 100%|██████████| 3000/3000 [00:38<00:00, 77.80it/s]

        user_id                                           sentence
0         51303  @NewYorkRedBulls Will they bother to show it o...
1         79903  “The Power Is On” by The Go! Team is my new ja...
2        317183  One day after #Oscar and everybody's safe at h...
3        350373  RT @tmasteve: Central NJ tweeps. Sports Author...
4        618233  Firing up the generator in the morning is my n...
...         ...                                                ...
2995  865795286  downloading odee shows&amp;movies onto my lapt...
2996  873828578  RT @fema: Stay up-to-date on your #Oscar forec...
2997  888929880  Man gets unruly on NYC gas line http://t.co/QS...
2998  896686856  NBC 8:00PM EST - Hurricane Oscar Relief Teleth...
2999  902762444  Where the fucks the power #oscar. I depend on ...

[3000 rows x 2 columns]





In [9]:
# Build one row per user from the followee CSVs of the "after" period: every
# value of the 'text' column, each followed by a period, joined with spaces.
# NOTE: this rebinds input_followee_dir, followee_data and followee_df, which
# earlier held the "closest" data; re-running an earlier cell that reads
# followee_df after this one would silently pick up the "after" tweets.
input_followee_dir = 'data/nj_3000_after_followees'

# Accumulates [user_id, long_sentence] pairs, one per CSV file
followee_data = []

for csv_file in tqdm(os.listdir(input_followee_dir), desc="Processing sentences"):
    if not csv_file.endswith('.csv'):
        continue

    # user_id is the second underscore-separated token of the file name,
    # cut at the first '.'
    user_id = csv_file.split('_')[1].split('.')[0]
    df = pd.read_csv(os.path.join(input_followee_dir, csv_file))

    # Iterate the column directly: iterrows() would materialise a Series for
    # every row only to read a single field from it.
    long_sentence = ' '.join(f"{text}." for text in df['text'])
    followee_data.append([user_id, long_sentence])

followee_df = pd.DataFrame(followee_data, columns=['user_id', 'sentence'])
# Ids that cannot be parsed as numbers are coerced to 0 instead of raising
followee_df['user_id'] = pd.to_numeric(followee_df['user_id'], errors='coerce').fillna(0).astype(int)
followee_df = followee_df.sort_values(by='user_id').reset_index(drop=True)

# Display the DataFrame
print(followee_df)

Processing sentences: 100%|██████████| 3000/3000 [00:43<00:00, 69.59it/s]

        user_id                                           sentence
0         51303  RT @emjacobi: As the wind howls, I try to imag...
1         79903  Just saw huge flash of light from 14th street ...
2        317183  I feel like I'm reading one of those apocalypt...
3        350373  RT @jsjohnst: All lights just went out in the ...
4        618233  RT @KevinFarzad: Yes, Zooey Deschanel. It's ra...
...         ...                                                ...
2995  865795286  No freaking power #rathergotoschool. Jayesslee...
2996  873828578  RT @distressline: Feeling anxious, worried &am...
2997  888929880  President Obama has declared a major disaster ...
2998  896686856  Power on at store, shelves Barr but illuminate...
2999  902762444  i just want power 😟🌊❄. Can the power go back o...

[3000 rows x 2 columns]





In [10]:
def pred_sentence_after(client, time, address, sentence, followee_tweets, tone_of_voice, attitude, tweet, review, model='gemma2'):
    """Ask the chat model to rewrite a tweet for the one-week-after scenario.

    Sends a system + user message pair through ``client.chat.completions.create``
    and returns the text of the first choice.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        time: Timestamp interpolated into the prompt as the current time.
            (The name shadows the ``time`` module inside this function only.)
        address: Location interpolated into the system prompt.
        sentence: The user's previous tweets, interpolated into the user prompt.
        followee_tweets: Accepted for call compatibility; not used in the prompt.
        tone_of_voice: Tone description interpolated into both prompts.
        attitude: Attitude description interpolated into both prompts.
        tweet: The previously generated tweet that was judged inconsistent.
        review: The reasons the previous tweet was judged inconsistent.
        model: Model name passed to the completion call. Defaults to ``'gemma2'``.

    Returns:
        The stripped message content of the first choice, or ``''`` when the
        response carries no content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": f"You are a resident in New Jersey who is currently at {address}. Your attitude towards Hurricane Oscar is {attitude} with a tone of voice on social media of {tone_of_voice}."},
            {"role": "user", "content": f"Suppose it is currently {time}. It has been a week since the landfall of Hurricane Oscar. You've composed a tweet {tweet}. However, it is not consistent with your previous tweets {sentence} with following reasons: {review} Based on the above reflection, please compose a tweet consistent with the attitude of {attitude} towards Oscar and tone of voice of {tone_of_voice}. Only output the tweet."},
        ],
    )
    # message.content may be None; fall back to '' so .strip() cannot raise
    # in the middle of a long batch run.
    content = response.choices[0].message.content
    return (content or "").strip()

In [11]:
# Reviewer feedback for the "after" generation run; the preview printed below
# shows its user_id and response columns.
after_review_path = 'data/reviews/nj_3000_after_review_raw_gemma2.csv'
nj_3000_after_review = pd.read_csv(after_review_path)

# Show the first five rows as a sanity check
after_review_preview = nj_3000_after_review.head()
print(after_review_preview)

    user_id                                           response
0  10250392  Overall, the new comment's language use and at...
1  10443552  Overall, the new comment is consistent with th...
2  10476642  Overall, the new comment is consistent with th...
3  10521002  Overall, the new comment is inconsistent with ...
4  11004402  Overall, the new comment is inconsistent with ...


In [15]:
# Regenerate one tweet per reviewed user for the "after" (one week later)
# scenario and write the results to data/nj_3000_after_rege_gemma2.csv.
nj_3000_after_attributes = pd.read_csv('data/nj_3000_after_address.csv')
generated_data_after = []
nj_3000_after_analysis = pd.read_csv('data/nj_3000_after_generated_gemma2.csv', usecols=['user_id', 'tone_of_voice', 'attitude', 'predicted_content'])

client = openai.OpenAI(
    base_url = url,
    api_key = 'ollama'
)

for index, row in tqdm(nj_3000_after_review.iterrows(), desc="Generating predictions", total=nj_3000_after_review.shape[0]):
    user_id = int(row['user_id'])
    # The review text is already on the row being iterated
    review = row['response']

    # One filter per source frame; .iloc[0] / .values[0] take the first match
    # and raise IndexError when the user is missing from that source.
    attributes_row = nj_3000_after_attributes[nj_3000_after_attributes['user_id'] == user_id].iloc[0]
    analysis_row = nj_3000_after_analysis[nj_3000_after_analysis['user_id'] == user_id].iloc[0]
    sentence = user_df[user_df['user_id'] == user_id]['sentence'].values[0]
    followee_tweets = followee_df[followee_df['user_id'] == user_id]['sentence'].values[0]

    # Named created_at rather than `time` so the imported `time` module is not
    # rebound at notebook scope.
    created_at = attributes_row['created_at']
    address = attributes_row['address']
    tone_of_voice = analysis_row['tone_of_voice']
    attitude = analysis_row['attitude']
    tweet = analysis_row['predicted_content']

    pc = pred_sentence_after(client, created_at, address, sentence, followee_tweets, tone_of_voice, attitude, tweet, review)
    # Strip characters outside clean_text's allow-list before storing
    pc = clean_text(pc)
    generated_data_after.append([user_id, pc])

generated_data_after_df = pd.DataFrame(generated_data_after, columns=['user_id', 'rege_content'])
generated_data_after_df.to_csv('data/nj_3000_after_rege_gemma2.csv', index=False)

Generating predictions: 100%|██████████| 688/688 [16:47<00:00,  1.46s/it]
