In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from nltk.tokenize import sent_tokenize

# Load the original dataset
data = pd.read_csv('combined_dormitory_data.csv')

# Ensure all values in the 'text' column are treated as strings
data['text'] = data['text'].astype(str)

# Load the sentiment analysis model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

# Define a function to predict sentiment for each comment
def get_sentiment(comment):
    inputs = tokenizer(comment, return_tensors="pt", truncation=True, max_length=512, padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = torch.argmax(logits, dim=-1).item()
    return predicted_class_id

# Map sentiment labels (0: negative, 1: neutral, 2: positive)
sentiment_labels = {0: "negative", 1: "neutral", 2: "positive"}

# Create a list to store the new rows
new_rows = []

# Iterate through each review, tokenize into sentences, and get sentiment
for index, row in data.iterrows():
    original_review = row['text']
    sentences = sent_tokenize(original_review)  # Split review into sentences
    
    for sentence in sentences:
        sentiment = sentiment_labels[get_sentiment(sentence)]
        new_rows.append({'date': row['date'], 
                         'channel_name': row['channel_name'],
                         'message_id': row['message_id'],
                         'text': original_review,
                         'sentiment': sentiment,
                         'rephrased_news': row['rephrased_news'],
                         'title': row['title'],
                         'topic': row['topic'],
                         'location_type': row['location_type'],
                         'location': row['location'],
                         'latitude': row['latitude'],
                         'longitude': row['longitude'],
                         'sentence': sentence,}) 

# Create a new DataFrame from the new rows
sentiment_data = pd.DataFrame(new_rows)

# # Merge with the original data if needed
# merged_data = pd.merge(data, sentiment_data, on=['place_id', 'text'], how='inner')

# # Save the updated data with sentiment column to a new CSV file
# merged_data.to_csv('Merged_Labeled_Reviews.csv', index=False)

# Print the first few rows of the merged data
sentiment_data.head(10)


  return self.fget.__get__(instance, owner)()
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,date,channel_name,message_id,text,sentiment,rephrased_news,title,topic,location_type,location,latitude,longitude,sentence
0,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392,**🌡️ Repair work on heating networks**\n\nToda...,negative,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,**🌡️ Repair work on heating networks**\n\nToda...
1,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,Estimated return time: 12:00 PM on November 12.
2,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus
3,2023-12-05T00:03:14+00:00,Dormitory 7 of KPI,394,**News from the student campus**\n🔸 An analysi...,neutral,An analysis of the financial and economic acti...,Campus Renovations and Budget Approved,Improvements in Infrastructure,exact location,['KPI Dormitory 7'],50.448522,30.449409,**News from the student campus**\n🔸 An analysi...
4,2023-12-05T00:03:14+00:00,Dormitory 7 of KPI,394,**News from the student campus**\n🔸 An analysi...,neutral,An analysis of the financial and economic acti...,Campus Renovations and Budget Approved,Improvements in Infrastructure,exact location,['KPI Dormitory 7'],50.448522,30.449409,"A draft budget, procurement plan, and repair w..."
5,2023-12-05T00:03:14+00:00,Dormitory 7 of KPI,394,**News from the student campus**\n🔸 An analysi...,neutral,An analysis of the financial and economic acti...,Campus Renovations and Budget Approved,Improvements in Infrastructure,exact location,['KPI Dormitory 7'],50.448522,30.449409,🔹 The renovation of the **gym in dormitory No.
6,2023-12-05T00:03:14+00:00,Dormitory 7 of KPI,394,**News from the student campus**\n🔸 An analysi...,neutral,An analysis of the financial and economic acti...,Campus Renovations and Budget Approved,Improvements in Infrastructure,exact location,['KPI Dormitory 7'],50.448522,30.449409,"7, which will also be used as a shelter**, has..."
7,2023-12-05T00:03:14+00:00,Dormitory 7 of KPI,394,**News from the student campus**\n🔸 An analysi...,neutral,An analysis of the financial and economic acti...,Campus Renovations and Budget Approved,Improvements in Infrastructure,exact location,['KPI Dormitory 7'],50.448522,30.449409,🔸 The restoration of the stairs in dormitory No.
8,2023-12-05T00:03:14+00:00,Dormitory 7 of KPI,394,**News from the student campus**\n🔸 An analysi...,neutral,An analysis of the financial and economic acti...,Campus Renovations and Budget Approved,Improvements in Infrastructure,exact location,['KPI Dormitory 7'],50.448522,30.449409,4 has been completed.
9,2023-12-05T00:03:14+00:00,Dormitory 7 of KPI,394,**News from the student campus**\n🔸 An analysi...,positive,An analysis of the financial and economic acti...,Campus Renovations and Budget Approved,Improvements in Infrastructure,exact location,['KPI Dormitory 7'],50.448522,30.449409,"❤️ Take care of yourselves, use shelters durin..."


In [2]:
# Load the dataset
data = sentiment_data

# Remove rows with "positive" in the "Sentiment" column
data_filtered = data[data['sentiment'] != 'positive']

# Print the first few rows of the filtered dataset
data_filtered.head()

Unnamed: 0,date,channel_name,message_id,text,sentiment,rephrased_news,title,topic,location_type,location,latitude,longitude,sentence
0,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392,**🌡️ Repair work on heating networks**\n\nToda...,negative,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,**🌡️ Repair work on heating networks**\n\nToda...
1,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,Estimated return time: 12:00 PM on November 12.
2,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus
3,2023-12-05T00:03:14+00:00,Dormitory 7 of KPI,394,**News from the student campus**\n🔸 An analysi...,neutral,An analysis of the financial and economic acti...,Campus Renovations and Budget Approved,Improvements in Infrastructure,exact location,['KPI Dormitory 7'],50.448522,30.449409,**News from the student campus**\n🔸 An analysi...
4,2023-12-05T00:03:14+00:00,Dormitory 7 of KPI,394,**News from the student campus**\n🔸 An analysi...,neutral,An analysis of the financial and economic acti...,Campus Renovations and Budget Approved,Improvements in Infrastructure,exact location,['KPI Dormitory 7'],50.448522,30.449409,"A draft budget, procurement plan, and repair w..."


In [3]:
# # Save the filtered data to a new CSV file
# data_filtered.to_csv('Without_pos.csv', index=False)

In [4]:
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer
import pandas as pd
import nltk

nltk.download('punkt')

reviews = data['sentence'].dropna().tolist()

# Split each review into sentences and keep a mapping to the original review
split_reviews = []
original_review_map = []

for review in reviews:
    sentences = sent_tokenize(review)
    split_reviews.extend(sentences)
    original_review_map.extend([review] * len(sentences))  # Map sentences back to original reviews

# Load the model and encode the split sentences into embeddings
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embeddings = model.encode(split_reviews)

# Perform Agglomerative Clustering with cosine affinity
agglomerative = AgglomerativeClustering(n_clusters=None, distance_threshold=0.91, affinity='cosine', linkage='average')
cluster_assignment = agglomerative.fit_predict(embeddings)

# Group sentences by their assigned cluster
clustered_reviews = {}
for i, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_reviews:
        clustered_reviews[cluster_id] = []
    clustered_reviews[cluster_id].append(split_reviews[i])

# Map each sentence back to its original review
sentence_to_review = {}
for i, sentence in enumerate(split_reviews):
    original_review = original_review_map[i]
    sentence_to_review[sentence] = original_review

# Display the clustered sentences with original mapping
print("\nClustered Sentences:")
for cluster_id, cluster_sentences in clustered_reviews.items():
    print(f"\nCluster {cluster_id}:")
    for sentence in cluster_sentences:
        print(f" - {sentence}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Clustered Sentences:

Cluster 0:
 - **🌡️ Repair work on heating networks**

Today from 9:00 AM, there is no hot water supply and heating in most dormitories.
 - Estimated return time: 12:00 PM on November 12.
 - @kpicampus
 - **News from the student campus**
🔸 An analysis of the financial and economic activities of the student campus for the period 2021-2023 has been conducted.
 - A draft budget, procurement plan, and repair work for 2024 have been prepared and approved.
 - 🔹 The renovation of the **gym in dormitory No.
 - 7, which will also be used as a shelter**, has been completed.
 - 🔸 The restoration of the stairs in dormitory No.
 - 4 has been completed.
 - ❤️ Take care of yourselves, use shelters during air alerts!
 - Student campus KPI @studmisto.
 - **Contact us - ****@kpi_studmisto_bot****.
 - Good luck!
 - **
 - ⚠️ **Emergency**

Emergency power outage across the Shulyavka area and KPI.
 - Estimated return time according to DTEK is 6:00 PM.
 - If you need to recharge, appro



In [5]:
# Create a DataFrame from the original sentences and their corresponding cluster assignments
clustered_df = pd.DataFrame({
    'sentence': split_reviews,
    'cluster_id': cluster_assignment
})

# Check if the original data is loaded correctly as a DataFrame
if isinstance(data, pd.DataFrame):
    # Merge the original DataFrame with the clustered DataFrame on the sentence
    # Use 'outer' merge to keep all original sentences
    data_with_clusters = pd.merge(data, clustered_df, on='sentence', how='outer')
else:
    print("Error: 'data' is not a DataFrame. Please check the loading of 'Without_pos.csv'.")

# Display the updated DataFrame
print("\nUpdated DataFrame with Cluster ID:")
data_with_clusters.head(20)


Updated DataFrame with Cluster ID:


Unnamed: 0,date,channel_name,message_id,text,sentiment,rephrased_news,title,topic,location_type,location,latitude,longitude,sentence,cluster_id
0,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,negative,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,**🌡️ Repair work on heating networks**\n\nToda...,0.0
1,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,Estimated return time: 12:00 PM on November 12.,0.0
2,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0
3,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0
4,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0
5,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0
6,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0
7,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0
8,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0
9,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0


In [6]:
# # Save the updated DataFrame to a new CSV file if needed
# data_with_clusters.to_csv('clustered.csv', index=False)

In [7]:
# Функція для отримання відповіді GPT для кожного кластеру
from openai import OpenAI

def get_gpt_response(user_prompt, system_prompt="", model="gpt-4o-mini"):
    client = OpenAI(api_key='sk-jMEOk_AeFqsriX4e9WlvXuiW6xdIiDWoIaImApR-3CT3BlbkFJVeXbABn_hQ3Hb23_XLNa0lxq6MKKRSQUc_PMoAPuYA')
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )
        response_text = completion.choices[0].message.content
        total_tokens = completion.usage.total_tokens
        return response_text, total_tokens
    except Exception as e:
        return f"An error occurred: {e}", 0

data = data_with_clusters

cluster_titles = {}
for cluster_id, cluster_sentences in clustered_reviews.items():
    user_prompt = f"""Based on these sentences, provide a short title (2-3 words) that highlights a public issue or concern reflected in the text: {cluster_sentences}.
    Ensure that the title contains no extra symbols or characters.
                      """
    
    # Get the cluster title
    response, tokens_used = get_gpt_response(user_prompt)
    
    # Store the result
    cluster_titles[cluster_id] = response
    print(f"Cluster {cluster_id} title: {response}")

# Add the cluster titles to the DataFrame
data['cluster_title'] = data['cluster_id'].map(cluster_titles)

Cluster 0 title: Heating and Water Issues
Cluster 3 title: Public Engagement Concerns
Cluster 1 title: Waste Management Concerns
Cluster 7 title: Vulnerable Children
Cluster 5 title: Social Indifference
Cluster 2 title: War and Sacrifice
Cluster 6 title: Sacrifice for Independence
Cluster 4 title: Grief and Loss


In [8]:
# Display the updated DataFrame with the new Cluster Title column
data.head(20)

Unnamed: 0,date,channel_name,message_id,text,sentiment,rephrased_news,title,topic,location_type,location,latitude,longitude,sentence,cluster_id,cluster_title
0,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,negative,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,**🌡️ Repair work on heating networks**\n\nToda...,0.0,Heating and Water Issues
1,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,Estimated return time: 12:00 PM on November 12.,0.0,Heating and Water Issues
2,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0,Heating and Water Issues
3,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0,Heating and Water Issues
4,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0,Heating and Water Issues
5,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0,Heating and Water Issues
6,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0,Heating and Water Issues
7,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0,Heating and Water Issues
8,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0,Heating and Water Issues
9,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0,Heating and Water Issues


In [9]:
# data.to_csv('clustered_id.csv', index=False)
filtered_data = data.dropna(subset=['sentiment', 'sentence'])
filtered_data.head()
# filtered_data.to_csv('filtered_clustered_id.csv', index=False)

Unnamed: 0,date,channel_name,message_id,text,sentiment,rephrased_news,title,topic,location_type,location,latitude,longitude,sentence,cluster_id,cluster_title
0,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,negative,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,**🌡️ Repair work on heating networks**\n\nToda...,0.0,Heating and Water Issues
1,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,Estimated return time: 12:00 PM on November 12.,0.0,Heating and Water Issues
2,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0,Heating and Water Issues
3,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0,Heating and Water Issues
4,2023-11-11T07:51:25+00:00,Dormitory 7 of KPI,392.0,**🌡️ Repair work on heating networks**\n\nToda...,neutral,Repair work on heating networks has resulted i...,Heating Outage in Dormitories,Citizen Discomfort,exact location,['KPI Dormitory 7'],50.448522,30.449409,@kpicampus,0.0,Heating and Water Issues


In [10]:
# Завантажте ваш CSV файл
data = filtered_data

# Порахуйте кількість унікальних елементів у стовпці 'place_id'
unique_place_ids = data[['latitude','longitude', 'cluster_title','cluster_id', 'text','message_id','title']].nunique()

print(f'Number of unique place_id: {unique_place_ids}')

Number of unique place_id: latitude           8
longitude          8
cluster_title      8
cluster_id         8
text             149
message_id       147
title            136
dtype: int64


In [11]:
# Вибираємо тільки необхідні колонки
selected_columns = data[['latitude','longitude', 'cluster_title','cluster_id', 'text','message_id','title']]

# Видаляємо повтори, якщо одночасно дублюються 'original_review' та 'Cluster Title'
small_data = selected_columns.drop_duplicates(subset=['text', 'cluster_title'])

small_data.head(10)

Unnamed: 0,latitude,longitude,cluster_title,cluster_id,text,message_id,title
0,50.448522,30.449409,Heating and Water Issues,0.0,**🌡️ Repair work on heating networks**\n\nToda...,392.0,Heating Outage in Dormitories
81,50.448522,30.449409,Heating and Water Issues,0.0,⚠️ **Emergency**\n\nEmergency power outage acr...,402.0,Emergency Power Outage Alert
160,50.448522,30.449409,Heating and Water Issues,0.0,"💡**Power outages**\n\nStarting from May 16, ho...",404.0,Hourly Power Outages Scheduled
239,50.448522,30.449409,Heating and Water Issues,0.0,"🌡️ **Well, you get it**\n\n**Kyivteploenergo p...",405.0,Heating Work Delays Expected
318,50.448522,30.449409,Heating and Water Issues,0.0,🟢 **Directorate 06.08**\n\n**Settlement**\nWil...,406.0,Settlement and Hot Water Issues
397,50.448522,30.449409,Heating and Water Issues,0.0,[​​](https://telegra.ph/file/b793c7b6b4d653e40...,408.0,Volunteers Needed for Settling
476,50.448522,30.449409,Heating and Water Issues,0.0,🌡 **Hot water outage**\n\n**Due to emergency w...,409.0,Hot Water Supply Disruption
555,50.449013,30.450868,Heating and Water Issues,0.0,🟢 **Administration**\n\n**Masters**\nStudents ...,158.0,Dormitory Updates for Students
634,50.449013,30.450868,Heating and Water Issues,0.0,💧**May Without Water**\n\nThere is no hot wate...,161.0,Hot Water Outage Extended
713,50.449013,30.450868,Heating and Water Issues,0.0,**🟢 Administration**\n\n**Shelters**\nRepairs ...,162.0,Dormitory Repairs and Resources


In [12]:
# small_data.to_csv('final.csv', index=False)

In [13]:
import folium
import pandas as pd

# Завантажуємо датасет
data = small_data

# Підраховуємо кількість повторень для кожної локації
location_counts = data.groupby(['latitude', 'longitude']).size().reset_index(name='count')

# Ініціалізуємо карту, центруючи її на Київ
center_location = [50.4503596, 30.5245025]
m = folium.Map(location=center_location, zoom_start=12)

# Визначаємо кольори для маркерів
colors = ['red', 'blue', 'green', 'orange', 'purple', 'darkred', 'lightred', 'beige', 'darkblue', 'darkgreen']

# Проходимо по кожній унікальній локації і додаємо маркери
for index, row in location_counts.iterrows():
    try:
        # Отримуємо кількість дублікатів для локації
        count = row['count']
        latitude = row['latitude']
        longitude = row['longitude']
        
        # Фільтруємо рядки з датасету, які відповідають цій локації
        location_reviews = data[(data['latitude'] == latitude) & (data['longitude'] == longitude)]
        
        # Формуємо текст для popup, включаючи всі відгуки для цієї локації
        popup_text = ""
        for _, review_row in location_reviews.iterrows():
            popup_text += f"Review: {review_row['text']}<br><br>"
        
        # Вибираємо колір випадковим чином або на основі якихось даних
        color = colors[index % len(colors)]
        
        # Робимо маркер більшим на основі кількості дублікатів
        radius = 5 + (count * 2)  # Базовий радіус + пропорційне збільшення

        # Додаємо круглий маркер для кожної локації
        folium.CircleMarker(
            [latitude, longitude],
            radius=radius,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
            popup=popup_text,
            tooltip=f"Location ({latitude}, {longitude}) - {count} reviews"
        ).add_to(m)
    except KeyError:
        # Пропускаємо записи, якщо немає координат або інших важливих полів
        continue

# Зберігаємо карту в HTML файл
m.save("problematic_reviews_clustered_map.html")

# Виводимо карту
m

In [14]:
import uuid

def get_gpt_response(user_prompt, system_prompt="", model="gpt-4o-mini"):
    client = OpenAI(api_key='sk-jMEOk_AeFqsriX4e9WlvXuiW6xdIiDWoIaImApR-3CT3BlbkFJVeXbABn_hQ3Hb23_XLNa0lxq6MKKRSQUc_PMoAPuYA')
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )
        response_text = completion.choices[0].message.content
        total_tokens = completion.usage.total_tokens
        return response_text, total_tokens
    except Exception as e:
        return f"An error occurred: {e}", 0
        
# Створюємо функцію для виклику GPT або іншої обробки
def process_cluster_reviews(content):
    system_message = (
        "You are tasked with creating brief, informative summaries of user reviews or news content. "
        "These summaries should clearly describe the main problems or challenges mentioned, highlighting key issues from multiple perspectives when necessary. "
        "The summaries should aim to be interesting yet concise, so the reader can quickly grasp the core of the issue. Ensure that each summary captures the problem in a detailed, well-rounded manner, while remaining within 200 characters."
    )
    
    # Промпт для моделі GPT
    prompt = (
        "Step 1: Carefully read the review or news content. Focus on identifying the primary problem or issue mentioned, along with any potential consequences or implications.\n"
        "Step 2: Identify relevant supporting details that further clarify the problem. Consider any suggestions, concerns, or actions mentioned in the content that offer additional insight into the issue.\n"
        
        f"Content:\n{content}\n"
        
        "Step 3: Create a concise, clear summary (up to 200 characters) that integrates the key problem and supporting details. "
        "Aim to provide a well-rounded view of the issue, highlighting any relevant aspects such as the scope, impact, or potential solutions.\n"
        
        "Step 4: Ensure that the summary does not exceed 200 characters and is easy to understand. If the initial version exceeds the limit, refine it while keeping the most critical points. "
        "The final summary should capture the problem in a clear and engaging way, without omitting key information."
    )
    summary = get_gpt_response(prompt, system_message)
    return summary


# Функція для об'єднання і обробки коментарів по кластеру
def generate_summaries_by_location(df):
    summary_data = []

    # Групуємо дані по координатам і кластеру
    grouped = df.groupby(['latitude', 'longitude', 'cluster_title'])

    for (lat, lng, cluster_title), group in grouped:
        # Об'єднуємо всі коментарі в один текст
        combined_reviews = ' '.join(group['text'].tolist())

        # Викликаємо функцію для обробки тексту
        summary = process_cluster_reviews(combined_reviews)

        # Генеруємо унікальний ID для саммарі
        summary_id = str(uuid.uuid4())

        # Записуємо результат для кожного ряду групи
        for idx in group.index:
            summary_data.append({
                'index': idx,
                'summary': summary,
                'summary_id': summary_id
            })

    # Перетворюємо результат в DataFrame і об'єднуємо з оригінальним датафреймом
    summary_df = pd.DataFrame(summary_data)
    df = df.merge(summary_df, left_index=True, right_on='index').drop('index', axis=1)

    return df

# Викликаємо функцію для обробки
df = small_data  # Завантаження твого датафрейму
df_with_summaries = generate_summaries_by_location(df)

# Зберігаємо результати у новий CSV
df_with_summaries.to_csv('output_with_summaries.csv', index=False)

# Виводимо результат для перевірки
df_with_summaries.head(10)

Unnamed: 0,latitude,longitude,cluster_title,cluster_id,text,message_id,title,summary,summary_id
151,50.448522,30.449409,Heating and Water Issues,0.0,**🌡️ Repair work on heating networks**\n\nToda...,392.0,Heating Outage in Dormitories,(Major heating and hot water outages persist i...,0c35eff3-6c67-4089-92f8-05e797a94888
152,50.448522,30.449409,Heating and Water Issues,0.0,⚠️ **Emergency**\n\nEmergency power outage acr...,402.0,Emergency Power Outage Alert,(Major heating and hot water outages persist i...,0c35eff3-6c67-4089-92f8-05e797a94888
153,50.448522,30.449409,Heating and Water Issues,0.0,"💡**Power outages**\n\nStarting from May 16, ho...",404.0,Hourly Power Outages Scheduled,(Major heating and hot water outages persist i...,0c35eff3-6c67-4089-92f8-05e797a94888
154,50.448522,30.449409,Heating and Water Issues,0.0,"🌡️ **Well, you get it**\n\n**Kyivteploenergo p...",405.0,Heating Work Delays Expected,(Major heating and hot water outages persist i...,0c35eff3-6c67-4089-92f8-05e797a94888
155,50.448522,30.449409,Heating and Water Issues,0.0,🟢 **Directorate 06.08**\n\n**Settlement**\nWil...,406.0,Settlement and Hot Water Issues,(Major heating and hot water outages persist i...,0c35eff3-6c67-4089-92f8-05e797a94888
156,50.448522,30.449409,Heating and Water Issues,0.0,[​​](https://telegra.ph/file/b793c7b6b4d653e40...,408.0,Volunteers Needed for Settling,(Major heating and hot water outages persist i...,0c35eff3-6c67-4089-92f8-05e797a94888
157,50.448522,30.449409,Heating and Water Issues,0.0,🌡 **Hot water outage**\n\n**Due to emergency w...,409.0,Hot Water Supply Disruption,(Major heating and hot water outages persist i...,0c35eff3-6c67-4089-92f8-05e797a94888
160,50.449013,30.450868,Heating and Water Issues,0.0,🟢 **Administration**\n\n**Masters**\nStudents ...,158.0,Dormitory Updates for Students,(Masters students face eviction after winter s...,be86421c-223c-4441-94e1-5537332ea9a2
161,50.449013,30.450868,Heating and Water Issues,0.0,💧**May Without Water**\n\nThere is no hot wate...,161.0,Hot Water Outage Extended,(Masters students face eviction after winter s...,be86421c-223c-4441-94e1-5537332ea9a2
162,50.449013,30.450868,Heating and Water Issues,0.0,**🟢 Administration**\n\n**Shelters**\nRepairs ...,162.0,Dormitory Repairs and Resources,(Masters students face eviction after winter s...,be86421c-223c-4441-94e1-5537332ea9a2


In [15]:
df_with_summaries.to_csv('final3.csv', index=False)