In [41]:
import pandas as pd
import os
import ast
import string
from collections import Counter
from openai import OpenAI
from langchain.prompts import PromptTemplate
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import unidecode

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Verify that the API key is available before any request is made.
# The previous self-assignment (os.environ[...] = os.environ.get(...)) was a
# no-op when the variable was set and raised an opaque
# "TypeError: str expected, not NoneType" when it was not.
if not os.environ.get('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )

In [3]:
# Prompt sent to the chat model for every review. `{sentence}` is replaced by
# the review text; the model is asked to answer with a Python-style list
# literal so the answer can be parsed afterwards.
prompt_template = PromptTemplate.from_template(
    '''
    Your task is to extract keywords from {sentence}.
    You are an assistant for labeling negative annotated keywords in user reviews.
    Return the words that are assumed to be negative words in form of a list.
    Note: If no negative words are detected, return an empty list.
    Note: Words have to stand alone as negative to be included. No fill words.
    Note: Only words that are explicit in the sentence should be included.
    Provide output without further text information. Use the following schema ['keyword 1', 'keyword 2', ...]
    '''
)


In [4]:
# Shared API client used by extract_keywords. No key is passed explicitly —
# presumably it is read from the OPENAI_API_KEY environment variable (verify).
client = OpenAI()

In [5]:
def extract_keywords(sentence: str, model: str = "gpt-4o") -> str:
    '''
    Ask the chat model for the negative keywords contained in one review.

    Args:
        sentence: Review text to analyse.
        model: Name of the chat-completion model to query.

    Returns:
        The model's raw answer with surrounding whitespace stripped. Returns
        "[]" without calling the API when `sentence` is not a string or is
        blank, and "Error: <message>" when the request fails.
    '''
    # Missing reviews arrive from pandas as NaN (a float) and blank reviews
    # cannot contain keywords: answer with the empty-list schema instead of
    # sending "nan" / "" to the API.
    if not isinstance(sentence, str) or not sentence.strip():
        return "[]"

    try:
        # Format the prompt dynamically with the input sentence
        prompt = prompt_template.format(sentence=sentence)

        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort contract kept from the original: failures are reported
        # in-band as a string rather than raised.
        return f"Error: {e}"

In [6]:
# Load the review export from a path relative to the current working directory.
df = pd.read_csv('../data/raw/reviews_en_goerlitzerpark.csv')

In [7]:
# Small working sample: the first ten rows of a copy of the full frame.
snippet = df.copy()[0:10]

In [8]:
# TODO: maybe remove stop words beforehand

#### prepare keyword strings

In [9]:
#snippet['keywords'] = snippet['review_text_english'].apply(lambda x: extract_keywords(x))

In [73]:
def remove_stopwords(text: str, sw = None) -> str:
    '''
    Remove stopwords from a whitespace-separated text.

    Words are compared in lower case against `sw`; the words that are kept
    retain their original casing.

    Args:
        text: String of data you want to remove stopwords from
        sw: Iterable of (lower-case) stopwords. When omitted, NLTK's German
            stopword list is loaded at call time.

    Returns:
        The remaining words joined by single spaces.
    '''
    if sw is None:
        # Resolved lazily so that defining the function does not require the
        # corpus to be loaded already.
        # NOTE(review): the default list is German although the notebook works
        # on a column named 'review_text_english' — confirm this is intended.
        sw = stopwords.words('german')

    # Set membership is O(1) per word instead of scanning a list every time.
    stopword_set = frozenset(sw)
    return ' '.join(word for word in text.split() if word.lower() not in stopword_set)

In [74]:
def remove_punctuation(text: str, punct: str = string.punctuation) -> str:
    '''
    Drop every character of `text` that occurs in `punct`.

    Args:
        text: String to strip punctuation from
        punct: Characters to remove (defaults to string.punctuation)

    Returns:
        `text` without any of the characters contained in `punct`.
    '''
    kept_chars = filter(lambda ch: ch not in punct, text)
    return ''.join(kept_chars)

In [75]:
def unicode(text: str) -> str:
    '''
    Transliterate the text with unidecode, e.g. Â -> A.

    Args:
        text: String to transliterate

    Returns:
        The result of unidecode.unidecode applied to `text`.
    '''
    transliterated = unidecode.unidecode(text)
    return transliterated

In [101]:
def clean(text: str) -> str:
    '''
    Clean a text by removing stopwords and lower-casing the remainder.

    Unidecoding and punctuation removal are currently disabled (see the
    commented-out calls in the body).

    Args:
        text: String indicating the body of text you want to clean

    Returns:
        The lower-cased text with stopwords removed.
    '''
    #text = unicode(text)
    #text = remove_punctuation(text)
    without_stopwords = remove_stopwords(text)
    return without_stopwords.lower()

In [104]:
# Run the cleaning function over every entry of the 'keywords' column.
df['clean_reviews_2'] = df['keywords'].apply(clean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_reviews_2'] = df['keywords'].apply(lambda x : clean(x))


In [None]:
# Query the model once per review and store its raw answer.
df['keywords'] = df['review_text_english'].apply(extract_keywords)

In [105]:
# Preview the first three rows of the frame.
df.head(3)

Unnamed: 0.1,Unnamed: 0,google_id,review_text,review_img_urls,review_rating,review_timestamp,review_likes,review_text_english,keywords,keywords_list,cleaned_keywords,clean_reviews,clean_reviews_2
0,0,0x47a84f4c8819b367:0xedafc45032d4521b,I happened to visit the park around midday on ...,['https://lh5.googleusercontent.com/p/AF1QipNH...,1,1730831520,5,I happened to visit the park around midday on ...,"['tipsy', 'drugs', 'homeless', 'rubbish']","[tipsy, drugs, homeless, rubbish]",tipsy drugs homeless rubbish,i happened to visit the park around midday on ...,"['tipsy', 'drugs', 'homeless', 'rubbish']"
1,1,0x47a84f4c8819b367:0xedafc45032d4521b,"The park is nice, it surprised me that there w...",['https://lh5.googleusercontent.com/p/AF1QipNs...,5,1723813004,0,"The park is nice, it surprised me that there w...",['alone'],[alone],alone,"the park is nice, it surprised me that there a...",['alone']
2,2,0x47a84f4c8819b367:0xedafc45032d4521b,Legenday park in Berlin! Somehow between Kreuz...,['https://lh5.googleusercontent.com/p/AF1QipNA...,5,1724322017,0,Legenday park in Berlin! Somehow between Kreuz...,"['negative', 'not', 'options']","[negative, not, options]",negative not options,legenday park berlin! somehow between kreuzber...,"['negative', 'not', 'options']"


In [111]:
def transform_list(keywords):
    '''
    Parse a stringified list such as "['a', 'b']" into a real list.

    The result is always a list, so it can be iterated and flattened safely.
    The earlier "Error: ..." string would have been split into single
    characters when flattened.

    Args:
        keywords: String containing a Python list literal. A value that is
            already a list or tuple is passed through as a list.

    Returns:
        The parsed keywords as a list. An empty list is returned when the
        value cannot be parsed (e.g. NaN or a non-literal answer) or when the
        parsed literal is not a list or tuple.
    '''
    if isinstance(keywords, (list, tuple)):
        return list(keywords)

    try:
        parsed = ast.literal_eval(keywords)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # The exception types ast.literal_eval documents for malformed input.
        return []

    return list(parsed) if isinstance(parsed, (list, tuple)) else []

In [112]:
# Turn each stringified keyword list into a Python object.
df['keywords_list'] = df['keywords'].apply(transform_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['keywords_list'] = df['keywords'].apply(lambda x: transform_list(x))


In [127]:
# Concatenate the per-row keyword collections into one flat list.
flattened_keywords = [
    kw
    for row_keywords in df['keywords_list']
    for kw in row_keywords
]

In [125]:
# Flatten the per-review keyword lists into a single list of keywords.
# The earlier version looped over every column of each row and appended the
# whole 'keywords_list' value each time, producing repeated nested lists
# instead of individual keywords.
kexwords = []

for keyword_list in df['keywords_list']:
    kexwords.extend(keyword_list)

In [128]:
# Display the flattened keyword list.
flattened_keywords

['tipsy',
 'drugs',
 'homeless',
 'rubbish',
 'alone',
 'negative',
 'not',
 'options',
 'unsafe',
 'dealers',
 'nicked',
 'not',
 'quality',
 'sketch',
 'careful',
 'bicycles',
 'drugs',
 'crimes',
 'worst',
 'drugs',
 'pushy',
 'avoid',
 'junkies',
 'weird',
 'tragic',
 'unsafe',
 'unsafe',
 'junkies',
 'lingering',
 'following',
 'dangerous',
 'drug',
 'dealers',
 'intimidate',
 'hostile',
 'yelled',
 'aggressively',
 'shaken',
 'aggressive',
 'drug',
 'unsafe',
 'avoid',
 'scary',
 'aborted',
 'drug dealers',
 'uncomfortable',
 'robbed',
 'desperately',
 'weird',
 'druggies',
 'DON’T',
 'Not safe',
 'stolen',
 'drugs',
 'aggressively',
 'hard',
 'bad',
 'dangerous',
 'warn',
 'accosted',
 'aggressive',
 'drug',
 'dealers',
 'drugs',
 'lost',
 'control',
 'unsafe',
 'drug dealers',
 'avoid',
 'night',
 'not',
 'safe',
 'rough',
 'freaky',
 'dirty',
 'bad',
 'dangerous',
 'warn',
 'wrong',
 'not',
 'cleanest',
 'drugs',
 'scare',
 'dealing',
 'drugs',
 'dangerous',
 'poor',
 'dark',


In [93]:
# Drop rows whose cleaned keyword string is exactly 'json'.
# NOTE(review): presumably these are answers the model wrapped in a json
# code fence — verify.
# .copy() makes the result an independent frame, so later column assignments
# (df['...'] = ...) no longer trigger pandas' SettingWithCopyWarning.
df = df[df['cleaned_keywords'] != 'json'].copy()

In [94]:
# Show the 'cleaned_keywords' column.
df['cleaned_keywords']

0           tipsy drugs homeless rubbish
1                                  alone
2                   negative not options
3      unsafe dealers nicked not quality
4                                 sketch
                     ...                
356                              socabon
357              old not well maintained
358                                     
359                              dubious
360                                     
Name: cleaned_keywords, Length: 354, dtype: object

In [106]:
# Three most frequent values of the 'clean_reviews_2' column.
# NOTE(review): the counted values are whole stringified lists (e.g. '[]'),
# not individual keywords — confirm that this is the intended statistic.
c = Counter(df['clean_reviews_2']).most_common(3)
c

[('[]', 146), ("['bad', 'dangerous', 'warn']", 2), ("['drug', 'dealers']", 2)]