In [145]:
import pandas as pd
from collections import Counter

## First step: Exploring the data
Read in the file and check the different data types to get ideas for how to visualize the data.

In [146]:
# Load the annotation results CSV into a DataFrame.
# NOTE(review): hard-coded absolute path under /content/drive — presumably a Colab
# Google Drive mount; confirm the mount exists before running this elsewhere.
dataset = pd.read_csv('/content/drive/MyDrive/OPINION_COST/2._gllm-annotation-results.csv')

In [147]:
# Ids of the records whose 'Q1_0_Tool-Mentioned' variable has the result 'Yes'.
valid_ids = dataset.loc[
    dataset['variable'].eq('Q1_0_Tool-Mentioned') & dataset['result'].eq('Yes'),
    'id',
]

In [148]:
# Keep every row (all variables) belonging to an id collected in valid_ids.
# Note: this rebinds `dataset`, so the unfiltered frame is no longer available
# to later cells without re-reading the CSV.
dataset = dataset[dataset['id'].isin(valid_ids)]

In [149]:
def count_values(dataset, variable_name):
    """
    Count in how many result rows each distinct value of a variable occurs.

    Each 'result' cell may hold several values separated by semicolons. Values are
    lower-cased, stripped of surrounding whitespace, and counted at most once per row.
    Rows with a missing result are ignored.

    Parameters:
    - dataset (pd.DataFrame): Long-format data with 'variable' and 'result' columns.
    - variable_name (str): The value of the 'variable' column to select.

    Returns:
    - list: (value, count) tuples sorted by count, highest first. Ties keep the
      order in which the values first appear in the data.
    """
    # Select the results recorded for the requested variable.
    results = dataset.loc[dataset['variable'] == variable_name, 'result']

    value_counter = Counter()
    # Missing results (NaN) are floats without .lower(); skip them instead of crashing.
    for raw in results.dropna():
        # str() guards against non-string cells (e.g. numbers parsed by read_csv).
        parts = (part.strip() for part in str(raw).lower().split(';'))
        # dict.fromkeys de-duplicates within the row while keeping first-seen order,
        # so the order of tied counts does not depend on string hash randomization.
        unique_parts = list(dict.fromkeys(part for part in parts if part))
        value_counter.update(unique_parts)

    # Most frequent values first.
    return value_counter.most_common()

# Example usage:
# result = count_values(dataset, 'Q5_3_Data-Country')
# print(result)


In [150]:
# (value, count) pairs for the 'Q4_1_Opinion-Evaluation' variable, most frequent first.
opinion_evaluation = count_values(dataset, 'Q4_1_Opinion-Evaluation')

In [151]:
# (value, count) pairs for the 'Q1_1_Tool-Name' variable, most frequent first.
toolname = count_values(dataset, 'Q1_1_Tool-Name')

In [152]:
# (value, count) pairs for the 'Q3_1_Approach' variable, most frequent first.
approach = count_values(dataset, 'Q3_1_Approach')

In [153]:
# (value, count) pairs for the 'Q4_2_Opinion-Target' variable, most frequent first.
target = count_values(dataset, 'Q4_2_Opinion-Target')

In [154]:
# (value, count) pairs for the 'Q5_1_Data-Source' variable, most frequent first.
source = count_values(dataset, 'Q5_1_Data-Source')

In [155]:
# (value, count) pairs for the 'Q5_2_Data-Language' variable, most frequent first.
language = count_values(dataset, 'Q5_2_Data-Language')

In [156]:
# (value, count) pairs for the 'Q5_3_Data-Country' variable, most frequent first.
countries = count_values(dataset, 'Q5_3_Data-Country')

In [157]:
# (value, count) pairs for 'Q2_1_Tool-Name' — a different variable from the
# 'Q1_1_Tool-Name' one stored in `toolname`.
toolname2 = count_values(dataset, 'Q2_1_Tool-Name')

In [158]:
# (value, count) pairs for the 'Q5_4_Dataset-Name' variable, most frequent first.
dataset_name = count_values(dataset, 'Q5_4_Dataset-Name')

In [159]:
def recode_terms(terms, categories, use_sentiment_polarity=False):
    """
    Attach a category label to every (term, frequency) pair.

    Matching is case-insensitive and substring-based: a term belongs to the first
    category (in the dict's iteration order) that has a keyword contained in the term.

    Parameters:
    - terms (iterable): (term, freq) pairs, e.g. the output of Counter.most_common().
    - categories (dict): Maps a category name to a list of keyword strings.
    - use_sentiment_polarity (bool): When True, any term containing "sentiment" or
      "polarity" is labelled "Sentiment & Polarity" without consulting `categories`.

    Returns:
    - list: (term, freq, category) tuples in input order; terms that match nothing
      are labelled "Uncategorized".
    """
    def _label_for(term):
        # Explicit shortcut for sentiment/polarity terms, only when requested.
        if use_sentiment_polarity and any(
            marker in term.lower() for marker in ("sentiment", "polarity")
        ):
            return "Sentiment & Polarity"
        # Otherwise take the first category with a keyword found inside the term.
        return next(
            (
                name
                for name, keywords in categories.items()
                if any(keyword.lower() in term.lower() for keyword in keywords)
            ),
            None,
        )

    return [
        (term, freq, _label_for(term) or "Uncategorized")
        for term, freq in terms
    ]

Recoding of opinion evaluations

In [160]:
# Keyword lists used by recode_terms to map opinion-evaluation terms to categories.
# recode_terms checks whether a keyword is a substring of the lower-cased term and
# stops at the first category that matches, so the key order below is significant.
# NOTE(review): because of the substring match, short keywords in the first category
# ("positive", "negative", "neutral", "polarity") shadow longer entries listed in
# later categories, e.g. "positive opinions", "neutral attitude", "emotional polarity".
# "student feedback" is listed under both "Customer and User Feedback" and
# "Miscellaneous"; the earlier category wins.
categories_evaluations = {
    "Sentiment & Polarity": [
        "sentiment", "sentiments", "sentiment classification", "sentiment analysis",
        "sentiment polarity", "polarity", "neutral", "positive", "negative",
        "degree of polarity", "polarity and intensity of emotions", "strength of polarity",
        "negative)", "highly negative", "highly positive", "positivity or negativity", "negative comments"
    ],
    "Emotions & Feelings": [
        "emotions", "emotion", "rage", "sorrow", "contentment", "anticipation",
        "frustration", "confusion", "worry", "valence", "irony", "sarcasm", "emotions (positive",
        "emotions, feelings, humor, appreciation", "feelings", "emotional polarity", "emotions (positive)",
        "emotions (happy, unhappy)", "hesitation", "faith", "irony detection",
        "emotions (joy, anger, disgust, depression)", "expressions", "subjective states",
        "learning-centered emotions"
    ],
    "Opinions & Attitudes": [
        "opinions", "positive opinions", "negative opinions", "opinions of the crowd",
        "opinions of the product", "public opinion classification", "positive and negative opinions",
        "opinions on product aspects", "stance", "stance towards vaccination", "negative stance toward covid-19 vaccines",
        "opinions about the effects of marijuana", "marijuana legalization stance",
        "extreme opinions", "opinions and satisfaction", "attitudes", "neutral attitude", "negative attitude", "positive attitude",
        "attitudes towards topics or products"
    ],
    "Customer and User Feedback": [
        "satisfaction", "customer feedback", "complaints", "dissatisfaction",
        "student feedback", "likes", "dislikes", "customer satisfaction",
        "customer loyalty", "praises", "usefulness of reviews",
        "review classification", "satisfaction level",
        "satisfaction with public transportation services",
        "satisfaction with knee condition treatment", "rating prediction"
    ],
    "Evaluation and Assessment": [
        "evaluations", "effectiveness", "impact assessment of government policies",
        "evaluation of quality", "comparison of preferences", "performance",
        "teaching effectiveness", "assessment", "quality", "semantic orientation"
    ],
    # Catch-all; checked last because it is the last key.
    "Miscellaneous": [
        "price movement prediction", "reactions", "tacit information", "acceptance of video games",
        "risk of suicide", "concerns", "side effects", "severity of symptoms", "safety", "credibility",
        "trustworthiness", "subjectivity", "petitions", "bias","engineering sustainability aspects",
        "resource consumption and ecological damage", "engineering innovation and sustainable development",
        "economic and social prosperity", "quality of life improvement", "economic and social development",
        "user gratifications", "learning gratifications",
        "teaching approaches", "student feedback", "spam", "spam detection accuracy","aspect detection",
        "aspect", "aspect relevance", "relevance to requirements engineering", "topics","truthful",
        "video topic coverage", "video relevance", "video quality", "administration","resources",
        "teaching approaches"
    ]
}

In [161]:
# Recode the opinion-evaluation terms; the flag sends any term containing
# "sentiment" or "polarity" straight to "Sentiment & Polarity".
recoded_evaluations = recode_terms(
    terms=opinion_evaluation,
    categories=categories_evaluations,
    use_sentiment_polarity=True,
)

In [162]:
# Pull the category label (third tuple element) out of every recoded term
eval_cats = [category for _term, _freq, category in recoded_evaluations]

# Tally how many recoded terms landed in each category
Counter(eval_cats)

Counter({'Sentiment & Polarity': 54,
         'Emotions & Feelings': 23,
         'Opinions & Attitudes': 13,
         'Customer and User Feedback': 16,
         'Evaluation and Assessment': 12,
         'Miscellaneous': 33})

In [167]:
# One (category, "kw1, kw2, ...") row per category, with the keyword list joined
# into a single comma-separated string.
category_evaluation = [(category, ', '.join(terms)) for category, terms in categories_evaluations.items()]
cat_eval = pd.DataFrame(category_evaluation, columns=["Category", "Terms"])
# Write the category/keyword table to CSV without the index column.
cat_eval.to_csv('/content/drive/MyDrive/OPINION_COST/evaluation_categories.csv', index=False)

In [164]:
# Keyword lists used by recode_terms to map opinion-target terms to categories.
# recode_terms checks whether a keyword is a substring of the lower-cased term and
# stops at the first category that matches, so the key order below is significant.
# NOTE(review): several keywords appear under more than one category ("service",
# "ambience", "restaurant reviews", "product reviews", "viewfinder", "flash", ...);
# only the earliest category can ever be assigned. Short keywords such as "product"
# or "health" also capture any longer term that contains them (e.g. "products"
# under "Businesses and Services" is matched by "product" first).
# Some lists repeat the same keyword (e.g. "knee pain", "political debates"); this
# does not affect matching but the repeats appear in the exported category CSV.
categories_targets = {
    "Social Media Content": [
        "tweets", "twitter data", "twitter posts", "twitter comments", "twitter users",
        "twitter sentiment", "facebook comments", "facebook posts and activities",
        "social media posts", "social media content", "social media memes", "social media platforms",
        "web forum content", "microblog posts", "microblog texts", "social media texts",
        "twitter messages", "twitter content", "social media",
        "algerian youtube comments"
    ],
    "Product Reviews": [
        "product reviews", "online product reviews", "amazon product reviews", "product reviews from twitter",
        "user reviews", "restaurant reviews", "hotel reviews", "mobile application reviews", "tripadvisor reviews",
        "product aspects", "product features", "product quality", "product", "various products",
        "product aspects like screen, battery, camera", "product feedback", "text reviews", "online opinions and reviews"
    ],
    "Health and COVID-19": [
        "covid-19", "covid-19 vaccines", "covid-19 vaccination", "covid-19 status", "covid-19 related texts",
        "covid-19 vaccines", "vaccine rollout", "health", "health effects", "health information", "health services",
        "health effects of marijuana", "marijuana legalization", "knee pain", "knee rehabilitation progress",
        "knee condition", "public health", "pandemic responses", "vaccination", "suicidal tendencies", "lockdown",
        "medical information", "mindfulness therapy", "knee pain",
        "knee rehabilitation progress", "knee condition", "suicidal tendencies", "mental health",
        "health effects", "health information", "health services"
    ],
    "Politics": [
        "politics", "political issues", "political trends", "political views", "political parties", "political debates",
        "political news", "u.s. presidential election", "governance", "government schemes", "political debates",
        "public opinion", "political parties", "political issues", "presidential election", "article 370",
        "military affairs", "palestinian/israeli conflict", "nepal blockade 2015", "nepal earthquake 2015"
    ],
    "Businesses and Services": [
        "products", "services", "service", "telecommunication services", "airline services", "cloud computing services",
        "business decisions", "businesses", "company performance", "brands", "enterprises", "customer satisfaction",
        "customer reviews", "public services", "business", "market-relevant information"
    ],
    "Entertainment": [
        "movies", "movie reviews", "youtube videos", "movie aspects", "game aspects", "movie aspects including cast",
        "director", "plot", "ambience", "songs", "musical instruments", "game aspects", "guitars", "arts", "song lyrics"
    ],
    "Restaurants and Food": [
        "restaurants", "food", "restaurant reviews", "restaurant aspects", "restaurant features", "restaurant aspects like food",
        "service", "ambience", "beverages", "drinks quality", "hotel", "restaurant"
    ],
    "Education and Courses": [
        "lectures", "professors", "course", "educational content", "course experience", "course difficulties", "educational institutions",
        "educational aspects", "e-learning services", "academic content", "higher education law", "teaching faculty", "instructors",
        "teachers", "quality of teaching", "educational field", "educational activities", "law of higher education","professor"
    ],
    "News and Events": [
        "news headlines", "news articles", "news comments", "emergency events", "disasters", "events", "various domains and events",
        "specific events", "news", "articles", "news articles", "public events", "social issues",
        "general topics", "discussion topics", "textual information in social networks", "web users' reactions", "news"
    ],
    "Reviews and Feedback": [
        "book reviews", "product reviews", "reviews", "review content", "restaurant reviews", "review features", "various types of reviews",
        "text reviews", "review feedback", "user opinions", "helpfulness"
    ],
    "Technology and Gadgets": [
        "laptops", "iphone", "android", "phone cases", "moto 360", "iphone screen", "digital cameras",
        "viewfinder", "flash", "software", "lens", "design and usability", "image quality", "app features"
    ],
    # Catch-all; checked last because it is the last key.
    "Miscellaneous": [
        "public sentiment", "words", "issues", "topics", "organizations", "individuals",
        "documents", "social, economic, and environmental dimensions of smart cities", "water pipes", "social aspects",
        "economic aspects", "sustainability", "environmental aspects", "engineering applications",
        "statements from child sexual abuse victims", "urban issues", "city management", "communities",
        "text sentiment", "women empowerment", "digital transformation", "drugs", "geographical locations", "text content",
        "romanized sindhi text", "staff attitude", "technician knowledge", "various domains and arabic dialects",
        "images", "infrastructure", "aspect terms (attributes or characteristics)", "materials", "organization",
        "chinese and english text datasets", "texts", "waste import in albania", "tourism in albania", "financial aspects",
        "general sentiment", "text data", "viewfinder", "flash", "image quality",
        "lens", "design and usability", "digital cameras", "aspect", "western media", "entities", "venues",
        "vat tax", "waste import", "tourism", "image-text posts", "traffic information", "various online content",
        "cars", "books", "textual content", "rooms", "stay", "staff", "specific web domains", "intervention",
        "mindfulness therapy", "web users' reactions", "sentiment sentences", "transport modes", 'book characters',
        'emoticons', 'negation', "text", "bitcoin", "staff attitude", "technician knowledge"
    ]
}

In [165]:
# Label each opinion-target term with a category from categories_targets; the
# sentiment/polarity shortcut is left at its default (off) here.
recoded_targets = recode_terms(target, categories_targets)

In [166]:
# One (category, "kw1, kw2, ...") row per category, with the keyword list joined
# into a single comma-separated string.
category_targets = [(category, ', '.join(terms)) for category, terms in categories_targets.items()]
cat_target = pd.DataFrame(category_targets, columns=["Category", "Terms"])
# Write the category/keyword table to CSV without the index column.
cat_target.to_csv('/content/drive/MyDrive/OPINION_COST/target_categories.csv', index=False)