In [40]:
import json
import os
import pandas as pd

In [None]:
# Pfad zum Ordner mit den JSON-Dateien
data_folder = 'data/instagram'

# Leere Liste zur Speicherung aller Kommentare
all_comments = []

# Durchlaufe alle Dateien im Ordner "data"
for filename in os.listdir(data_folder):
    if filename.endswith(".json"):
        filepath = os.path.join(data_folder, filename)
        
        # Vereinsname aus dem Dateinamen extrahieren (z.B. "verein1.json" -> "verein1")
        club_name = os.path.splitext(filename)[0].replace('_instacomments', '')

        # Öffne die JSON-Datei und lade den Inhalt
        with open(filepath, 'r', encoding='utf-8') as f:
            posts = json.load(f)

            # Falls das JSON eine Liste von Posts ist
            if isinstance(posts, list):
                # Extrahiere die Kommentare aus jedem Post
                for post in posts:
                    if 'comments' in post:
                        for comment in post['comments']:
                            all_comments.append({
                                'verein': club_name,  # Name des Vereins aus der Datei
                                'post': post.get('link', ''),  # Link zum Post (falls vorhanden)
                                'caption': post.get('caption', ''),  # Post-Caption (falls vorhanden)
                                'comment': comment,  # Kommentar-Text
                            })
            # Falls das JSON keine Liste ist (z.B. einzelne Posts)
            elif 'comments' in posts:
                # Extrahiere die Kommentare aus dem einzelnen Post
                for comment in posts['comments']:
                    all_comments.append({
                        'verein': club_name,  # Name des Vereins aus der Datei
                        'post': posts.get('link', ''),  # Link zum Post (falls vorhanden)
                        'caption': posts.get('caption', ''),  # Post-Caption (falls vorhanden)
                        'comment': comment,  # Kommentar-Text
                    })

# Speichern aller Kommentare in einem Pandas DataFrame
df = pd.DataFrame(all_comments)

df = df[~df['comment'].str.contains("Original-Audio", case=False, na=False)]
df = df[~df['comment'].str.contains("•", case=False, na=False)]

df = df.drop_duplicates(subset=['comment'])

df


Unnamed: 0,verein,post,caption,comment
0,staderennesfc,https://www.instagram.com/p/C8e7U9jAc5P/,"Brest puis Nantes, les affiches des derbies po...","On est 1 point devant les relégables, on perd ..."
1,staderennesfc,https://www.instagram.com/p/C8e7U9jAc5P/,"Brest puis Nantes, les affiches des derbies po...",Mm notre équipe amicale est meilleure que vous...
2,staderennesfc,https://www.instagram.com/p/C8e7U9jAc5P/,"Brest puis Nantes, les affiches des derbies po...",Ppl need to stop supporting this match fixing ...
3,staderennesfc,https://www.instagram.com/p/C8e7U9jAc5P/,"Brest puis Nantes, les affiches des derbies po...",Match fixedddd
4,staderennesfc,https://www.instagram.com/p/C8e7U9jAc5P/,"Brest puis Nantes, les affiches des derbies po...",Just investigate you will find lots of very co...
...,...,...,...,...
117884,fcchelsea,https://www.instagram.com/reel/DAePCASsGvm/,"Three points, four goals. 🗣️👊#CHEBHA #CFC #Che...",Innit star boy 🥶
117885,fcchelsea,https://www.instagram.com/reel/DAePCASsGvm/,"Three points, four goals. 🗣️👊#CHEBHA #CFC #Che...",🙌🙌🥶
117886,fcchelsea,https://www.instagram.com/reel/DAePCASsGvm/,"Three points, four goals. 🗣️👊#CHEBHA #CFC #Che...",🪄 💙
117889,fcchelsea,https://www.instagram.com/reel/DAePCASsGvm/,"Three points, four goals. 🗣️👊#CHEBHA #CFC #Che...",love you bro always💙🤍


In [42]:
posts_per_club = df.groupby('verein')['post'].nunique().reset_index()
posts_per_club.columns = ['Verein', 'Anzahl_Posts']

comments_per_club = df.groupby('verein')['comment'].count().reset_index()
comments_per_club.columns = ['Verein', 'Anzahl_Kommentare']

comments_per_club['avg_comments_per_post'] = comments_per_club['Anzahl_Kommentare'] / posts_per_club['Anzahl_Posts']


summary_df = pd.merge(posts_per_club, comments_per_club[['Verein', 'Anzahl_Kommentare', 'avg_comments_per_post']], on='Verein')

summary_df

Unnamed: 0,Verein,Anzahl_Posts,Anzahl_Kommentare,avg_comments_per_post
0,arsenal,239,3063,12.8159
1,avfc,156,2871,18.403846
2,bayer04,240,5863,24.429167
3,bvb,229,3292,14.375546
4,fcbayern,59,4895,82.966102
5,fcchelsea,19,10232,538.526316
6,fch,225,2301,10.226667
7,ipswich,107,1043,9.747664
8,lille,71,1729,24.352113
9,liverpoolfc,126,15148,120.222222


# Data Preprocessing

* language detection for all comments
* select only comments in german, english, french or spanish
* preprocessing:
    1. remove all mentions (e.g. see here @user) in 
    2. switch the emoji to text
    3. lower case

In [12]:
import emoji
import re
from langdetect import detect, DetectorFactory
import spacy
from spacy_langdetect import LanguageDetector

In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
import tensorflow as tf

# Laden des Tokenizers und des TensorFlow-Modells
tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
model = TFAutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")

# Labels des Modells abrufen
labels = model.config.id2label

def detect_language_transformers(text):
    # Tokenisierung des Textes
    inputs = tokenizer(text, return_tensors="tf", truncation=True)
    # Modellvorhersage
    outputs = model(**inputs)
    # Vorhersagewahrscheinlichkeiten
    logits = outputs.logits
    predicted_class_id = tf.argmax(logits, axis=-1).numpy()[0]
    language = labels[predicted_class_id]
    return language

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFXLMRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFXLMRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLMRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForSequenceClassification for predictions without further training.


In [111]:
# Anwenden der Spracherkennungsfunktion
df['language'] = df['comment'].apply(detect_language_transformers)


In [113]:
df.to_csv('comments_with_language.csv')

In [118]:
# Definieren Sie die gewünschten Sprachcodes
allowed_languages = ['de', 'en', 'es', 'fr']

# Filtern des DataFrames
df_allowed_languages = df[df['language'].isin(allowed_languages)].reset_index(drop=True)


In [122]:
language_counts = df_allowed_languages["language"].value_counts()
print(language_counts)

language
en    26755
de    10923
es     6214
fr     2627
Name: count, dtype: int64


In [124]:
df_allowed_languages.to_csv("comments_with_allowed_language.csv")

In [123]:
posts_per_club = df_allowed_languages.groupby('verein')['post'].nunique().reset_index()
posts_per_club.columns = ['Verein', 'Anzahl_Posts']

comments_per_club = df_allowed_languages.groupby('verein')['comment'].count().reset_index()
comments_per_club.columns = ['Verein', 'Anzahl_Kommentare']

comments_per_club['avg_comments_per_post'] = comments_per_club['Anzahl_Kommentare'] / posts_per_club['Anzahl_Posts']


summary_df = pd.merge(posts_per_club, comments_per_club[['Verein', 'Anzahl_Kommentare', 'avg_comments_per_post']], on='Verein')

summary_df

Unnamed: 0,Verein,Anzahl_Posts,Anzahl_Kommentare,avg_comments_per_post
0,arsenal,237,1867,7.877637
1,avfc,156,1979,12.685897
2,bayer04,237,3082,13.004219
3,bvb,224,1866,8.330357
4,fcbayern,59,2771,46.966102
5,fcchelsea,19,6599,347.315789
6,fch,221,1741,7.877828
7,ipswich,105,593,5.647619
8,lille,70,1051,15.014286
9,liverpoolfc,125,8958,71.664


In [18]:
def remove_mentions(text):
    return re.sub(r'@\w+', '', text)

In [19]:
def convert_emojis_to_text(text, lang):
    # Verwenden von Leerzeichen als Trennzeichen
            return emoji.demojize(text, language=lang, delimiters=(" ", " "))

In [20]:
def clean_comment(text):
    text = remove_mentions(text)
    text = convert_emojis_to_text(text)
    return text

In [21]:
def clean_text(text):
    text = text.lower()
    text = text.replace(':', '')
    text = text.replace('_', ' ')
    text = ' '.join(text.split())
    return text

In [22]:
def preprocess_comment(row):
    text = row['comment']
    language = row['language']
    # Entfernen von Mentions
    text = remove_mentions(text)
    # Konvertieren von Emojis
    text = convert_emojis_to_text(text, language)
    # Text bereinigen
    text = clean_text(text)
    return text

In [30]:
preprocessed_df = pd.read_csv('comments_with_allowed_language.csv')

preprocessed_df["comment_cleaned"] = preprocessed_df.apply(preprocess_comment, axis=1)

In [31]:
preprocessed_df

Unnamed: 0.1,Unnamed: 0,verein,post,caption,comment,language,comment_cleaned
0,0,staderennaisfc,https://www.instagram.com/p/C8e7U9jAc5P/,"Brest puis Nantes, les affiches des derbies po...","On est 1 point devant les relégables, on perd ...",fr,"on est 1 point devant les relégables, on perd ..."
1,1,staderennaisfc,https://www.instagram.com/p/C8e7U9jAc5P/,"Brest puis Nantes, les affiches des derbies po...",Mm notre équipe amicale est meilleure que vous...,fr,mm notre équipe amicale est meilleure que vous...
2,2,staderennaisfc,https://www.instagram.com/p/C8e7U9jAc5P/,"Brest puis Nantes, les affiches des derbies po...",Ppl need to stop supporting this match fixing ...,en,ppl need to stop supporting this match fixing ...
3,3,staderennaisfc,https://www.instagram.com/p/C8e7U9jAc5P/,"Brest puis Nantes, les affiches des derbies po...",Match fixedddd,en,match fixedddd
4,4,staderennaisfc,https://www.instagram.com/p/C8e7U9jAc5P/,"Brest puis Nantes, les affiches des derbies po...",Just investigate you will find lots of very co...,en,just investigate you will find lots of very co...
...,...,...,...,...,...,...,...
46514,46514,fcchelsea,https://www.instagram.com/reel/DAePCASsGvm/,"Three points, four goals. 🗣️👊#CHEBHA #CFC #Che...",Bro you are a genius 🥶💙🔥,en,bro you are a genius cold face blue heart fire
46515,46515,fcchelsea,https://www.instagram.com/reel/DAePCASsGvm/,"Three points, four goals. 🗣️👊#CHEBHA #CFC #Che...",This dude caused Earthquake and Volcano Erupti...,en,this dude caused earthquake and volcano erupti...
46516,46516,fcchelsea,https://www.instagram.com/reel/DAePCASsGvm/,"Three points, four goals. 🗣️👊#CHEBHA #CFC #Che...",Our fellow African bro you're genius 🔥🔥🔥,en,our fellow african bro you're genius fire fire...
46517,46517,fcchelsea,https://www.instagram.com/reel/DAePCASsGvm/,"Three points, four goals. 🗣️👊#CHEBHA #CFC #Che...",love you bro always💙🤍,en,love you bro always blue heart white heart


In [45]:
import json
import unidecode
from rapidfuzz import process, fuzz
from collections import defaultdict

# Laden der Vereine und Spieler aus Ihrer JSON-Datei
with open('player.json', 'r', encoding='utf-8') as f:
    clubs_data = json.load(f)

def preprocess_name(name):
    if not isinstance(name, str):
        print(f"Warnung: Erwarteter String, aber erhalten: {type(name)}")
        return ''
    name = unidecode.unidecode(name)
    name = name.lower()
    name = ''.join(e for e in name if e.isalnum() or e.isspace())
    return name

clubs_players_processed = {}

for club_entry in clubs_data["clubs"]:
    club_name = club_entry['club_name']
    players = club_entry['players']
    preprocessed_players = [preprocess_name(player) for player in players]
    clubs_players_processed[club_name.lower()] = preprocessed_players


def preprocess_comment(comment):
    if not isinstance(comment, str):
        print(f"Warnung: Erwarteter String, aber erhalten: {type(comment)}")
        return ''
    comment = unidecode.unidecode(comment)
    comment = ''.join(e for e in comment if e.isalnum() or e.isspace())
    return comment

#preprocessed_df['comment_processed'] = preprocessed_df['comment_cleaned'].apply(preprocess_comment)
df['comment_processed'] = df['comment'].apply(preprocess_comment)

def find_players_in_comment(comment, player_names, threshold=80):
    matches = process.extract(
        comment, player_names, scorer=fuzz.partial_ratio, score_cutoff=threshold
    )
    return [match[0] for match in matches]

def assign_comments_to_players(df, clubs_players):
    results = {'clubs': []}
    
    # Gruppieren des DataFrames nach Verein
    grouped = df.groupby('verein')
    
    for club_name, group in grouped:
        club_name_lower = club_name.lower()
        if club_name_lower in clubs_players:
            player_names = clubs_players[club_name_lower]
            # Finden der Originalspieler
            original_players = next(
                (club['players'] for club in clubs_data['clubs'] if club['club_name'].lower() == club_name_lower),
                []
            )
            club_result = {
                'club_name': club_name,
                'players': []
            }
            player_comments_dict = defaultdict(list)
            
            for index, row in group.iterrows():
                #comment = row['comment_cleaned']
                comment = row['comment']
                comment_processed = row['comment_processed']
                
                # Spieler im Kommentar finden
                matched_players = find_players_in_comment(comment_processed, player_names)
                
                # Kommentare den Spielern zuordnen
                for player in matched_players:
                    # Finden des Originalspielernamens
                    idx = player_names.index(player)
                    original_player_name = original_players[idx]
                    player_comments_dict[original_player_name].append(comment)
            
            # Spieler und ihre Kommentare hinzufügen
            for player_name, comments in player_comments_dict.items():
                club_result['players'].append({
                    'player_name': player_name,
                    'comments': comments
                })
            
            results['clubs'].append(club_result)
    
    return results

# Anwenden der Funktion
results = assign_comments_to_players(df, clubs_players_processed)

# Speichern der Ergebnisse in einer JSON-Datei
with open('player_comments.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

# Ausgabe zur Überprüfung
print(json.dumps(results, ensure_ascii=False, indent=4))

{
    "clubs": [
        {
            "club_name": "arsenal",
            "players": [
                {
                    "player_name": "Ben White",
                    "comments": [
                        "Happy Birthday Ben White❤️",
                        "ben🎉💯",
                        "Happy birthday Ben white we actually share birthday mehn",
                        "More life Benny white white white 😍😍🔥",
                        "Ben white I love you ❤️😘",
                        "Happy birthday Mr white...❤️❤️❤️",
                        "Happy birthday Benjamin white 👏👏👏",
                        "Oh Benny my most favorite Arsenal player, just does the job and moves on. Oh I love you incredible Ben white ❤️❤️❤️❤️",
                        "تاااا",
                        "🦁 💪🏾",
                        "🔥🔥🔥 🔥🔥🔥",
                        "Where is white",
                        "👏👏👏👏👏 🇧🇷🇧🇷🇧🇷🇧🇷",
                        "هع🔥🔥",
                        "🔥 🔥🔥🔥"
          

# NER

In [5]:
import spacy

# Laden des mehrsprachigen Modells
nlp = spacy.load('xx_ent_wiki_sm')  # Einfaches mehrsprachiges Modell

def extract_person_entities(text):
    doc = nlp(text)
    return [ent.text.lower() for ent in doc.ents if ent.label_ == 'PER']

example = "cole palmer, there is nothing he can not do face with tears of joy blue heart"

In [25]:
# Filtere die Zeilen, bei denen die caption das Keyword enthält
keywords = ['Cole Palmer']
'''
filtered_df = df[
    (df['verein'] == 'vfb') &
    (df['caption'].str.contains('|'.join(keywords), case=False, na=False) |
    df['comment'].str.contains('|'.join(keywords), case=False, na=False))
    ]
'''
filtered_df = preprocessed_df[
    (preprocessed_df['verein'] == 'fcchelsea') &
    preprocessed_df['comment_cleaned'].str.contains('|'.join(keywords), case=False, na=False)
    ]

filtered_df

# Extrahiere die Kommentare als Liste
comments_list = filtered_df['comment_cleaned'].tolist()

comments_list

['congratulations cole palmer, do it again, again, again, again snowflake snowflake snowflake',
 'it’s the way cole palmer drifted into midfield position from an attacking position to make this sublime pass for me.. if you watched the earlier build up play. his football iq is on hundred points check mark button saluting face',
 'thank you cole palmer for restoring the pride of london! cold face blue heart clapping hands dark skin tone i am beginning to recognise my club again!!!',
 'cole palmer.. the next eh 10',
 'brilliant cole palmer cold face blue heart blue heart',
 'nj is a fantastic footballer. his runs and movements is awesome. cole palmer legend',
 'damn cole palmer would be winning the best player award if not for people pulling him down',
 'this boy cole palmer is so cool cold face fire',
 'cole palmer - a delight to watch!! blue heart blue heart blue heart',
 'am i the only one smiling smiling face with smiling eyes star-struck while looking at this pass from cole palmer fa

In [26]:
for comment in comments_list:
    print(extract_person_entities(comment))

['congratulations cole palmer']
['mark button']
[]
['cole palmer']
['cole palmer']
['cole palmer']
[]
[]
['cole palmer - a delight']
[]
[]
['cole palmer', 'jackson']
[]
['jackson', 'jackson']
[]
['cole palmer']
[]
[]
['cole palmer', 'russell']
[]
['cole palmer']
['cole palmer', 'mark button']
[]
[]
[]
[]
[]
['enzo maresca~', 'christopher']
['enzo maresca']
[]
[]
['cole palmer']
['enzo maresca']
['enzo maresca']
[]
['cole palmer']
['cole palmer', 'alonso']
[]
[]
['cole palmer']
['admin']
[]
[]
[]
[]
['palmer', 'palmer', 'palmer', 'palmer']
['kevin de bruyne']
[]
[]
[]
[]
['kevin de bruyne']
['congratulations cole palmer', 'cristiano ronaldo', 'lionel messi']
['cole palmer']
['palmer']
['cole']
[]
['cole palmer']
['cole palmer']
[]
[]
[]
['jackson', 'palmer']
['cole palmer']
[]
['cole palmer']
['life!!”)the joy']
['cole palmer']
[]
['cook', 'skin tone', 'palmer', 'palmer', 'palmer', 'palmer']
[]
['life!!”)the joy']
[]
['cole palmer']
['cole palmer']
['enzo maresca']
['peter drury']
[]
['

# Sentiment Analysis

In [138]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

In [37]:
palmer_comments = ["Cole palmer is like MVP Russell westbrook - love him though", "Cole Palmer is a good boy", "Palmer is shit ass", "Palmer is absolutely average"]

In [145]:
# Loop through the texts and get the sentiment scores for each one
for text in comments_list:
    scores = analyzer.polarity_scores(text)
    print(text)
    print(scores)

congratulations cole palmer, do it again, again, again, again snowflake snowflake snowflake
{'neg': 0.0, 'neu': 0.738, 'pos': 0.262, 'compound': 0.5994}
it’s the way cole palmer drifted into midfield position from an attacking position to make this sublime pass for me.. if you watched the earlier build up play. his football iq is on hundred points check mark button saluting face
{'neg': 0.069, 'neu': 0.876, 'pos': 0.055, 'compound': -0.1531}
thank you cole palmer for restoring the pride of london! cold face blue heart clapping hands dark skin tone i am beginning to recognise my club again!!!
{'neg': 0.0, 'neu': 0.736, 'pos': 0.264, 'compound': 0.8057}
cole palmer.. the next eh 10
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
brilliant cole palmer cold face blue heart blue heart
{'neg': 0.0, 'neu': 0.678, 'pos': 0.322, 'compound': 0.5859}
nj is a fantastic footballer. his runs and movements is awesome. cole palmer legend
{'neg': 0.0, 'neu': 0.588, 'pos': 0.412, 'compound': 0.827

In [96]:
from transformers import pipeline
import tensorflow

In [40]:
# create pipeline for sentiment analysis
classification = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [146]:
sentiment_analyis = []
for comment in comments_list:
    classify = classification(comment)
    sentiment_comment = {
        "comment": comment,
        "sentiment-analyis": classify
    }
    sentiment_analyis.append(sentiment_comment)

In [147]:
print(sentiment_analyis)

[{'comment': 'congratulations cole palmer, do it again, again, again, again snowflake snowflake snowflake', 'sentiment-analyis': [{'label': 'POSITIVE', 'score': 0.999714195728302}]}, {'comment': 'it’s the way cole palmer drifted into midfield position from an attacking position to make this sublime pass for me.. if you watched the earlier build up play. his football iq is on hundred points check mark button saluting face', 'sentiment-analyis': [{'label': 'NEGATIVE', 'score': 0.9151161313056946}]}, {'comment': 'thank you cole palmer for restoring the pride of london! cold face blue heart clapping hands dark skin tone i am beginning to recognise my club again!!!', 'sentiment-analyis': [{'label': 'POSITIVE', 'score': 0.9995802044868469}]}, {'comment': 'cole palmer.. the next eh 10', 'sentiment-analyis': [{'label': 'POSITIVE', 'score': 0.9084388613700867}]}, {'comment': 'brilliant cole palmer cold face blue heart blue heart', 'sentiment-analyis': [{'label': 'POSITIVE', 'score': 0.983522832