# Sentiment Analysis to Files

## Setup

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

plt.style.use('ggplot')

import nltk

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
import math

  from .autonotebook import tqdm as notebook_tqdm


## Load Model

In [None]:
MODEL = 'cardiffnlp/twitter-roberta-base-sentiment'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=3)

## Sentiment prediction function

In [85]:
def sent_predict(text):
  if isinstance(text, str) and text.strip():
    while True:
      if len(text):
        try:
          encoded_text = tokenizer(
              text, 
              return_tensors='pt', 
              max_length=514,  # Model's expected input length
              truncation=True  # Via truncation
          )
          output = model(**encoded_text)
          scores = output[0][0].detach().numpy()
          scores = softmax(scores)
          scores_dict = {
              'roberta_neg': round(float(scores[0]), 4),
              'roberta_neu': round(float(scores[1]), 4),
              'roberta_pos': round(float(scores[2]), 4)
          }
          break
        except:
          text = " ".join(text.split()[:-1])
      else:
        scores_dict = {
          'roberta_neg': 0,
          'roberta_neu': 0,
          'roberta_pos': 0
          }
  else:
    scores_dict = {
      'roberta_neg': 0,
      'roberta_neu': 0,
      'roberta_pos': 0
      }
    
  return scores_dict['roberta_neg'], scores_dict['roberta_neu'], scores_dict['roberta_pos']

def apply_sentiment_analysis(df, column_name):
  rows_analyzed = 0
  df.dropna(subset=column_name, inplace=True)
  rows_to_analyze = len(df)

  def sentiment_with_counter(row):
    nonlocal rows_analyzed
    rows_analyzed += 1
    if rows_analyzed == int(rows_to_analyze*0.25):
      print(f'Sentiment Analysis 25% done: {rows_analyzed}/{rows_to_analyze} analyzed')
    elif rows_analyzed == int(rows_to_analyze*0.5):
      print(f'Sentiment Analysis 50% done: {rows_analyzed}/{rows_to_analyze} analyzed')
    elif rows_analyzed == int(rows_to_analyze*0.75):
      print(f'Sentiment Analysis 75% done: {rows_analyzed}/{rows_to_analyze} analyzed')

    return sent_predict(row[column_name])

  print(f'Analyzing {rows_to_analyze}...')
  results = df.apply(sentiment_with_counter, axis=1)
  df['sent_neg'] = [x[0] for x in results]
  df['sent_neu'] = [x[1] for x in results]
  df['sent_pos'] = [x[2] for x in results]
  print(f'Sentiment Analysis completed: {rows_analyzed}/{rows_to_analyze} analyzed')

In [7]:
import os
import pandas as pd
path = 'datasets'
for dir in ['youtube-database-1', 'youtube-database-2', 'youtube-database-3']:
    new_path = os.path.join(path, dir)
    # new_path = os.path.join(path, 'youtube-database-2')
    # new_path = os.path.join(path, 'youtube-database-3')

    texts_total = 0

    for file in os.listdir(new_path):
        file_path = os.path.join(new_path, file)
        # print(file.split('-')[0])
        df_file = pd.read_csv(file_path)
        texts_total += df_file.shape[0]

print(texts_total)

139579


## Main function

In [None]:
# new_path = os.path.join(path, 'youtube-database-1')
# new_path = os.path.join(path, 'youtube-database-2')
# new_path = os.path.join(path, 'youtube-database-3')
new_path = 'test-dir'

file_count = 0

for file in os.listdir(new_path):
  file_path = os.path.join(new_path, file)
  print(file.split('-')[0])
  df_file = pd.read_csv(file_path)

  if df_file.size and (
                      'sentiment_neg' not in df_file.columns or
                      'sentiment_neu' not in df_file.columns or
                      'sentiment_pos' not in df_file.columns
                      ):

    # df_file['translation_en'] = df_file.apply(lambda row: translate_text(row['comment'], row['lang']), axis=1)
    apply_sentiment_analysis(df_file, 'translation_en')
    print('File labeled')

    df_file.to_csv(file_path, index=False)
    print(f'File {file} replaced')
    file_count += 1
    print(f'{file_count} files checked')

  else:
    print('Already analyzed file')
    file_count += 1
    print(f'{file_count} files checked')
    continue

## Sentiment processing

In [3]:
def sent_definition(neg, neu, pos):
    elements = (neg, neu, pos)
    if elements[0] < elements[2]:
        if elements[2] >= round(elements[1] - 0.3, 4):
            sentiment_res = 2
        else:
            sentiment_res = 1
    else:
        if elements[0] >= round(elements[1] - 0.2, 4):
            sentiment_res = 0
        else:
            sentiment_res = 1
    
    return sentiment_res

In [4]:
df = pd.read_csv('datasets/youtube-database-1/165219-comments.csv')
df['sent_res'] = df.apply(lambda row: sent_definition(row['sent_neg'], row['sent_neu'], row['sent_pos']), axis=1)
avg_sent = np.mean(df['sent_res'])
print(avg_sent)

1.5333333333333334


### Sentiment assigning

Updates the files with the sentiment criteria defined by the threshold:
- 0.2 for negative-neutral
- 0.3 for positive-neutral

In [6]:
path = 'datasets'

titles_df = pd.DataFrame([], columns=['id', 'titles_1', 'titles_2', 'titles_3'])
comments_df = pd.DataFrame([], columns=['id', 'comments_1', 'comments_2', 'comments_3'])

for dir in ('youtube-database-1', 'youtube-database-2', 'youtube-database-3'):
    
    new_path = os.path.join(path, dir)

    file_count = 0

    for file in os.listdir(new_path):
        file_path = os.path.join(new_path, file)
        df = pd.read_csv(file_path, index_col=0)
        if len(df):
            df['sent_res'] = df.apply(lambda row: sent_definition(row['sent_neg'], row['sent_neu'], row['sent_pos']), axis=1)
            df.to_csv(file_path)
            file_count += 1
            print(f'files modified: {file_count}')
        else:
            file_count += 1
            print(f'files modified: {file_count}')
        


files modified: 1
files modified: 2
files modified: 3
files modified: 4
files modified: 5
files modified: 6
files modified: 7
files modified: 8
files modified: 9
files modified: 10
files modified: 11
files modified: 12
files modified: 13
files modified: 14
files modified: 15
files modified: 16
files modified: 17
files modified: 18
files modified: 19
files modified: 20
files modified: 21
files modified: 22
files modified: 23
files modified: 24
files modified: 25
files modified: 26
files modified: 27
files modified: 28
files modified: 29
files modified: 30
files modified: 31
files modified: 32
files modified: 33
files modified: 34
files modified: 35
files modified: 36
files modified: 37
files modified: 38
files modified: 39
files modified: 40
files modified: 41
files modified: 42
files modified: 43
files modified: 44
files modified: 45
files modified: 46
files modified: 47
files modified: 48
files modified: 49
files modified: 50
files modified: 51
files modified: 52
files modified: 53
fi

### Average sentiment and player dataframe building

In [7]:
path = 'datasets'

titles_1 = pd.DataFrame([], columns=['id', 'titles_1'])
comments_1 = pd.DataFrame([], columns=['id', 'comments_1'])

titles_2 = pd.DataFrame([], columns=['id', 'titles_2'])
comments_2 = pd.DataFrame([], columns=['id', 'comments_2'])

titles_3 = pd.DataFrame([], columns=['id', 'titles_3'])
comments_3 = pd.DataFrame([], columns=['id', 'comments_3'])

for dir in ('youtube-database-1', 'youtube-database-2', 'youtube-database-3'):

    title_id = []
    title_sent = []
    num_title_sent = []
    title_views = []

    comment_id = []
    comment_sent = []
    num_comment_sent = []
    

    inst = dir[-1]

    new_path = os.path.join(path, dir)

    file_count = 0

    for file in os.listdir(new_path):
        file_path = os.path.join(new_path, file)
        df = pd.read_csv(file_path, index_col=0)
        if 'title' in file:
            if len(df):
                title_id.append(file.split('-')[0])
                title_sent.append(np.mean(df['sent_res']))
                num_title_sent.append(len(df))
                title_views.append(np.sum(df['view_count']))
            else:
                title_id.append(file.split('-')[0])
                title_sent.append(1)
                num_title_sent.append(0)
                title_views.append(0)

        else:
            if len(df):
                comment_id.append(file.split('-')[0])
                comment_sent.append(np.mean(df['sent_res']))
                num_comment_sent.append(len(df))
            else:
                comment_id.append(file.split('-')[0])
                comment_sent.append(1)
                num_comment_sent.append(0)

    if inst == '1':
        titles_1['id'] = title_id
        titles_1[f'titles_{inst}'] = title_sent
        titles_1[f'num_titles_{inst}'] = num_title_sent
        titles_1[f'views_titles_{inst}'] = title_views

        comments_1['id'] = comment_id
        comments_1[f'comments_{inst}'] = comment_sent
        comments_1[f'num_comments_{inst}'] = num_comment_sent
    elif inst == '2':
        titles_2['id'] = title_id
        titles_2[f'titles_{inst}'] = title_sent
        titles_2[f'num_titles_{inst}'] = num_title_sent
        titles_2[f'views_titles_{inst}'] = title_views

        comments_2['id'] = comment_id
        comments_2[f'comments_{inst}'] = comment_sent
        comments_2[f'num_comments_{inst}'] = num_comment_sent

    elif inst == '3':
        titles_3['id'] = title_id
        titles_3[f'titles_{inst}'] = title_sent
        titles_3[f'num_titles_{inst}'] = num_title_sent
        titles_3[f'views_titles_{inst}'] = title_views

        comments_3['id'] = comment_id
        comments_3[f'comments_{inst}'] = comment_sent
        comments_3[f'num_comments_{inst}'] = num_comment_sent

        

In [8]:
titles_1

Unnamed: 0,id,titles_1,num_titles_1,views_titles_1
0,198264,1.000000,23,60349.0
1,200141,1.032258,31,157147.0
2,211857,1.000000,18,645245.0
3,210915,1.000000,8,38075.0
4,192970,1.125000,32,177153.0
...,...,...,...,...
1139,212198,1.076923,26,728415.0
1140,195873,1.000000,10,18524.0
1141,193896,1.041667,24,4862323.0
1142,196889,1.076923,39,165732.0


In [25]:
titles_merged = titles_1.merge(titles_2, on='id', how='outer').merge(titles_3, on='id', how='outer')
titles_merged.rename(columns={'id': 'sf_id'}, inplace=True)
titles_merged['sf_id'] = titles_merged['sf_id'].astype('int64')
comments_merged = comments_1.merge(comments_2, on='id', how='outer').merge(comments_3, on='id', how='outer')
comments_merged.rename(columns={'id': 'sf_id'}, inplace=True)
comments_merged['sf_id'] = comments_merged['sf_id'].astype('int64')

In [26]:
titles_merged.head()

Unnamed: 0,sf_id,titles_1,num_titles_1,views_titles_1,titles_2,num_titles_2,views_titles_2,titles_3,num_titles_3,views_titles_3
0,165219,0.875,16,420671.0,1.027778,36,163148.0,1.090909,11,28562
1,167841,1.051282,39,2439429.0,1.190476,42,482776.0,1.302326,43,1780152
2,168984,1.0,10,20671.0,1.142857,14,58032.0,1.1,10,29492
3,169708,1.142857,35,87765.0,1.368421,38,190140.0,1.285714,42,258065
4,169710,1.130435,23,203131.0,0.833333,6,8422.0,0.875,8,27655


In [27]:
comments_merged.head()

Unnamed: 0,sf_id,comments_1,num_comments_1,comments_2,num_comments_2,comments_3,num_comments_3
0,165219,1.533333,15,1.363636,198,1.0,9
1,167841,1.14,100,1.065,200,1.051948,231
2,168984,1.214286,14,1.714286,7,1.666667,3
3,169708,0.875,192,1.305263,95,1.342857,105
4,169710,1.385965,57,2.0,2,1.111111,9


In [28]:
titles_merged.dtypes

sf_id               int64
titles_1          float64
num_titles_1        int64
views_titles_1    float64
titles_2          float64
num_titles_2        int64
views_titles_2    float64
titles_3          float64
num_titles_3        int64
views_titles_3      int64
dtype: object

In [33]:
df_players = pd.read_csv('player-dataset.csv')
df_players = df_players.merge(titles_merged, on='sf_id', how='outer')
df_players = df_players.merge(comments_merged, on='sf_id', how='outer')
df_players.head()

Unnamed: 0,player_x,player_link_fb,Nation,Lg,Pos,team,age_x,mp_0,mp_1,mp_2,...,views_titles_2,titles_3,num_titles_3,views_titles_3,comments_1,num_comments_1,comments_2,num_comments_2,comments_3,num_comments_3
0,jack hobbs,/en/players/5598b655/Jack-Hobbs,eng ENG,ENG2,DF,hull city,23.0,22.0,27.0,17.0,...,163148.0,1.090909,11,28562,1.533333,15,1.363636,198,1.0,9
1,chris maguire,/en/players/f5a71789/Chris-Maguire,sct SCO,ENG2,"FW,MF",sheffield wednesday,23.0,10.0,2.0,42.0,...,482776.0,1.302326,43,1780152,1.14,100,1.065,200,1.051948,231
2,ismael bangoura,/en/players/a15bd868/Ismael-Bangoura,gn GUI,ITA2,DF,cesena,17.0,1.0,2.0,,...,58032.0,1.1,10,29492,1.214286,14,1.714286,7,1.666667,3
3,sam hutchinson,/en/players/800a3404/Sam-Hutchinson,eng ENG,ENG2,"DF,MF",nottingham forest,22.0,9.0,1.0,20.0,...,190140.0,1.285714,42,258065,0.875,192,1.305263,95,1.342857,105
4,liam bridcutt,/en/players/d6066695/Liam-Bridcutt,sct SCO,ENG2,"DF,MF",brighton & hove albion,23.0,41.0,11.0,18.0,...,8422.0,0.875,8,27655,1.385965,57,2.0,2,1.111111,9


In [34]:
df_players.to_csv('player-dataset-sent.csv')