## Data Preprocessing

In this notebook, I will reshape data to make if appropriate to perform social network analysis.

### Import libraries

#### Basic libraries

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import math
from collections import Counter
import gzip
import re
import warnings
warnings.simplefilter("ignore")

#### Network analysis and Network data visualization

In [2]:
import networkx as nx
import itertools
import os
from pyvis.network import Network

#### Profane language detection library

In [3]:
import sklearn.externals as extjoblib
import joblib
from profanity_check import predict, predict_prob

### Load data

In [4]:
df = pd.read_csv('../data/chat_preprocessed.csv')
df

Unnamed: 0,match_id,unit,pf
0,0,6k Slayer,1
1,0,Double T,1
2,0,Kira,0
3,0,Monkey,1
4,0,Trash!!!,0
...,...,...,...
303878,49771,DamN.Sakura_Jr,0
303879,49771,Dante,0
303880,49771,PINAGPALA,1
303881,49771,StopDancing,0


## Find users who played more than 10 matches

### Player's number of match `id` : `number of matches played`

In [5]:
id_count = Counter(df['unit'])
id_count

Counter({'6k Slayer': 1,
         'Double T': 2,
         'Kira': 25,
         'Monkey': 12,
         'Trash!!!': 1,
         'u didnt see who highest here?': 1,
         'ｔｏｍｉａ～♥': 1,
         "Apin's": 1,
         'Buk lau': 1,
         'D o r a': 1,
         'DA BOSS': 3,
         'Titus': 7,
         'stormX---': 1,
         'Anonymous': 30,
         'BlackJack': 3,
         'Grötenelch': 1,
         'KaLixz': 1,
         'Star': 4,
         'WtW|Tweedle Dee & Tweedle Dum': 1,
         'Ａｌｉｃｅ': 1,
         'AresStar®': 1,
         'Arhey': 1,
         'Arsich': 1,
         'Fuck_Off': 4,
         'Matt': 21,
         'Porucznik Ku®wa': 5,
         'ahohuhahohu': 1,
         '¡No pasarán!': 1,
         'Desolation': 5,
         'FREAKGANG.haa': 1,
         'SIMBA': 1,
         'iceiceBABE': 1,
         '$uƒfer': 2,
         'Xiiao_Hung_': 2,
         'stadin.xxx': 1,
         'Ｌｕｉｓ': 2,
         'Faint': 2,
         'Jim Jim': 1,
         'Ravenclaw': 2,
         'qwewqewqewqewq': 1

#### Players and the number of matches they played

In [6]:
count_by_id = pd.DataFrame.from_dict(id_count, orient = 'index', columns = ['count']).reset_index()
count_by_id

Unnamed: 0,index,count
0,6k Slayer,1
1,Double T,2
2,Kira,25
3,Monkey,12
4,Trash!!!,1
...,...,...
177985,kabutee,1
177986,DamN.Pussy_Jr,1
177987,DamN.Sakura_Jr,1
177988,PINAGPALA,1


#### Players who played more than 10 matches

In [7]:
freq_user_df = count_by_id.loc[count_by_id['count'] > 10]
freq_user_df

Unnamed: 0,index,count
2,Kira,25
3,Monkey,12
13,Anonymous,30
24,Matt,21
45,melissachan♥,11
...,...,...
130782,MinDFirE,14
131143,Alright,11
132391,FraNigiri,12
135655,[V].Trying to walk away,11


In [8]:
freq_user_id = freq_user_df['index'].tolist()
freq_user_id

['Kira',
 'Monkey',
 'Anonymous',
 'Matt',
 'melissachan♥',
 'Kenny',
 'Doug Carl',
 'KingJonesGaming',
 'Max Tryhard',
 'bourgeoisie',
 'Joe',
 'Hades',
 'ебать',
 'Mithrandir',
 'AA',
 'Bruce Wayne',
 '†',
 'Kai',
 'shibui',
 'Maiku Mori',
 'ALWAYSWANNADIE',
 'GGWP',
 'BrianCooper1',
 'Kappa',
 'Bob Ross',
 'Audible Chocolate',
 'Saitama',
 '+25',
 'Fiddler',
 'Karl',
 'Light',
 'carry please ty',
 '???',
 'Banana',
 'Style',
 'Zzz',
 '?????',
 'Ez',
 'Miracle-',
 'Syndrome',
 'Vern Vandamn',
 '4Head',
 '『\u3000\u3000』',
 'O.o',
 'The Boy',
 'Doro',
 'Lurgo',
 'Jesus',
 'Rick',
 'Bob',
 'Tom Ganks',
 'Boss',
 'Flower',
 'Nero',
 'Sn0_Man',
 "What's Poppin'",
 'Player',
 'Eagle',
 'Caped Baldy',
 'XIII',
 '^_^',
 'D',
 '=)',
 'KITTY HAS TO GO',
 'Pourosis',
 '????',
 'Ace',
 'Nate Higgers',
 'confessor',
 'grillz',
 '._.',
 'feeder',
 'Axosh',
 '<><',
 'snoopRat',
 '< blank >',
 '-.-',
 'Hey b0ss',
 ':D',
 'John Cena',
 'Zero',
 '123',
 'art',
 'MinMorts',
 'Skodak',
 'Fred',
 'Lumos'

## Select chats and matches of frequently played players 

In [9]:
chat = pd.read_csv('../data/chat_original.csv')

In [10]:
chat.dropna(inplace = True)

### Select chats of frequently players

In [11]:
chat_freq = chat[chat['unit'].isin(freq_user_id)]

In [12]:
chat_freq.nunique()

match_id    24503
key         93923
slot           10
time         5001
unit         1807
dtype: int64

There are 24503 matches, 93293 chat, and 1807 players. 

#### Profanity Check

In [13]:
chat_freq.isnull().sum()

match_id    0
key         0
slot        0
time        0
unit        0
dtype: int64

In [14]:
chat_freq['pf'] = predict(chat_freq['key'])

### Replace match_id with unique values

Reassign match id by order

In [15]:
unique_match_id = list(range(chat_freq['match_id'].nunique()))

Unique match id list

In [16]:
freq_id_match_list = list(chat_freq['match_id'].drop_duplicates())

Overwrite match id

In [17]:
res = dict(zip(freq_id_match_list, unique_match_id))

In [18]:
chat_freq['match_id'] = chat_freq['match_id'].map(res)

## Reshape Dataframe

### Select columns required for analysis

In [19]:
df_freq = chat_freq[['unit', 'pf', 'match_id', 'time']]

### Restructure dataframe

In [20]:
df_freq = df_freq.groupby(['match_id', 'unit']).sum()

### Reset Index

In [21]:
df_freq = df_freq.reset_index()

## Function for restructurizing dataframe for SNA: Need source, receiver, and weight

### Reshape data: 
* Source(those who spoke abusive language), 
* Target(those who listen to abusive language during the match), 
* pf(sum of the abusive language produced during the match), 
* match_id_val(match_id)

In [22]:
df_freq_draft = []

for match_id in freq_id_match_list:
    df_by_match_id = df_freq[df_freq['match_id'] == match_id]
    receiver = df_by_match_id['unit'].to_list()
    source = df_by_match_id.loc[df_by_match_id['pf'] >= 1]['unit']
    pf = df_by_match_id.loc[df_by_match_id['pf'] >= 1]['pf']
    match_id_val = df_by_match_id['match_id']

    
    
    if len(df_by_match_id) != 0:
        source_val = list(np.repeat(source, len(receiver)))
        pf_val = list(np.repeat(pf, len(receiver)))

    source_list = list(zip(source_val, receiver, pf_val, match_id_val))
    
    df_freq_draft.extend(source_list)


In [23]:
df_freq_draft

[('Monkey', 'Kira', 1, 0),
 ('Monkey', 'Monkey', 1, 0),
 ('Joe', 'Joe', 2, 7),
 ('Kappa', 'Kappa', 1, 18),
 ('4Head', '4Head', 1, 40),
 ('feeder', '._.', 1, 71),
 ('feeder', 'feeder', 1, 71),
 ('Axosh', 'Axosh', 4, 77),
 (':D', ':D', 2, 82),
 ('Miracle-', '4Head', 1, 86),
 ('Miracle-', 'Miracle-', 1, 86),
 ('Fluffy', 'Fluffy', 1, 121),
 ('Fluffy', 'Horse', 1, 121),
 ('Banyu', 'Banyu', 2, 132),
 ('Casto', 'Casto', 1, 134),
 ('Arbi', 'Arbi', 1, 144),
 ('Arbi', 'Chill', 1, 144),
 ('Arbi', 'Psycho', 1, 144),
 ('Nuker', 'Nuker', 4, 178),
 ('Stallon', 'Reaper', 1, 179),
 ('Stallon', 'Stallon', 1, 179),
 ('TINKLES', 'TINKLES', 1, 180),
 ('@hiro', '@hiro', 1, 187),
 ('#ano.Ainsley Kek', '#ano.Ainsley Kek', 1, 204),
 ('#ano.Ainsley Kek', 'Dictionaries', 1, 204),
 ('Jesus', 'Jesus', 6, 207),
 ('Ho랑이', 'Ho랑이', 1, 208),
 ('Ho랑이', 'XXX', 1, 208),
 ('Tensai >.<', 'Tensai >.<', 4, 239),
 ('Hyy', '...', 1, 240),
 ('Hyy', 'Hyy', 1, 240),
 ('Hyy', 'Lặng Lẽ Tổn Thương', 1, 240),
 ('Hyy', 'Sleeping_[D]og'

### Reshape into dataframe

In [24]:
freq_user_df = pd.DataFrame(df_freq_draft, columns = ['source', 'target', 'weight', 'match'])

### Delete data with the same source and target

In [25]:
freq_user_df = freq_user_df.loc[freq_user_df['source']!= freq_user_df['target']]

#### Add column `type`, the type of network

In [26]:
freq_user_df['type'] = 'Directed'

In [27]:
freq_user_df.to_csv('../data/chat_sna_shaped.csv', index = False)