# Data Cleaning and Exploratory Analysis

### Import libraries

#### Basic libraries

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import math
from collections import Counter
import gzip
import re
import warnings
warnings.simplefilter("ignore")

#### Profane language detection libraries

In [3]:
import sklearn.externals as extjoblib
import joblib
from profanity_check import predict, predict_prob

### Load data

In [4]:
# Load the raw chat log from a path relative to the current working directory.
chat = pd.read_csv('../data/chat_original.csv')
# Bare expression so the notebook renders a preview of the frame.
chat

Unnamed: 0,match_id,key,slot,time,unit
0,0,force it,6,-8,6k Slayer
1,0,space created,1,5,Monkey
2,0,hah,1,6,Monkey
3,0,ez 500,6,9,6k Slayer
4,0,mvp ulti,4,934,Kira
...,...,...,...,...,...
1439483,49999,ez,8,2796,DamN.Pussy_Jr
1439484,49999,ya ya so ezx,1,2796,StopDancing
1439485,49999,ez,8,2797,DamN.Pussy_Jr
1439486,49999,hahaha,1,2798,StopDancing


##### Missing data

In [5]:
# Per-column count of missing values in the raw chat log.
chat.isnull().sum()

match_id     0
key         14
slot         0
time         0
unit        42
dtype: int64

In [6]:
# Drop, in place, every row that has a missing value in any column.
chat.dropna(inplace = True)

#### EDA

##### Matches, Players, Chat logs

In [7]:
# Number of distinct match ids and distinct `unit` values (player display
# names) in the chat log. dropna=False mirrors pd.unique, which keeps NaN
# as a value of its own.
match = chat['match_id'].nunique(dropna=False)
players = chat['unit'].nunique(dropna=False)

In [8]:
# Summarise dataset size: distinct matches, distinct `unit` values, and the
# number of chat rows currently in `chat`.
print(f'{match} matches, {players} players, {chat.shape[0]} chat logs')

49772 matches, 177990 players, 1439432 chat logs


#### Abusive language usage detection: Chat

##### Created a column `pf` for profanity words

In [9]:
# Run profanity_check's `predict` over every chat message and store its
# per-row output in a new `pf` column (presumably 1 = profane, 0 = clean,
# given the later `pf == 1` filters; confirm against the library docs).
chat['pf'] = predict(chat['key'])

##### Created a new dataframe `df` for organizing dataset

In [10]:
# Keep only the columns needed for the per-player / per-match profanity stats.
df = chat.loc[:, ['unit', 'pf', 'match_id']]

In [11]:
# Number of distinct matches that contain at least one flagged chat line.
foul_lang = df.loc[df['pf'] == 1, 'match_id'].nunique(dropna=False)

#### Matches with foul language

In [12]:
# Share of matches in which at least one chat line was flagged as abusive.
# `foul_lang` counts distinct matches having any `pf == 1` row, so the message
# says "at least one" (not "more than one") and carries the percent sign.
print(f'{round(foul_lang / match * 100, 2)}% of matches included at least one abusive message')


59.95 of the match included more than one abusive langauge


#### n% of chats with foul language

In [13]:
# Percentage of individual chat rows flagged in `pf` (sum of `pf` divided by
# the row count).
print(f'{round((chat.pf.sum()/chat.shape[0])*100, 2)}% of chats included abusive langauge')

7.09% of chats included abusive langauge


#### Players who used abusive language

In [14]:
# Number of distinct `unit` values with at least one flagged chat line.
players_abusive = df.loc[df['pf'] == 1, 'unit'].nunique(dropna=False)

In [15]:
# Percentage of distinct `unit` values that have at least one flagged chat
# line, relative to all distinct `unit` values in `df`.
print(f'{round((players_abusive / len(df.unit.unique()))*100, 2)}% of players produced all the abusive languages in the chat')

24.44% of players produced all the abusive languages in the chat


#### Group data by `match_id` and `unit` (unique players for each match), and sum the number of chat lines flagged as profane (weight)

In [16]:
# Total the `pf` flags for every (match, player) pair; the result is indexed
# by match_id and unit.
df_pf = df.groupby(by=['match_id', 'unit']).agg('sum')

##### Reset index

In [17]:
# Turn the (match_id, unit) group index back into ordinary columns.
df_n_idx = df_pf.reset_index()

In [18]:
# Display the per-match, per-player profanity counts.
df_n_idx

Unnamed: 0,match_id,unit,pf
0,0,6k Slayer,1
1,0,Double T,1
2,0,Kira,0
3,0,Monkey,1
4,0,Trash!!!,0
...,...,...,...
303878,49999,DamN.Sakura_Jr,0
303879,49999,Dante,0
303880,49999,PINAGPALA,1
303881,49999,StopDancing,0


#### Reset `match_id`
Because some matches had no chat, the remaining `match_id` values have gaps. I renumbered `match_id` so that it follows the order of the matches that do appear, which makes it easier to reshape the dataframe later.

##### Unique match id list

In [19]:
# Distinct match ids, in order of first appearance in the grouped frame.
match_id_list = pd.unique(df_n_idx['match_id'])

##### Match id list by order

In [20]:
# Consecutive 0-based order numbers, one per distinct match id.
# list(range(...)) replaces a comprehension that only copied the range.
len_list_id = list(range(len(match_id_list)))

##### Replace the match number by its order

In [21]:
# Lookup table pairing each original match id with the corresponding entry
# of `len_list_id` (its order number).
res = dict(zip(match_id_list, len_list_id))

In [22]:
# Replace each original match id with its order number via the `res` lookup.
df_n_idx['match_id'] = df_n_idx['match_id'].map(res)

In [24]:
# Write the per-match, per-player counts to CSV without the pandas row index.
df_n_idx.to_csv('../data/chat_preprocessed.csv', index = False)