In [51]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
import re
import string


# Data Cleaning

In [52]:
# Import dataset
chatlogs = pd.read_csv('/content/chatlogs.csv')
chatlogs.head()

# Check for NaN values in the columns used to create player_id
missing_values = chatlogs[['message', 'chatlog_id', 'champion_name', 'association_to_offender']].isnull().sum()
print("Missing values per column:\n", missing_values)

# Drop rows with NaN values if they are not needed
chatlogs = chatlogs.dropna(subset=['chatlog_id', 'champion_name', 'association_to_offender'])

# Filter the chatlogs to only include games that have one 'offender'
games_with_offenders = chatlogs[chatlogs['association_to_offender'] == 'offender']

chatlogs_filtered = chatlogs[chatlogs['chatlog_id'].isin(games_with_offenders['chatlog_id'])]

# Check the shape of the new filtered dataset
print(chatlogs_filtered.shape)

# Ensure all entries in the 'messages' column are strings
chatlogs_filtered = chatlogs_filtered.copy()
chatlogs_filtered['message'] = chatlogs_filtered['message'].fillna("").astype(str)

# Create a unique player ID per game based on 'champion_name' and 'association_to_offender'
chatlogs_filtered['player_id'] = (
    chatlogs_filtered.groupby(['chatlog_id', 'champion_name', 'association_to_offender'])
    .ngroup()  # Assigns a unique integer group number
)

# Example label assignment based on a condition
chatlogs_filtered['label'] = chatlogs_filtered['association_to_offender'].apply(
    lambda x: 1 if x == 'offender' else 0
)

chatlogs_filtered.head()

# Drop unnecessary columns (optional)
chatlogs_filtered = chatlogs_filtered.drop(columns=["champion_name", "association_to_offender"])

# Input Dataset: Group chatlogs by game
input_data = (
    chatlogs_filtered
    .sort_values(by=['chatlog_id', 'time'])  # Sort by game and timestamp
    .groupby('player_id')
    .agg({
        'chatlog_id': 'first',  # Collect all player IDs and their games
        'message': list,       # Collect all messages aggregated
        'label': 'first'       # Get label of each player
    })
    .reset_index()
)

input_data.head()

# Convert messages back to string format
input_data['message'] = input_data['message'].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))
input_data.head()


Missing values per column:
 message                     29
chatlog_id                   0
champion_name              104
association_to_offender    104
dtype: int64
(1691001, 10)


Unnamed: 0,player_id,chatlog_id,message,label
0,0,1,report for unskilled player is useless thx <3 ...,0
1,1,1,mimimi,0
2,2,1,im comming for you riven pfft focus Zed always...,0
3,3,1,GG,0
4,4,1,thx top no flash for what ? he has 2 kill in l...,0


# Traditional Model

In [53]:
# Define a comprehensive dictionary of zero-tolerance words/phrases
zero_tolerance_words = [
    "noob", "trash", "report", "idiot", "stupid", "hate", "kill", "moron",
    "afk", "feed", "toxic", "inting", "grief", "loser", "kys", "troll",
    "garbage", "dumb", "useless", "worthless", "rage", "cry", "flame", "blame",
    "bot", "bronze", "no skill", "clown", "pathetic", "weak", "uninstall",
    "die", "mad", "tilt", "scrub", "cheat", "hacker", "smurf", "idiotic",
    "stfu", "fool", "fail", "trashcan", "garbagecan", "child", "baby",
    "nerd", "degenerate", "slow", "dunce", "coward", "losing", "hopeless",
    "bad", "awful", "terrible", "disgusting", "shame", "broken", "broke",
    "worthless", "failure", "toxic player", "clown fiesta", "rekt", "owned",
    "trash team", "ff", "boosted", "boost", "int", "intentionally feeding",
    "feeding", "kill yourself", "losing streak", "smurf account", "cheater",
    "reported", "banned", "trash champ", "useless player"
]


# Function to count toxic words in messages
def count_toxic_words(message, toxic_words):
    tokens = message.lower().split()  # Tokenize and lower case the message
    return sum(1 for token in tokens if token in toxic_words)

# Add a toxic word count column to the dataset
input_data['toxic_word_count'] = input_data['message'].apply(
    lambda x: count_toxic_words(x, zero_tolerance_words)
)

# Group by chatlog_id to identify the offender with the highest toxic word count
predictions = input_data.groupby('chatlog_id').apply(
    lambda group: pd.Series({
        'predicted_offender': group.loc[group['toxic_word_count'].idxmax(), 'player_id'],
        'flagged_players': group[group['toxic_word_count'] > 0]['player_id'].tolist()
    })
).reset_index()




  predictions = input_data.groupby('chatlog_id').apply(


#Model Accuracy

In [54]:
# Merge actual offenders for evaluation (if labels are available)
actual_offenders = input_data[input_data['label'] == 1][['chatlog_id', 'player_id']].rename(
    columns={'player_id': 'actual_offender'}
)
results = predictions.merge(actual_offenders, on='chatlog_id', how='left')

# Optional: Evaluate the accuracy of the predictions
results['is_correct'] = results['predicted_offender'] == results['actual_offender']
accuracy = results['is_correct'].mean()

# Display the results
print(results.head())
print(f"Model Accuracy: {accuracy:.2f}")

   chatlog_id  predicted_offender               flagged_players  \
0           1                   2               [0, 2, 4, 6, 7]   
1           2                  14              [11, 12, 14, 16]   
2           3                  20  [20, 22, 23, 24, 25, 26, 27]   
3           4                  36          [29, 35, 36, 37, 38]   
4           5                  39          [39, 40, 41, 42, 45]   

   actual_offender  is_correct  
0                7       False  
1               16       False  
2               25       False  
3               37       False  
4               39        True  
Model Accuracy: 0.33
