1. Import Libraries and Load Dataset

In [29]:


# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np

# Total number of games in the dataset
total_games = 10058

# Import the chatlogs dataset
chatlogs = pd.read_csv('chatlogs.csv')  # Load the chat logs data
print("Dataset Loaded Successfully")
print(chatlogs.head())  # Display the first few rows to inspect the dataset

Dataset Loaded Successfully
   Unnamed: 0              message association_to_offender      time  \
0           0           gold 2 zed                   enemy  00:00:21   
1           1                 IIII                   enemy  00:00:27   
2           2  nice premade lie :o                   enemy  00:00:27   
3           3                  ISI                   enemy  00:00:28   
4           4        smiteless pls                   enemy  00:00:43   

   case_total_reports  allied_report_count  enemy_report_count  \
0                   8                    0                   2   
1                   8                    0                   2   
2                   8                    0                   2   
3                   8                    0                   2   
4                   8                    0                   2   

  most_common_report_reason  chatlog_id champion_name  
0         Negative Attitude           1          Udyr  
1         Negative Attitude   


2. Data Cleaning and Processing


In [32]:
# Check for missing values in critical columns
missing_values = chatlogs[['message', 'chatlog_id', 'champion_name', 'association_to_offender']].isnull().sum()
print("Missing values per column:\n", missing_values)  # Print missing values for the columns used to create player_id

# Drop rows with missing critical information
chatlogs = chatlogs.dropna(subset=['chatlog_id', 'champion_name', 'association_to_offender'])

# Filter dataset to only include games with offenders
games_with_offenders = chatlogs[chatlogs['association_to_offender'] == 'offender']

# Filter the chatlogs to include only relevant chatlogs of games with offenders
chatlogs_filtered = chatlogs[chatlogs['chatlog_id'].isin(games_with_offenders['chatlog_id'])]

# Check the shape of the filtered dataset to ensure it's correctly filtered
print("Filtered Dataset Shape:", chatlogs_filtered.shape)

# Ensure all entries in the 'messages' column are strings (in case of NaN or mixed types)
chatlogs_filtered = chatlogs_filtered.copy()
chatlogs_filtered['message'] = chatlogs_filtered['message'].fillna("").astype(str)

# Create unique player IDs for each player in a game based on their champion and role
chatlogs_filtered['player_id'] = (
    chatlogs_filtered.groupby(['chatlog_id', 'champion_name', 'association_to_offender'])
    .ngroup()  # Assigns unique group number to each player
)

# Assign labels based on 'offender' or 'non-offender' status (1 for offender, 0 for non-offender)
chatlogs_filtered['label'] = chatlogs_filtered['association_to_offender'].apply(
    lambda x: 1 if x == 'offender' else 0  # 1 for offenders, 0 for non-offenders
)
print(chatlogs_filtered.head())  # Display a preview of the filtered dataset with labels


Missing values per column:
 message                     29
chatlog_id                   0
champion_name              104
association_to_offender    104
dtype: int64
Filtered Dataset Shape: (1691001, 10)
   Unnamed: 0              message association_to_offender      time  \
0           0           gold 2 zed                   enemy  00:00:21   
1           1                 IIII                   enemy  00:00:27   
2           2  nice premade lie :o                   enemy  00:00:27   
3           3                  ISI                   enemy  00:00:28   
4           4        smiteless pls                   enemy  00:00:43   

   case_total_reports  allied_report_count  enemy_report_count  \
0                   8                    0                   2   
1                   8                    0                   2   
2                   8                    0                   2   
3                   8                    0                   2   
4                   8             

3. Feature Engineering: Grouping and Aggregating Messages

In [34]:
# Group chatlogs by player_id, sorting by chatlog_id and timestamp to collect messages per player
input_data = (
    chatlogs_filtered
    .sort_values(by=['chatlog_id', 'time'])  # Sort by game ID and time
    .groupby('player_id')
    .agg({
        'message': list,  # Aggregate messages for each player as a list
        'label': 'first'  # Take the first label as the player's overall label
    })
    .reset_index()
)

print(input_data.head(10))  # Preview the grouped data for the first 10 players


   player_id                                            message  label
0          0  [report for unskilled player is useless, thx <...      0
1          1                                           [mimimi]      0
2          2  [im comming for you riven, pfft, focus Zed alw...      0
3          3                                               [GG]      0
4          4  [thx, top no flash, for what ? he has 2 kill i...      0
5          5  [IIII, ISI, K, udyr top, dnt us see it?, CAMP ...      0
6          6  [gold 2 zed, nice premade lie :o, smiteless pl...      0
7          7  [bait, Karma reported, Unskilled, No ranked fo...      1
8          8                                  [you should camp]      0
9          9  [lol, jinx, i ward abron, omg, you has my ult ...      0


4. Sentiment Analysis using VADER

In [36]:
# Initialize the sentiment analyzer (VADER)
analyzer = SentimentIntensityAnalyzer()

# Calculate sentiment score for each player's aggregated messages
input_data['sentiment_score'] = input_data['message'].apply(
    lambda x: analyzer.polarity_scores(" ".join(x))['compound']  # Use the 'compound' score for sentiment
)

# Display a preview of the data with sentiment scores
print(input_data.head())


   player_id                                            message  label  \
0          0  [report for unskilled player is useless, thx <...      0   
1          1                                           [mimimi]      0   
2          2  [im comming for you riven, pfft, focus Zed alw...      0   
3          3                                               [GG]      0   
4          4  [thx, top no flash, for what ? he has 2 kill i...      0   

   sentiment_score  
0           0.0487  
1           0.0000  
2          -0.5528  
3           0.2960  
4          -0.0000  


5. Model Training and Evaluation

In [39]:
# Prepare features (sentiment scores) and labels (offender or not)
X = input_data[['sentiment_score']].values  # Features: sentiment score
y = input_data['label'].values  # Labels: offender (1) or non-offender (0)

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model's performance using F1 score, confusion matrix, and classification report
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

# Confusion Matrix to evaluate true positives, false positives, etc.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Detailed Classification Report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)


F1 Score: 0.09258532757944292
Confusion Matrix:
 [[15398   497]
 [ 1816   118]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.97      0.93     15895
           1       0.19      0.06      0.09      1934

    accuracy                           0.87     17829
   macro avg       0.54      0.51      0.51     17829
weighted avg       0.82      0.87      0.84     17829

