# EDA Good Practices

## **1. List the features and their meaning**

- **session_id** - the ID of the session the event took place in
- **index** - the index of the event for the session
- **elapsed_time** - how much time has passed (in milliseconds) between the start of the session and when the event was recorded
- **event_name** - the name of the event type
- **name** - the event name (e.g. identifies whether a notebook_click is opening or closing the notebook)
- **level** - what level of the game the event occurred in (0 to 22)
- **page** - the page number of the event (only for notebook-related events)
- **room_coor_x** - the coordinates of the click in reference to the in-game room (only for click events)
- **room_coor_y** - the coordinates of the click in reference to the in-game room (only for click events)
- **screen_coor_x** - the coordinates of the click in reference to the player’s screen (only for click events)
- **screen_coor_y** - the coordinates of the click in reference to the player’s screen (only for click events)
- **hover_duration** - how long (in milliseconds) the hover happened for (only for hover events)
- **text** - the text the player sees during this event
- **fqid** - the fully qualified ID of the event
- **room_fqid** - the fully qualified ID of the room the event took place in
- **text_fqid** - the fully qualified ID of the text shown during the event
- **fullscreen** - whether the player is in fullscreen mode
- **hq** - whether the game is in high-quality
- **music** - whether the game music is on or off
- **level_group** - which group of levels - and group of questions - this row belongs to (0-4, 5-12, 13-22)


## **The Holy Imports**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go


from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score


from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier


from matplotlib import ticker
import time
import warnings
warnings.filterwarnings('ignore')


from sklearn.model_selection import KFold, GroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

## Memory Management

In [None]:
def reduce_memory_usage(df):
    """
    Downcast the columns of a dataframe in place to reduce its memory usage.

    Based on https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

    Each column is handled according to its dtype:

    * signed / unsigned integers -> the smallest integer type of the same
      signedness whose range contains the column's min and max (inclusive)
    * floats -> float32 when the column's min and max fit. float16 is
      deliberately not used: it keeps only about 3 significant decimal
      digits, so values such as coordinates would be silently rounded.
    * object / string -> category
    * everything else (bool, datetime, category, other extension dtypes)
      is left untouched

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32)
    unsigned_types = (np.uint8, np.uint16, np.uint32)
    float32_info = np.finfo(np.float32)

    for col in df.columns:
        dtype = df[col].dtype

        if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer / float columns are downcast. Bool columns must
        # not be turned into numbers, and extension dtypes may hold missing values
        # that a numpy integer type cannot represent.
        if not isinstance(dtype, np.dtype) or dtype.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if dtype.kind == 'f':
            # min()/max() skip NaN, so a NaN minimum means the column holds no
            # actual numbers and any float width can represent it.
            fits = pd.isna(c_min) or (float32_info.min <= c_min and c_max <= float32_info.max)
            # The itemsize check prevents widening a column that is already small.
            if dtype.itemsize > 4 and fits:
                df[col] = df[col].astype(np.float32)
            continue

        if pd.isna(c_min):
            # Empty integer column: there is no range to measure.
            continue

        candidates = signed_types if dtype.kind == 'i' else unsigned_types
        for candidate in candidates:
            limits = np.iinfo(candidate)
            is_smaller = np.dtype(candidate).itemsize < dtype.itemsize
            if is_smaller and limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage became: ",mem_usg," MB")

    return df

In [None]:
# Load the training event log from CSV into a DataFrame.
train_df = pd.read_csv('../input/predict-student-performance-from-game-play/train.csv')

In [None]:
# Print the column dtypes and memory footprint of the freshly loaded frame.
train_df.info()

In [None]:
# Downcast the column dtypes, then print the info again so the new dtypes and
# memory footprint can be compared with the previous cell.
train_df = reduce_memory_usage(train_df)
train_df.info()

In [None]:
# Run Python's garbage collector explicitly so that objects which are no longer
# referenced can be reclaimed.

import gc
gc.collect()

## **2. Load the Labeled Data**

In [None]:
# Read the label file and shrink its dtypes.
# Each session_id is parsed as '<session>_q<question>': the part before the first
# underscore becomes the integer 'session' column, and the part after the last
# underscore (minus its leading character) becomes the integer 'q' column.


def _parse_session(label_id):
    """Return the integer before the first underscore of *label_id*."""
    return int(label_id.split('_')[0])


def _parse_question(label_id):
    """Return the integer after the last underscore of *label_id*, skipping one leading character."""
    return int(label_id.split('_')[-1][1:])


train_label = reduce_memory_usage(
    pd.read_csv('../input/predict-student-performance-from-game-play/train_labels.csv')
)
train_label['session'] = train_label.session_id.apply(_parse_session)
train_label['q'] = train_label.session_id.apply(_parse_question)
print( 'shape of label dataset is:',train_label.shape )

In [None]:
# Preview the first label rows, including the derived 'session' and 'q' columns.
train_label.head()

In [None]:
# Run the garbage collector again now that the label frame has been built.
gc.collect()

## **3. Summary Function**

In [None]:
def summary(df):
    """
    Build a per-column overview of a dataframe.

    Prints the frame's shape and returns one row per column of ``df`` with:

    * ``data type``    - the column dtype
    * ``#missing``     - number of null values
    * ``%missing``     - share of null values, in percent (0-100)
    * ``#unique``      - number of distinct non-null values
    * ``min`` / ``max`` - taken from ``df.describe(include='all')``; NaN where
      ``describe`` reports no minimum/maximum for the column
    * ``first value`` / ``second value`` / ``last value`` - the values in the
      first, second and last row *by position*; NaN when that row does not exist

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Summary table indexed by the column names of ``df``.
    """
    print(f'data shape: {df.shape}')

    n_rows = len(df)
    missing = df.isnull().sum().values

    def row_values(position):
        # Positional lookup, so the frame does not need a 0..n-1 label index;
        # falls back to NaN when the frame is too short to have that row.
        if -n_rows <= position < n_rows:
            return df.iloc[position].values
        return np.full(len(df.columns), np.nan)

    summ = df.dtypes.to_frame('data type')
    summ['#missing'] = missing
    summ['%missing'] = missing / n_rows * 100 if n_rows else np.nan
    summ['#unique'] = df.nunique().values

    # describe() omits the 'min'/'max' statistics when there is no numeric column;
    # reindex guarantees both exist (as NaN) instead of raising KeyError.
    desc = df.describe(include='all').transpose().reindex(columns=['min', 'max'])
    summ['min'] = desc['min'].values
    summ['max'] = desc['max'].values

    summ['first value'] = row_values(0)
    summ['second value'] = row_values(1)
    summ['last value'] = row_values(-1)

    return summ

In [None]:
# Summarise every column of the training frame; the bare name on the last line
# makes the notebook display the resulting table.
summary_table = summary(train_df)
summary_table

```markdown
Notebook Creator:
- text matters
- level of game matters
- event type matters
- elapsed time matters

Also assumes:
- 'page', 'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y',
   'hover_duration', 'text_fqid', 'fullscreen', 'hq',
   'music', 'level_group'
these columns are not useful (as there are too many missing values).
For coordinates variables, I am not sure how to leverage it due to lack of domain knowledge.


My Take: 

- coordinates seem to have some correlation to correct answer but needs to be investigated. 
- **Remember that too many values are missing.**
- **Find out techniques to use features like coordinates and text.**

## **4. EDA & Feature Engineering**


```markdown
📌  Feature engineering:
* Very smart and useful function from https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664
* I assumed event is important factor for prediction. Therefore, I made dummies of event_name.
* I only added sum, count, mean values. You can create more variables thru EDA or domain knowledge.
* We will train with 42 features (6 `nunique` + 18 mean + 18 std aggregates) on 11779 users' info

In [None]:
# One-hot encode event_name and append the indicator columns to the training
# frame. These column names are the event entries listed in NUMS, which
# feature_engineer aggregates.
event_name_dummies = pd.get_dummies(train_df['event_name'])
# NOTE(review): this head() is not the last expression of the cell, so its
# result is presumably never displayed.
event_name_dummies.head()
train_df = pd.concat([train_df, event_name_dummies], axis=1)
train_df.head()

In [None]:
# Columns summarised by their number of distinct values per group.
CATS = ['event_name', 'name', 'text', 'fqid', 'room_fqid', 'text_fqid']

# Columns summarised by their mean and standard deviation per group: the raw
# numeric columns followed by the one-hot event_name indicator columns.
NUMS = [
    'elapsed_time', 'level', 'page', 'room_coor_x', 'room_coor_y',
    'screen_coor_x', 'screen_coor_y', 'hover_duration',
    'cutscene_click', 'map_click', 'map_hover', 'navigate_click',
    'notebook_click', 'notification_click', 'object_click', 'object_hover',
    'observation_click', 'person_click',
]


def feature_engineer(train):
    """
    Aggregate an event log into one feature row per (session_id, level_group).

    For every column in CATS a ``<col>_nunique`` feature is produced; for every
    column in NUMS a mean feature (named after the column itself) and a
    ``<col>_std`` feature. Missing aggregates are replaced by -1.

    Returns a DataFrame indexed by ``session_id`` whose first column is
    ``level_group``, followed by the nunique, mean and std features in that order.
    """
    grouped = train.groupby(['session_id', 'level_group'])

    pieces = [grouped[col].agg('nunique').rename(col + '_nunique') for col in CATS]
    pieces.extend(grouped[col].agg('mean') for col in NUMS)
    pieces.extend(grouped[col].agg('std').rename(col + '_std') for col in NUMS)

    features = pd.concat(pieces, axis=1).fillna(-1)
    # Move both group keys back into columns, then keep only session_id as index.
    return features.reset_index().set_index('session_id')

In [None]:
# Preview the event-level frame that is passed to feature_engineer in the next cell.
train_df.head()

In [None]:
%%time
# Aggregate the event log into one feature row per (session_id, level_group).
train = feature_engineer(train_df)
print('shape of train dataset is:',train.shape)
train.head()

In [None]:
# Alias (not a copy): the training and scoring cells refer to the label frame as `targets`.
targets = train_label

## **5. Train XGBoost Model**

In [None]:
# Every aggregated column is a model input. level_group is excluded because it
# is only used to select which rows belong to a question.
FEATURES = [c for c in train.columns if c != 'level_group']
print('We will train with', len(FEATURES) ,'features')
ALL_USERS = train.index.unique() # each distinct session_id is treated as one user
print('We will train with', len(ALL_USERS) ,'users info')

gkf = GroupKFold(n_splits=10)
# Out-of-fold probabilities: one row per user, one column per question
# (column k holds question k+1).
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS),18)), index=ALL_USERS)
# Fitted classifiers keyed by '<level_group>_<question>'. Every fold overwrites
# the same keys, so after the loop this holds the models of the last fold.
models = {}

# Labels of each question indexed by session. They do not depend on the fold,
# so they are built once here instead of twice per question inside every fold.
targets_by_question = {t: targets.loc[targets.q==t].set_index('session') for t in range(1,19)}

# COMPUTE CV SCORE WITH 10 GROUP K FOLD
# Grouping by the session index keeps all rows of one session in the same fold.
for i, (train_index, test_index) in enumerate(gkf.split(X=train, groups=train.index)):
    print('#'*25)
    print('### Fold',i+1)
    print('#'*25)

    # The fold's row slices are the same for every question, so take them once.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]

    # ITERATE THRU QUESTIONS 1 THRU 18
    for t in range(1,19):
        print(t,', ',end='')

        # USE THIS TRAIN DATA WITH THESE QUESTIONS
        if t<=3: grp = '0-4'
        elif t<=13: grp = '5-12'
        elif t<=22: grp = '13-22'

        # TRAIN DATA
        train_x = fold_train.loc[fold_train.level_group == grp]
        train_users = train_x.index.values
        train_y = targets_by_question[t].loc[train_users]

        # VALID DATA
        valid_x = fold_valid.loc[fold_valid.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = targets_by_question[t].loc[valid_users]

        # TRAIN MODEL
        clf = XGBClassifier()
        clf.fit(train_x[FEATURES].astype('float32'), train_y['correct'])

        # SAVE MODEL, PREDICT VALID OOF
        models[f'{grp}_{t}'] = clf
        oof.loc[valid_users, t-1] = clf.predict_proba(valid_x[FEATURES].astype('float32'))[:,1]

    print()

## **CV SCORE**

In [None]:
# BUILD A FRAME OF TRUE LABELS WITH THE SAME SHAPE AND INDEX AS `oof`
# (one row per user, column k = question k+1; 1 if correct, 0 if incorrect).
true = oof.copy()
for question in range(1, 19):
    # Labels of this question, reordered to follow ALL_USERS.
    question_labels = targets.loc[targets.q == question].set_index('session')
    true[question - 1] = question_labels.loc[ALL_USERS].correct.values

In [None]:
# Score the out-of-fold probabilities against the true labels over all 18 question columns.
print('CV AUC:', roc_auc_score(true, oof))

In [None]:
# SWEEP CANDIDATE CUT-OFFS AND KEEP THE ONE WITH THE HIGHEST MACRO F1
# computed over every (user, question) pair at once.
scores, thresholds = [], []
best_score, best_threshold = 0, 0

# Flatten labels and probabilities once; they are the same for every threshold.
flat_true = true.values.reshape((-1))
flat_proba = oof.values.reshape((-1))

for threshold in np.arange(0.6,0.65,0.0025):
    print(f'{threshold:.02f}, ',end='')
    preds = (flat_proba>threshold).astype('int')
    m = f1_score(flat_true, preds, average='macro')
    scores.append(m)
    thresholds.append(threshold)
    if m>best_score:
        best_score, best_threshold = m, threshold

In [None]:
import matplotlib.pyplot as plt

# PLOT THRESHOLD VS. F1_SCORE
# Line plot of every threshold tried, with the best (threshold, score) pair
# drawn as one large marker and reported in the title.
plt.figure(figsize=(20,5))
plt.plot(thresholds,scores,'-o',color='blue')
plt.scatter([best_threshold], [best_score], color='blue', s=300, alpha=1)
plt.xlabel('Threshold',size=14)
plt.ylabel('Validation F1 Score',size=14)
plt.title(f'Threshold vs. F1_Score with Best F1_Score = {best_score:.3f} at Best Threshold = {best_threshold:.3}',size=18)
plt.show()

In [None]:
print('When using optimal threshold...')
for k in range(18):

    # COMPUTE F1 SCORE PER QUESTION
    # Column k of `true` / `oof` holds question k+1, so the printed label uses
    # the 1-based question number.
    m = f1_score(true[k].values, (oof[k].values>best_threshold).astype('int'), average='macro')
    print(f'Q{k+1}: F1 =',m)

# COMPUTE F1 SCORE OVERALL
# All (user, question) pairs are flattened into one vector before scoring.
m = f1_score(true.values.reshape((-1)), (oof.values.reshape((-1))>best_threshold).astype('int'), average='macro')
print('==> Overall F1 =',m)

## infer on test data

In [None]:
# Build the inference environment and the iterator that the prediction loop
# consumes; each item it yields is unpacked there as (sample_submission, test).
import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

In [72]:
# Dummies from event_name
# NOTE(review): in the visible source `test` is first bound by the prediction
# loop in the following cell, so this cell presumably only works when a `test`
# frame is left over from an earlier run - confirm before relying on it.
event_name_dummies = pd.get_dummies(test['event_name'])
event_name_dummies.head()
test = pd.concat([test, event_name_dummies], axis=1)
test.head()

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,...,cutscene_click,map_click,map_hover,navigate_click,notebook_click,notification_click,object_click,object_hover,observation_click,person_click
0,20090312331414616,489,928004,navigate_click,undefined,13,,297.729307,-260.559972,656.0,...,0,0,0,1,0,0,0,0,0,0
1,20090312331414616,490,928836,navigate_click,undefined,13,,459.877554,-262.914382,716.0,...,0,0,0,1,0,0,0,0,0,0
2,20090312331414616,491,929968,navigate_click,undefined,13,,622.914322,-354.95357,731.0,...,0,0,0,1,0,0,0,0,0,0
3,20090312331414616,492,930868,navigate_click,undefined,13,,779.643974,-258.638647,791.0,...,0,0,0,1,0,0,0,0,0,0
4,20090312331414616,493,931750,map_click,basic,13,,777.372366,-42.768409,773.0,...,0,1,0,0,0,0,0,0,0,0


In [73]:
# Question numbers answered after each level group, as (first, last + 1).
# This mirrors the question -> level group mapping used during training.
limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

for (sample_submission, test) in iter_test:

    # feature_engineer aggregates the one-hot event_name columns listed in NUMS.
    # A freshly yielded batch does not have them yet, and a batch may not
    # contain every event type, so rebuild the full set here with 0 for absent
    # events.
    # NOTE(review): assumes every NUMS column missing from the batch is an
    # event_name indicator - confirm against the test schema.
    event_cols = [c for c in NUMS if c not in test.columns]
    event_dummies = (
        pd.get_dummies(test['event_name'])
        .reindex(columns=event_cols, fill_value=0)
        .astype('int8')
    )
    test = pd.concat([test, event_dummies], axis=1)

    df = feature_engineer(test)
    grp = test.level_group.values[0]
    a,b = limits[grp]
    for t in range(a,b):
        clf = models[f'{grp}_{t}']
        p = clf.predict_proba(df[FEATURES].astype('float32'))[:,1]
        # Match on the end of the id so that e.g. 'q1' does not also select q10..q18.
        mask = sample_submission.session_id.str.endswith(f'q{t}')
        # p.item() requires exactly one feature row, i.e. one session per batch.
        sample_submission.loc[mask,'correct'] = int(p.item()>best_threshold)

    env.predict(sample_submission)

In [None]:
# Load submission.csv for a quick sanity check of its shape and first rows.
# NOTE(review): presumably this file is written by the env.predict calls - confirm.
# This rebinds `df`, the name the prediction loop used for its feature frame.
df = pd.read_csv('submission.csv')
print( df.shape )
df.head()

In [None]:
# Mean of the 'correct' column of the submission (the share of 1s when it holds only 0/1).
print(df.correct.mean())