# 1. Import Libraries

In [1]:
import gc
import re

import pandas as pd
import numpy as np
import polars as pl
from collections import Counter,defaultdict
from scipy.stats import skew, kurtosis

from xgboost import XGBClassifier
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR

from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Deep Learning Tools
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

seed=116
import random
np.random.seed(seed)
random.seed(seed)

import warnings#
warnings.filterwarnings('ignore')

## 1.1. Config

In [2]:
num_folds = 10

## 1.2. Load Files

In [3]:
train_logs=pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv")
print(f"len(train_logs):{len(train_logs)}")
train_logs=train_logs.sort_values(by=['id', 'down_time'])
train_logs = train_logs.reset_index(drop=True)
train_logs['event_id'] = train_logs.groupby('id').cumcount() + 1

train_scores=pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv")

test_logs=pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv")
print(f"len(test_logs):{len(test_logs)}")
test_logs=test_logs.sort_values(by=['id', 'down_time'])
test_logs = test_logs.reset_index(drop=True)
test_logs['event_id'] = test_logs.groupby('id').cumcount() + 1
test_logs.to_csv("test_logs.csv",index=None)

len(train_logs):8405898
len(test_logs):6


# 2. Preprocessing

## 2.1. Preprocess(1)

In [4]:
def getEssays(df):
    textInputDf = df[['id', 'activity', 'cursor_position', 'text_change']]
    textInputDf = textInputDf[textInputDf.activity != 'Nonproduction']
    valCountsArr = textInputDf['id'].value_counts(sort=False).values
    lastIndex = 0
    essaySeries = pd.Series()
    for index, valCount in enumerate(valCountsArr):
        currTextInput = textInputDf[['activity', 'cursor_position', 'text_change']].iloc[lastIndex : lastIndex + valCount]
        lastIndex += valCount
        essayText = ""
        for Input in currTextInput.values:
            if Input[0] == 'Replace':
                replaceTxt = Input[2].split(' => ')
                essayText = essayText[:Input[1] - len(replaceTxt[1])] + replaceTxt[1] +essayText[Input[1] - len(replaceTxt[1]) + len(replaceTxt[0]):]
                continue
            if Input[0] == 'Paste':
                essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
                continue
            if Input[0] == 'Remove/Cut':
                essayText = essayText[:Input[1]] + essayText[Input[1] + len(Input[2]):]
                continue
            if "M" in Input[0]:
                croppedTxt = Input[0][10:]
                splitTxt = croppedTxt.split(' To ')
                valueArr = [item.split(', ') for item in splitTxt]
                moveData = (int(valueArr[0][0][1:]), 
                            int(valueArr[0][1][:-1]), 
                            int(valueArr[1][0][1:]), 
                            int(valueArr[1][1][:-1]))
                if moveData[0] != moveData[2]:
                    if moveData[0] < moveData[2]:
                        essayText = essayText[:moveData[0]] + essayText[moveData[1]:moveData[3]] +\
                        essayText[moveData[0]:moveData[1]] + essayText[moveData[3]:]
                    else:
                        essayText = essayText[:moveData[2]] + essayText[moveData[0]:moveData[1]] +\
                        essayText[moveData[2]:moveData[0]] + essayText[moveData[1]:]
                continue
            essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
        essaySeries[index] = essayText
    essaySeries.index =  textInputDf['id'].unique()
    return pd.DataFrame(essaySeries, columns=['essay']).reset_index().rename(columns={"index":'id'})

## 2.2. Preprocess (2)

In [5]:
def q1(x):
    return x.quantile(0.25)
def q3(x):
    return x.quantile(0.75)
AGGREGATIONS = ['count', 'mean', 'std', 'min', 'max', 'first', 'last', 'sem', q1, 'median', q3, 'skew', kurtosis, 'sum']

def split_essays_into_words(df):
    essay_df = df
    essay_df['word'] = essay_df['essay'].apply(lambda x: re.split(' |\\n|\\.|\\?|\\!',x))
    essay_df = essay_df.explode('word')
    essay_df['word_len'] = essay_df['word'].apply(lambda x: len(x))
    essay_df = essay_df[essay_df['word_len'] != 0]
    return essay_df

def compute_word_aggregations(word_df):
    word_agg_df = word_df[['id','word_len']].groupby(['id']).agg(AGGREGATIONS)
    word_agg_df.columns = ['_'.join(x) for x in word_agg_df.columns]
    word_agg_df['id'] = word_agg_df.index
    for word_l in [5, 6, 7, 8, 9, 10, 11, 12]:
        word_agg_df[f'word_len_ge_{word_l}_count'] = word_df[word_df['word_len'] >= word_l].groupby(['id']).count().iloc[:, 0]
        word_agg_df[f'word_len_ge_{word_l}_count'] = word_agg_df[f'word_len_ge_{word_l}_count'].fillna(0)
    word_agg_df = word_agg_df.reset_index(drop=True)
    return word_agg_df

def split_essays_into_sentences(df):
    essay_df = df
    essay_df['sent'] = essay_df['essay'].apply(lambda x: re.split('\\.|\\?|\\!',x))
    essay_df = essay_df.explode('sent')
    essay_df['sent'] = essay_df['sent'].apply(lambda x: x.replace('\n','').strip())
    essay_df['sent_len'] = essay_df['sent'].apply(lambda x: len(x))
    essay_df['sent_word_count'] = essay_df['sent'].apply(lambda x: len(x.split(' ')))
    essay_df = essay_df[essay_df.sent_len!=0].reset_index(drop=True)
    return essay_df

def compute_sentence_aggregations(df):
    sent_agg_df = pd.concat(
        [df[['id','sent_len']].groupby(['id']).agg(AGGREGATIONS), df[['id','sent_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1
    )
    sent_agg_df.columns = ['_'.join(x) for x in sent_agg_df.columns]
    sent_agg_df['id'] = sent_agg_df.index

    for sent_l in [50, 60, 75, 100]:
        sent_agg_df[f'sent_len_ge_{sent_l}_count'] = df[df['sent_len'] >= sent_l].groupby(['id']).count().iloc[:, 0]
        sent_agg_df[f'sent_len_ge_{sent_l}_count'] = sent_agg_df[f'sent_len_ge_{sent_l}_count'].fillna(0)
    sent_agg_df = sent_agg_df.reset_index(drop=True)
    sent_agg_df.drop(columns=["sent_word_count_count"], inplace=True)
    sent_agg_df = sent_agg_df.rename(columns={"sent_len_count":"sent_count"})
    return sent_agg_df

def split_essays_into_paragraphs(df):
    essay_df = df
    essay_df['paragraph'] = essay_df['essay'].apply(lambda x: x.split('\n'))
    essay_df = essay_df.explode('paragraph')
    essay_df['paragraph_len'] = essay_df['paragraph'].apply(lambda x: len(x)) 
    essay_df['paragraph_word_count'] = essay_df['paragraph'].apply(lambda x: len(x.split(' ')))
    essay_df = essay_df[essay_df.paragraph_len!=0].reset_index(drop=True)
    return essay_df

def compute_paragraph_aggregations(df):
    paragraph_agg_df = pd.concat(
        [df[['id','paragraph_len']].groupby(['id']).agg(AGGREGATIONS), df[['id','paragraph_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1
    ) 
    paragraph_agg_df.columns = ['_'.join(x) for x in paragraph_agg_df.columns]
    paragraph_agg_df['id'] = paragraph_agg_df.index
    paragraph_agg_df = paragraph_agg_df.reset_index(drop=True)
    paragraph_agg_df.drop(columns=["paragraph_word_count_count"], inplace=True)
    paragraph_agg_df = paragraph_agg_df.rename(columns={"paragraph_len_count":"paragraph_count"})
    return paragraph_agg_df

In [6]:
print("train_essays")
train_essays = pd.read_csv('/kaggle/input/writing-quality-challenge-constructed-essays/train_essays_fast.csv')
print("train_word_agg_df")
train_word_agg_df = compute_word_aggregations(split_essays_into_words(train_essays))
print("train_sent_agg_df")
train_sent_agg_df = compute_sentence_aggregations(split_essays_into_sentences(train_essays))
print("train_paragraph_agg_df")
train_paragraph_agg_df = compute_paragraph_aggregations(split_essays_into_paragraphs(train_essays))
print("test_essays")
test_essays = getEssays(test_logs)
test_essays_copy=test_essays.copy()
print("test_word_agg_df")
test_word_agg_df = compute_word_aggregations(split_essays_into_words(test_essays))
print("test_sent_agg_df")
test_sent_agg_df = compute_sentence_aggregations(split_essays_into_sentences(test_essays))
print("test_paragraph_agg_df")
test_paragraph_agg_df = compute_paragraph_aggregations(split_essays_into_paragraphs(test_essays))

train_essays
train_word_agg_df
train_sent_agg_df
train_paragraph_agg_df
test_essays
test_word_agg_df
test_sent_agg_df
test_paragraph_agg_df


## 2.3. Preprocess (3)

In [7]:
class Preprocessor:#数据预处理的一个类
    
    def __init__(self):
        
        self.activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste','Move From']
        self.events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 
              'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
        self.text_changes = ['q', ' ', 'NoChange', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']
        self.punctuations = ['"', '.', ',', "'", '-', ';', ':', '?', '!', '<', '>', '/',
                        '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+']
        self.gaps = [1, 2, 3, 5, 10, 20, 50, 100]
        
        self.idf = defaultdict(float)
    
    def activity_counts(self, df):
        tmp_df = df.groupby('id').agg({'activity': list}).reset_index()
        ret = list()
        for li in tmp_df['activity'].values:
            items = list(Counter(li).items())
            di = dict()
            for k in self.activities:
                di[k] = 0
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
            ret.append(di)
        ret = pd.DataFrame(ret)
        cols = [f'activity_{i}_count' for i in range(len(ret.columns))]
        ret.columns = cols

        cnts = ret.sum(1)

        for col in cols:
            if col in self.idf.keys():
                idf = self.idf[col]
            else:
                idf = np.log(df.shape[0] / (ret[col].sum() + 1))
                self.idf[col] = idf
            ret[col] = 1 + np.log(ret[col] / cnts)
            ret[col] *= idf

        return ret

    def event_counts(self, df, colname):
        tmp_df = df.groupby('id').agg({colname: list}).reset_index()
        ret = list()
        for li in tmp_df[colname].values:
            items = list(Counter(li).items())
            di = dict()
            for k in self.events:
                di[k] = 0
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
            ret.append(di)
        ret = pd.DataFrame(ret)
        cols = [f'{colname}_{i}_count' for i in range(len(ret.columns))]
        ret.columns = cols

        cnts = ret.sum(1)

        for col in cols:
            if col in self.idf.keys():
                idf = self.idf[col]
            else:
                idf = df.shape[0] / (ret[col].sum() + 1)
                idf = np.log(idf)
                self.idf[col] = idf
            
            ret[col] = 1 + np.log(ret[col] / cnts)
            ret[col] *= idf

        return ret

    def text_change_counts(self, df):
        tmp_df = df.groupby('id').agg({'text_change': list}).reset_index()
        ret = list()
        for li in tmp_df['text_change'].values:
            items = list(Counter(li).items())
            di = dict()
            for k in self.text_changes:
                di[k] = 0
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
            ret.append(di)
        ret = pd.DataFrame(ret)
        cols = [f'text_change_{i}_count' for i in range(len(ret.columns))]
        ret.columns = cols

        cnts = ret.sum(1)

        for col in cols:
            if col in self.idf.keys():
                idf = self.idf[col]
            else:
                idf = df.shape[0] / (ret[col].sum() + 1)
                idf = np.log(idf)
                self.idf[col] = idf
            
            ret[col] = 1 + np.log(ret[col] / cnts)
            ret[col] *= idf
            
        return ret
    
    def match_punctuations(self, df):
        tmp_df = df.groupby('id').agg({'down_event': list}).reset_index()
        ret = list()
        for li in tmp_df['down_event'].values:
            cnt = 0
            items = list(Counter(li).items())
            for item in items:
                k, v = item[0], item[1]
                if k in self.punctuations:
                    cnt += v
            ret.append(cnt)
        ret = pd.DataFrame({'punct_cnt': ret})
        return ret


    def get_input_words(self, df):
        tmp_df = df[(~df['text_change'].str.contains('=>'))&(df['text_change'] != 'NoChange')].reset_index(drop=True)
        tmp_df = tmp_df.groupby('id').agg({'text_change': list}).reset_index()
        tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: ''.join(x))
        tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: re.findall(r'q+', x))
        tmp_df['input_word_count'] = tmp_df['text_change'].apply(len)
        tmp_df['input_word_length_mean'] = tmp_df['text_change'].apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df['input_word_length_max'] = tmp_df['text_change'].apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df['input_word_length_std'] = tmp_df['text_change'].apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df.drop(['text_change'], axis=1, inplace=True)
        return tmp_df
    
    def make_feats(self, df):
        print("Starting to engineer features")
        feats = pd.DataFrame({'id': df['id'].unique().tolist()})
        print("Engineering time data")
        for gap in self.gaps:
            print(f"-> for gap {gap}")
            df[f'up_time_shift{gap}'] = df.groupby('id')['up_time'].shift(gap)
            df[f'action_time_gap{gap}'] = df['down_time'] - df[f'up_time_shift{gap}']
        df.drop(columns=[f'up_time_shift{gap}' for gap in self.gaps], inplace=True)

        print("Engineering cursor position data")
        for gap in self.gaps:
            print(f"-> for gap {gap}")
            df[f'cursor_position_shift{gap}'] = df.groupby('id')['cursor_position'].shift(gap)
            df[f'cursor_position_change{gap}'] = df['cursor_position'] - df[f'cursor_position_shift{gap}']
            df[f'cursor_position_abs_change{gap}'] = np.abs(df[f'cursor_position_change{gap}'])
        df.drop(columns=[f'cursor_position_shift{gap}' for gap in self.gaps], inplace=True)

        print("Engineering word count data")
        for gap in self.gaps:
            print(f"-> for gap {gap}")
            df[f'word_count_shift{gap}'] = df.groupby('id')['word_count'].shift(gap)
            df[f'word_count_change{gap}'] = df['word_count'] - df[f'word_count_shift{gap}']
            df[f'word_count_abs_change{gap}'] = np.abs(df[f'word_count_change{gap}'])
        df.drop(columns=[f'word_count_shift{gap}' for gap in self.gaps], inplace=True)
        
        print("Engineering statistical summaries for features")
        feats_stat = [
            ('event_id', ['max']),
            ('down_time',['mean', 'std', 'min', 'max', 'last', 'first', 'sem', 'median', 'sum']),
            ('up_time',['mean', 'std', 'min', 'max', 'last', 'first', 'sem', 'median', 'sum']),
            ('action_time', ['max', 'min', 'mean', 'std', 'quantile', 'sem', 'sum', 'skew', pd.DataFrame.kurt,'last', 'first','median']),
            ('activity', ['nunique']),
            ('down_event', ['nunique']),
            ('up_event', ['nunique']),
            ('text_change', ['nunique']),
            ('cursor_position', ['nunique', 'max', 'quantile', 'sem', 'mean', 'std', 'min','last', 'first',  'median', 'sum']),
            ('word_count', ['nunique', 'max', 'quantile', 'sem', 'mean', 'std', 'min', 'last', 'first','median', 'sum'])]
        for gap in self.gaps:
            feats_stat.extend([
                (f'action_time_gap{gap}', ['max', 'min', 'mean', 'std', 'quantile', 'sem', 'sum', 'skew', pd.DataFrame.kurt]),
                (f'cursor_position_change{gap}', ['max', 'mean', 'std', 'quantile', 'sem', 'sum', 'skew', pd.DataFrame.kurt]),
                (f'word_count_change{gap}', ['max', 'mean', 'std', 'quantile', 'sem', 'sum', 'skew', pd.DataFrame.kurt])
            ])
        
        pbar = feats_stat
        for item in pbar:
            colname, methods = item[0], item[1]
            for method in methods:
                if isinstance(method, str):
                    method_name = method
                else:
                    method_name = method.__name__
                tmp_df = df.groupby(['id']).agg({colname: method}).reset_index().rename(columns={colname: f'{colname}_{method_name}'})
                feats = feats.merge(tmp_df, on='id', how='left')

        print("Engineering activity counts data")
        tmp_df = self.activity_counts(df)
        feats = pd.concat([feats, tmp_df], axis=1)
        print("Engineering event counts data")
        tmp_df = self.event_counts(df, 'down_event')
        feats = pd.concat([feats, tmp_df], axis=1)
        tmp_df = self.event_counts(df, 'up_event')
        feats = pd.concat([feats, tmp_df], axis=1)
        
        print("Engineering text change counts data")
        tmp_df = self.text_change_counts(df)
        feats = pd.concat([feats, tmp_df], axis=1)
        
        print("Engineering punctuation counts data")
        tmp_df = self.match_punctuations(df)
        feats = pd.concat([feats, tmp_df], axis=1)

        print("Engineering input words data")
        tmp_df = self.get_input_words(df)
        feats = pd.merge(feats, tmp_df, on='id', how='left')

        print("Engineering ratios data")
        feats['word_time_ratio'] = feats['word_count_max'] / feats['up_time_max']
        feats['word_event_ratio'] = feats['word_count_max'] / feats['event_id_max']
        feats['event_time_ratio'] = feats['event_id_max']  / feats['up_time_max']
        feats['idle_time_ratio'] = feats['action_time_gap1_sum'] / feats['up_time_max']
        print("Done!")
        return feats

preprocessor = Preprocessor()
print("Engineering features for training data")

train_feats = preprocessor.make_feats(train_logs)
print("-"*25)
print("Engineering features for test data")
test_feats = preprocessor.make_feats(test_logs)

Engineering features for training data
Starting to engineer features
Engineering time data
-> for gap 1
-> for gap 2
-> for gap 3
-> for gap 5
-> for gap 10
-> for gap 20
-> for gap 50
-> for gap 100
Engineering cursor position data
-> for gap 1
-> for gap 2
-> for gap 3
-> for gap 5
-> for gap 10
-> for gap 20
-> for gap 50
-> for gap 100
Engineering word count data
-> for gap 1
-> for gap 2
-> for gap 3
-> for gap 5
-> for gap 10
-> for gap 20
-> for gap 50
-> for gap 100
Engineering statistical summaries for features
Engineering activity counts data
Engineering event counts data
Engineering text change counts data
Engineering punctuation counts data
Engineering input words data
Engineering ratios data
Done!
-------------------------
Engineering features for test data
Starting to engineer features
Engineering time data
-> for gap 1
-> for gap 2
-> for gap 3
-> for gap 5
-> for gap 10
-> for gap 20
-> for gap 50
-> for gap 100
Engineering cursor position data
-> for gap 1
-> for gap 2

## 2.4. Preprocess (4)

In [8]:
data = []

for logs in [train_logs, test_logs]:
    logs['up_time_lagged'] = logs.groupby('id')['up_time'].shift(1).fillna(logs['down_time'])
    logs['time_diff'] = abs(logs['down_time'] - logs['up_time_lagged']) / 1000

    group = logs.groupby('id')['time_diff']
    largest_lantency = group.max()
    smallest_lantency = group.min()
    median_lantency = group.median()
    initial_pause = logs.groupby('id')['down_time'].first() / 1000
    pauses_half_sec = group.apply(lambda x: ((x > 0.5) & (x <= 1)).sum())
    pauses_1_sec = group.apply(lambda x: ((x > 1) & (x <= 1.5)).sum())
    pauses_1_half_sec = group.apply(lambda x: ((x > 1.5) & (x <= 2)).sum())
    pauses_2_sec = group.apply(lambda x: ((x > 2) & (x <= 3)).sum())
    pauses_3_sec = group.apply(lambda x: (x > 3).sum())

    data.append(pd.DataFrame({
        'id': logs['id'].unique(),
        'largest_lantency': largest_lantency,
        'smallest_lantency': smallest_lantency,
        'median_lantency': median_lantency,
        'initial_pause': initial_pause,
        'pauses_half_sec': pauses_half_sec,
        'pauses_1_sec': pauses_1_sec,
        'pauses_1_half_sec': pauses_1_half_sec,
        'pauses_2_sec': pauses_2_sec,
        'pauses_3_sec': pauses_3_sec,
    }).reset_index(drop=True))

train_eD592674, test_eD592674 = data

gc.collect()

train_feats = train_feats.merge(train_eD592674, on='id', how='left')
test_feats = test_feats.merge(test_eD592674, on='id', how='left')
train_feats = train_feats.merge(train_scores, on='id', how='left')

## 2.5. Preprocess Final

In [9]:
train_feats=train_feats.merge(train_word_agg_df,on='id', how='left')
train_feats=train_feats.merge(train_sent_agg_df,on='id', how='left')
train_feats=train_feats.merge(train_paragraph_agg_df,on='id', how='left')

test_feats=test_feats.merge(test_word_agg_df,on='id', how='left')
test_feats=test_feats.merge(test_sent_agg_df,on='id', how='left')
test_feats=test_feats.merge(test_paragraph_agg_df,on='id', how='left')

In [10]:
import pandas as pd

# Assuming train_set and test_set are your dataframes
train_columns = set(train_feats.columns)
test_columns = set(test_feats.columns)

# Columns present in train set but not in test set
columns_only_in_train = train_columns - test_columns

# Columns present in test set but not in train set
columns_only_in_test = test_columns - train_columns

print("Columns only in train set:", columns_only_in_train)
print("Columns only in test set:", columns_only_in_test)


Columns only in train set: {'score'}
Columns only in test set: set()


# 3. Train Test 

In [11]:
train_X = train_feats.drop(['id','score'],axis=1)
train_y = train_feats['score']

train_X.replace([np.inf, -np.inf], 0, inplace=True)
train_y.replace([np.inf, -np.inf], 0, inplace = True)

columns_to_drop = train_X.columns[train_X.isnull().sum() > 10]

train_X.drop(columns_to_drop, axis=1, inplace=True)
train_X.fillna(train_X.mean(), inplace=True)

test_X = test_feats.drop('id',axis=1)

test_X.replace([np.inf, -np.inf], 0, inplace = True)
test_X.drop(columns_to_drop, axis=1, inplace=True)
test_X.fillna(test_X.mean(), inplace=True)

X_features = train_X.columns.tolist()

# should be size of (395, ) 
print(f'train_X columns: {train_X.columns.shape}')
print(f'test_X columns: {test_X.columns.shape}')

train_X columns: (400,)
test_X columns: (400,)


# 4. Modeling

## 4.1. Parameters

In [12]:
def make_model():


    lgb_params_1 = {'reg_alpha': 0.007678095440286993, 
                    'reg_lambda': 0.34230534302168353, 
                    'colsample_bytree': 0.627061253588415, 
                    'subsample': 0.854942238828458, 
                    'learning_rate': 0.038697981947473245, 
                    'num_leaves': 22, 
                    'max_depth': 37, 
                    'min_child_samples': 18,
                    'random_state': seed,
                    'n_estimators': 150,
                    "objective": "regression",
                    "metric": "rmse",
                    'force_col_wise': True,
                    "verbosity": 0,}

    lgb_params_2 =  {'boosting_type': 'gbdt', 
                     'metric': 'rmse',
                     'random_state': seed,
                     'reg_alpha': 0.003188447814669599, 
                     'reg_lambda': 0.0010228604507564066, 
                     'colsample_bytree': 0.5420247656839267, 
                     'subsample': 0.9778252382803456, 
                     'feature_fraction': 0.8,
                     'bagging_freq': 1,
                     'bagging_fraction': 0.75,
                     'learning_rate': 0.01716485155812008, 
                     'num_leaves': 19, 
                     'min_child_samples': 46,
                     'verbosity': -1,
                     'random_state': 42,
                     'n_estimators': 500,
                     'device_type': 'cpu'}

    catboost_params = {"iterations": 1000,
                       "learning_rate": 0.1,
                       "depth": 6,
                       "eval_metric": 'RMSE',
                       "random_seed": seed,  # Ensure 'seed' is defined somewhere in your code
                       "bagging_temperature": 0.2,
                       "od_type": 'Iter',
                       "metric_period": 50,
                       "od_wait": 20,
                       "verbose": False}

    xgb_params={'reg_alpha': 0.0008774661176012108,
                'reg_lambda': 2.542812743920178,
                'colsample_bynode': 0.7839026197349153,
                'subsample': 0.8994226268096415, 
                'eta': 0.04730766698056879, 
                'max_depth': 3, 
                'n_estimators': 1024,
                'random_state': 42,
                'eval_metric': 'rmse'}

    # 5 models
    lgb_model_1 = LGBMRegressor(**lgb_params_1)
    lgb_model_2 = LGBMRegressor(**lgb_params_2)
    cb_model = CatBoostRegressor(**catboost_params)
    xgb_model = XGBRegressor(**xgb_params)
    svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)

    models = []
    models.append((lgb_model_1, 'lgbm_1'))
    models.append((lgb_model_2, 'lgbm_2'))
    models.append((cb_model, 'catboost'))
    models.append((xgb_model, 'xgboost'))
    models.append((svr_model, 'svr'))
    # models.append((svr_model, 'svr'))
#     print(models)
    return models

In [13]:
# model = pd.DataFrame(make_model())[0][1]
# model.fit(train_X, train_y)
# prediction = model.predict(test_X)
# clipped_predictions = np.clip(prediction, 0, 6)

In [14]:
# test_ids = test_feats['id'].values
# submission = pd.DataFrame({'id': test_ids, 'score': clipped_predictions})
# submission.to_csv('submission.csv', index=False)
# submission.head()

## 4.2. First Training & Inference (to get Feature Importances)

In [15]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

def RMSE(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Assuming num_folds, seed, train_X, and train_y are defined
kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed + num_folds)
models_and_errors_dict = {}

# Initialize dictionaries for cumulative feature importances
cumulative_importance = {model_type: {col: 0 for col in train_X.columns} for _, model_type in make_model()}

for model, model_type in make_model():
    print(f'---{model_type}---')
    model_averaged_rmse = 0
    oof_pred = np.zeros((len(train_X)))

    for fold, (train_index, val_index) in enumerate(kf.split(train_X), start=1):
        print(f'--- | Fold # {fold} | ---')

        X_train, X_val = train_X.iloc[train_index], train_X.iloc[val_index]
        y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

        # Scale data for models that need it
        if model_type == 'svr':
            imputer = SimpleImputer(strategy='mean')
            X_train_imputed = imputer.fit_transform(X_train.copy())
            X_val_imputed = imputer.transform(X_val.copy())
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train_imputed)
            X_val_scaled = scaler.transform(X_val_imputed)
            
            X_train = X_train_scaled
            X_val = X_val_scaled
            
        # Fit the model
        model.fit(X_train, y_train)

        # Make predictions
        predictions = model.predict(X_val)
        oof_pred[val_index] = predictions

        # Calculate and store RMSE
        rmse = RMSE(y_val, predictions)
        model_averaged_rmse += rmse / num_folds  
        print(f'Fold #{fold}, RMSE: {rmse}')

        # Store results
        if model_type not in models_and_errors_dict:
            models_and_errors_dict[model_type] = []
            if model_type == 'svr':
                models_and_errors_dict[model_type].append((model, rmse, imputer, scaler,oof_pred))
            else:
                models_and_errors_dict[model_type].append((model, rmse, None, None, oof_pred))  

        # Accumulate feature importances for tree-based models
        if model_type in ['lgbm_1', 'lgbm_2', 'catboost', 'xgboost']:
            feature_importance = model.feature_importances_
            for i, col in enumerate(train_X.columns):
                cumulative_importance[model_type][col] += feature_importance[i]

    print(f'Average RMSE for {model_type}: {model_averaged_rmse}')

---lgbm_1---
--- | Fold # 1 | ---
Fold #1, RMSE: 0.6199676481116945
--- | Fold # 2 | ---
Fold #2, RMSE: 0.5616024284068566
--- | Fold # 3 | ---
Fold #3, RMSE: 0.6555362603354432
--- | Fold # 4 | ---
Fold #4, RMSE: 0.6508981346607098
--- | Fold # 5 | ---
Fold #5, RMSE: 0.613181560711195
--- | Fold # 6 | ---
Fold #6, RMSE: 0.5980724109799842
--- | Fold # 7 | ---
Fold #7, RMSE: 0.5863035851841022
--- | Fold # 8 | ---
Fold #8, RMSE: 0.6797684763204279
--- | Fold # 9 | ---
Fold #9, RMSE: 0.5968646737783446
--- | Fold # 10 | ---
Fold #10, RMSE: 0.6348282383490511
Average RMSE for lgbm_1: 0.6197023416837808
---lgbm_2---
--- | Fold # 1 | ---
Fold #1, RMSE: 0.6237130093066905
--- | Fold # 2 | ---
Fold #2, RMSE: 0.566245254738372
--- | Fold # 3 | ---
Fold #3, RMSE: 0.6488900539991386
--- | Fold # 4 | ---
Fold #4, RMSE: 0.6479376114499283
--- | Fold # 5 | ---
Fold #5, RMSE: 0.5968239589971027
--- | Fold # 6 | ---
Fold #6, RMSE: 0.6029094966773192
--- | Fold # 7 | ---
Fold #7, RMSE: 0.587286228383

## 4.3. Rank Feature Importances & Get Intersection

In [16]:
top_k = 350

top_k_features_per_model = {}

for model_type, importances in cumulative_importance.items():
    if model_type not in ['svr']: 
        total_importance = sum(importances.values())
        avg_importance = {col: (imp / num_folds) / total_importance for col, imp in importances.items()}

        ranked_features = sorted(avg_importance.items(), key=lambda x: x[1], reverse=True)
        top_k_features = [feature for feature, _ in ranked_features[:top_k]]

        top_k_features_per_model[model_type] = top_k_features

common_features = set(top_k_features_per_model[next(iter(top_k_features_per_model))])
for features in top_k_features_per_model.values():
    common_features.intersection_update(features)

print(f"Common Top {top_k} Features across all models:")
print(common_features)

common_features_list = list(common_features)
print("Common Features as List:", len(common_features_list))

Common Top 350 Features across all models:
{'input_word_count', 'cursor_position_change10_mean', 'up_event_8_count', 'action_time_gap50_quantile', 'text_change_5_count', 'word_count_change10_std', 'down_event_8_count', 'action_time_gap20_quantile', 'action_time_gap100_sum', 'word_count_last', 'action_time_gap10_std', 'cursor_position_change5_sum', 'cursor_position_change20_mean', 'action_time_gap100_kurt', 'word_len_ge_7_count', 'action_time_gap50_skew', 'up_event_7_count', 'cursor_position_change100_std', 'input_word_length_std', 'down_event_7_count', 'action_time_max', 'cursor_position_std', 'paragraph_count', 'action_time_gap1_kurt', 'cursor_position_change100_sum', 'action_time_gap2_skew', 'action_time_gap2_sem', 'word_count_change3_std', 'action_time_gap5_sum', 'text_change_nunique', 'up_event_nunique', 'action_time_gap50_std', 'down_event_6_count', 'action_time_gap1_mean', 'down_time_std', 'paragraph_word_count_last', 'cursor_position_change20_skew', 'text_change_9_count', 'curso

## 4.4. Use common Features to train & get oof

In [17]:
models_and_errors_dict = {}

for model, model_type in make_model():
    print(f'---{model_type}---')
    model_averaged_rmse = 0
    oof_pred = np.zeros((len(train_X)))

    for fold, (train_index, val_index) in enumerate(kf.split(train_X), start=1):
        print(f'--- | Fold # {fold} | ---')

        X_train, X_val = train_X[common_features_list].iloc[train_index], train_X[common_features_list].iloc[val_index]
        y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

        if model_type == 'svr':
            imputer = SimpleImputer(strategy='mean')
            X_train_imputed = imputer.fit_transform(X_train.copy())
            X_val_imputed = imputer.transform(X_val.copy())
            
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train_imputed)
            X_val_scaled = scaler.transform(X_val_imputed)
            
            X_train = X_train_scaled
            X_val = X_val_scaled

        model.fit(X_train, y_train)

        predictions = model.predict(X_val)
        oof_pred[val_index] = predictions

        rmse = RMSE(y_val, predictions)
        model_averaged_rmse += rmse / num_folds  
        print(f'Fold #{fold}, RMSE: {rmse}')

        if model_type not in models_and_errors_dict:
            models_and_errors_dict[model_type] = []
            
        if model_type == 'svr':
            models_and_errors_dict[model_type].append((model, rmse, imputer, scaler, oof_pred))
        else:
            models_and_errors_dict[model_type].append((model, rmse, None, None, oof_pred))  
    print(f'Average RMSE for {model_type}: {model_averaged_rmse}')

---lgbm_1---
--- | Fold # 1 | ---
Fold #1, RMSE: 0.6265812635695837
--- | Fold # 2 | ---
Fold #2, RMSE: 0.5634821519635933
--- | Fold # 3 | ---
Fold #3, RMSE: 0.6552093163784364
--- | Fold # 4 | ---
Fold #4, RMSE: 0.643096753889907
--- | Fold # 5 | ---
Fold #5, RMSE: 0.6127125514743366
--- | Fold # 6 | ---
Fold #6, RMSE: 0.5925064747977018
--- | Fold # 7 | ---
Fold #7, RMSE: 0.5917254798334421
--- | Fold # 8 | ---
Fold #8, RMSE: 0.6736208430819208
--- | Fold # 9 | ---
Fold #9, RMSE: 0.5987980545194814
--- | Fold # 10 | ---
Fold #10, RMSE: 0.6229100529346661
Average RMSE for lgbm_1: 0.6180642942443069
---lgbm_2---
--- | Fold # 1 | ---
Fold #1, RMSE: 0.6227447494796925
--- | Fold # 2 | ---
Fold #2, RMSE: 0.5679092353440874
--- | Fold # 3 | ---
Fold #3, RMSE: 0.6489429352500286
--- | Fold # 4 | ---
Fold #4, RMSE: 0.6454351225293053
--- | Fold # 5 | ---
Fold #5, RMSE: 0.5989294291440892
--- | Fold # 6 | ---
Fold #6, RMSE: 0.6006847961918101
--- | Fold # 7 | ---
Fold #7, RMSE: 0.58955274234

## 4.5. Get Optimized Weights

In [18]:
from tqdm.notebook import tqdm
lgb1_oof_pred = models_and_errors_dict['lgbm_1'][9][4]
lgb2_oof_pred = models_and_errors_dict['lgbm_2'][9][4]
cb_oof_pred = models_and_errors_dict['catboost'][9][4]
xgb_oof_pred = models_and_errors_dict['xgboost'][9][4]
svr_oof_pred = models_and_errors_dict['svr'][9][4]

target = train_y.values

current_RMSE = RMSE(target, (lgb1_oof_pred + lgb2_oof_pred + cb_oof_pred + xgb_oof_pred + svr_oof_pred) / 5)
print(f'Current RMSE Score: {current_RMSE}')

best_i, best_j, best_k, best_l = 0, 0, 0, 0
margin = 300  
step_size = 5 

for i in tqdm(range(0, margin, step_size)):
    for j in range(0, margin - i, step_size):
        for k in range(0, margin - i - j, step_size):
            for l in range(0, margin - i - j - k, step_size):
                blend_oof_pred = (i * lgb1_oof_pred + j * cb_oof_pred + k * xgb_oof_pred + l * svr_oof_pred + (margin - i - j - k - l) * lgb2_oof_pred) / margin
                new_RMSE = RMSE(target, blend_oof_pred)
                if new_RMSE < current_RMSE:
                    print(f"current_RMSE:{current_RMSE}, new_RMSE:{new_RMSE}")
                    current_RMSE = new_RMSE
                    best_i, best_j, best_k, best_l = i, j, k, l

blending_weights = {
    'lgbm_1': best_i / margin,
    'catboost': best_j / margin,
    'xgboost': best_k / margin,
    'svr': best_l / margin,
    'lgbm_2': (margin - best_i - best_j - best_k - best_l) / margin}

print(f"blending_weights: {blending_weights}")

Current RMSE Score: 0.6147491465442274


  0%|          | 0/60 [00:00<?, ?it/s]

current_RMSE:0.6147491465442274, new_RMSE:0.6147490356275602
current_RMSE:0.6147490356275602, new_RMSE:0.6147290807504068
current_RMSE:0.6147290807504068, new_RMSE:0.614727658254777
current_RMSE:0.614727658254777, new_RMSE:0.6147235696815408
current_RMSE:0.6147235696815408, new_RMSE:0.6147077971609938
current_RMSE:0.6147077971609938, new_RMSE:0.6146981690272634
current_RMSE:0.6146981690272634, new_RMSE:0.6146928055572937
current_RMSE:0.6146928055572937, new_RMSE:0.6146858966491054
current_RMSE:0.6146858966491054, new_RMSE:0.6146714962581526
current_RMSE:0.6146714962581526, new_RMSE:0.6146497708795445
current_RMSE:0.6146497708795445, new_RMSE:0.6146463178312245
current_RMSE:0.6146463178312245, new_RMSE:0.6146359337380163
current_RMSE:0.6146359337380163, new_RMSE:0.614618390894158
current_RMSE:0.614618390894158, new_RMSE:0.6146120269864865
current_RMSE:0.6146120269864865, new_RMSE:0.6145986673863187
current_RMSE:0.6145986673863187, new_RMSE:0.6145906014782694
current_RMSE:0.6145906014782

# 5. Inference to Test Set 

In [19]:
y_hats = dict()

submission_df = pd.DataFrame(test_feats['id'])
submission_df['score'] = 3.5

X_unseen = test_X.copy()[common_features_list]
X_unseen.replace([np.inf, -np.inf], np.nan, inplace=True)

for model_name, model_info in models_and_errors_dict.items():
    print(f'\n--- {model_name} ---\n')
    X_unseen_copy = X_unseen.copy()
    y_hats[model_name] = []
    
    for ix, (trained_model, error, imputer, scaler,oof_pred) in enumerate(model_info, start=1):
        print(f"Using model {ix} with error {error}")
        if model_name == 'svr':
            X_unseen_imputed = imputer.transform(X_unseen_copy)
            X_unseen_scaled = scaler.transform(X_unseen_imputed)
            y_hats[model_name].append(trained_model.predict(X_unseen_scaled))
            
        else:
            print(X_unseen_copy.shape)
            y_hats[model_name].append(trained_model.predict(X_unseen_copy))
    if y_hats[model_name]:
        y_hat_avg = np.mean(y_hats[model_name], axis=0)
        submission_df['score_' + model_name] = y_hat_avg
    print("Done.")
    
print("blending")
blended_score=np.zeros((len(test_essays_copy)))
for k, v in blending_weights.items():
    blended_score += submission_df['score_' + k] * v
print(f"blended_score:{blended_score}")


--- lgbm_1 ---

Using model 1 with error 0.6265812635695837
(3, 310)
Using model 2 with error 0.5634821519635933
(3, 310)
Using model 3 with error 0.6552093163784364
(3, 310)
Using model 4 with error 0.643096753889907
(3, 310)
Using model 5 with error 0.6127125514743366
(3, 310)
Using model 6 with error 0.5925064747977018
(3, 310)
Using model 7 with error 0.5917254798334421
(3, 310)
Using model 8 with error 0.6736208430819208
(3, 310)
Using model 9 with error 0.5987980545194814
(3, 310)
Using model 10 with error 0.6229100529346661
(3, 310)
Done.

--- lgbm_2 ---

Using model 1 with error 0.6227447494796925
(3, 310)
Using model 2 with error 0.5679092353440874
(3, 310)
Using model 3 with error 0.6489429352500286
(3, 310)
Using model 4 with error 0.6454351225293053
(3, 310)
Using model 5 with error 0.5989294291440892
(3, 310)
Using model 6 with error 0.6006847961918101
(3, 310)
Using model 7 with error 0.5895527423412364
(3, 310)
Using model 8 with error 0.6615163183258349
(3, 310)
Using 

# 6. Submit

In [20]:
test_ids = test_feats['id'].values
blended_score = np.clip(blended_score, 0, 6)
submission = pd.DataFrame({'id': test_ids, 'score': blended_score})
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,score
0,0000aaaa,1.848978
1,2222bbbb,1.802426
2,4444cccc,1.845421
