In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

## Train_data

### CurrentTask, LastTaskCompleted-Encoding 

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

train_data_path = 'data/train_data.csv'
train_df = pd.read_csv(train_data_path)

label_encoder = LabelEncoder()

if train_df['CurrentTask'].dtype == 'object':
    train_df['CurrentTask_TargetEncoded'] = label_encoder.fit_transform(train_df['CurrentTask'].astype(str))

if train_df['LastTaskCompleted'].dtype == 'object':
    train_df['LastTaskCompleted_TargetEncoded'] = label_encoder.fit_transform(train_df['LastTaskCompleted'].astype(str))

output_file_path = 'train_data_knn_imputed.csv'
train_df.to_csv(output_file_path, index=False)

### CurrentTask, LastTaskCompleted-KNN

In [3]:
from sklearn.impute import KNNImputer

train_data = pd.read_csv('train_data_knn_imputed.csv')

# Select the two features 'CurrentTask_TargetEncoded' and 'LastTaskCompleted_TargetEncoded'
features_to_impute = ['CurrentTask_TargetEncoded', 'LastTaskCompleted_TargetEncoded']

# Initialize KNN imputer
imputer = KNNImputer(n_neighbors=5)

# Impute the missing values
imputed_data = imputer.fit_transform(train_data[features_to_impute])

# Transfer the filled data back to the DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=features_to_impute, index=train_data.index)

# Re-assign the filled features back to the original dataset
train_data[features_to_impute] = imputed_df[features_to_impute]

print("Processed training data with imputed values:")
print(train_data.head())

# Save the imputed data to a new csv file
train_output_file_path = 'train_data_knn_imputed.csv'
train_data.to_csv(train_output_file_path, index=False)


Processed training data with imputed values:
  UserID    QuestionTiming              TimeUtc CurrentGameMode  \
0     p1    User Initiated  2022-08-18 22:55:27             NaN   
1     p1  System Initiated  2022-08-18 23:38:31             NaN   
2     p1    User Initiated  2022-08-18 23:39:24          Career   
3     p1  System Initiated  2022-08-18 23:45:01          Career   
4     p1  System Initiated  2022-08-18 23:51:22          Career   

                 CurrentTask  CurrentSessionLength LastTaskCompleted  \
0                        NaN                     2               NaN   
1                        NaN                     0               NaN   
2                   HOME_VAN                     1        WASH_PWVan   
3  RESIDENTIALSMALL_BACKYARD                     6        WASH_PWVan   
4  RESIDENTIALSMALL_BACKYARD                    13        WASH_PWVan   

   LevelProgressionAmount QuestionType  ResponseValue  \
0                     NaN    Wellbeing          509.0   
1    

### CurrentGameMode-Encoding

In [4]:
from sklearn.preprocessing import LabelEncoder

# load the data
train_data = pd.read_csv('train_data_knn_imputed.csv')
print(train_data['CurrentGameMode'].unique())

# Initialize LabelEncoder
le = LabelEncoder()

# Encoding 'CurrentGameMode' using Label Encoding
train_data['CurrentGameMode_LabelEncoded'] = le.fit_transform(train_data['CurrentGameMode'])

train_data.head()


[nan 'Career' 'Special' 'FreePlay' 'Challenge']


Unnamed: 0,UserID,QuestionTiming,TimeUtc,CurrentGameMode,CurrentTask,CurrentSessionLength,LastTaskCompleted,LevelProgressionAmount,QuestionType,ResponseValue,CurrentTask_TargetEncoded,LastTaskCompleted_TargetEncoded,CurrentGameMode_LabelEncoded
0,p1,User Initiated,2022-08-18 22:55:27,,,2,,,Wellbeing,509.0,49.0,51.0,4
1,p1,System Initiated,2022-08-18 23:38:31,,,0,,,Wellbeing,653.0,49.0,51.0,4
2,p1,User Initiated,2022-08-18 23:39:24,Career,HOME_VAN,1,WASH_PWVan,1.0,Wellbeing,705.0,21.0,32.0,0
3,p1,System Initiated,2022-08-18 23:45:01,Career,RESIDENTIALSMALL_BACKYARD,6,WASH_PWVan,0.168267,Wellbeing,817.0,33.0,32.0,0
4,p1,System Initiated,2022-08-18 23:51:22,Career,RESIDENTIALSMALL_BACKYARD,13,WASH_PWVan,0.429364,Wellbeing,810.0,33.0,32.0,0


### CurrentGameMode-KNN

In [5]:
# Initialize KNN Imputer
imputer = KNNImputer(n_neighbors=5)

# Select feature 'CurrentGameMode_LabelEncoded' to be imputed
features_to_impute = ['CurrentGameMode_LabelEncoded', 'CurrentTask_TargetEncoded']

# Impute the missing values
imputed_data = imputer.fit_transform(train_data[features_to_impute])

# Transfer the filled data back to the DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=features_to_impute, index=train_data.index)

# Re-new the original 'CurrentGameMode_LabelEncoded' data 
train_data['CurrentGameMode_LabelEncoded'] = imputed_df['CurrentGameMode_LabelEncoded']

print("Processed training data with imputed CurrentGameMode:")
print(train_data.head())

# Save and renew the file
train_output_file_path = 'train_data_knn_imputed.csv'
train_data.to_csv(train_output_file_path, index=False)

Processed training data with imputed CurrentGameMode:
  UserID    QuestionTiming              TimeUtc CurrentGameMode  \
0     p1    User Initiated  2022-08-18 22:55:27             NaN   
1     p1  System Initiated  2022-08-18 23:38:31             NaN   
2     p1    User Initiated  2022-08-18 23:39:24          Career   
3     p1  System Initiated  2022-08-18 23:45:01          Career   
4     p1  System Initiated  2022-08-18 23:51:22          Career   

                 CurrentTask  CurrentSessionLength LastTaskCompleted  \
0                        NaN                     2               NaN   
1                        NaN                     0               NaN   
2                   HOME_VAN                     1        WASH_PWVan   
3  RESIDENTIALSMALL_BACKYARD                     6        WASH_PWVan   
4  RESIDENTIALSMALL_BACKYARD                    13        WASH_PWVan   

   LevelProgressionAmount QuestionType  ResponseValue  \
0                     NaN    Wellbeing          509.0

### TimeUtc-Convert Format

In [6]:
train_data = pd.read_csv("train_data_knn_imputed.csv")

# Ensure TimeUtc is a datetime type
train_data['TimeUtc'] = pd.to_datetime(train_data['TimeUtc'])

# Extract components
train_data['Year'] = train_data['TimeUtc'].dt.year
train_data['Month'] = train_data['TimeUtc'].dt.month
train_data['Day'] = train_data['TimeUtc'].dt.day
train_data['Hour'] = train_data['TimeUtc'].dt.hour
train_data['Minute'] = train_data['TimeUtc'].dt.minute
train_data['Second'] = train_data['TimeUtc'].dt.second

# Separate weekday and weekend days
train_data['Weekday'] = train_data['TimeUtc'].dt.dayofweek
train_data['WeekendFlag'] = (train_data['TimeUtc'].dt.weekday >= 5).astype(int)

# Categorize different times of day into periods
time_bins = [0, 6, 12, 18, 24]  # Define time_bins as 0-6, 6-12, 12-18, 18-24
time_labels = ['Night', 'Morning', 'Afternoon', 'Evening']
train_data['PeriodOfDay'] = pd.cut(train_data['TimeUtc'].dt.hour, bins=time_bins, labels=time_labels, right=False)

# Remove the original Timestamp column to avoid redundancy
train_data.drop('TimeUtc', axis=1, inplace=True)

train_data.head()

Unnamed: 0,UserID,QuestionTiming,CurrentGameMode,CurrentTask,CurrentSessionLength,LastTaskCompleted,LevelProgressionAmount,QuestionType,ResponseValue,CurrentTask_TargetEncoded,...,CurrentGameMode_LabelEncoded,Year,Month,Day,Hour,Minute,Second,Weekday,WeekendFlag,PeriodOfDay
0,p1,User Initiated,,,2,,,Wellbeing,509.0,49.0,...,4.0,2022,8,18,22,55,27,3,0,Evening
1,p1,System Initiated,,,0,,,Wellbeing,653.0,49.0,...,4.0,2022,8,18,23,38,31,3,0,Evening
2,p1,User Initiated,Career,HOME_VAN,1,WASH_PWVan,1.0,Wellbeing,705.0,21.0,...,0.0,2022,8,18,23,39,24,3,0,Evening
3,p1,System Initiated,Career,RESIDENTIALSMALL_BACKYARD,6,WASH_PWVan,0.168267,Wellbeing,817.0,33.0,...,0.0,2022,8,18,23,45,1,3,0,Evening
4,p1,System Initiated,Career,RESIDENTIALSMALL_BACKYARD,13,WASH_PWVan,0.429364,Wellbeing,810.0,33.0,...,0.0,2022,8,18,23,51,22,3,0,Evening


### TimeUtc-Encoding

In [7]:
#encoding for TimeOfDay.
train_data = pd.get_dummies(train_data, columns=['PeriodOfDay'], drop_first=False)

train_data.head()

Unnamed: 0,UserID,QuestionTiming,CurrentGameMode,CurrentTask,CurrentSessionLength,LastTaskCompleted,LevelProgressionAmount,QuestionType,ResponseValue,CurrentTask_TargetEncoded,...,Day,Hour,Minute,Second,Weekday,WeekendFlag,PeriodOfDay_Night,PeriodOfDay_Morning,PeriodOfDay_Afternoon,PeriodOfDay_Evening
0,p1,User Initiated,,,2,,,Wellbeing,509.0,49.0,...,18,22,55,27,3,0,False,False,False,True
1,p1,System Initiated,,,0,,,Wellbeing,653.0,49.0,...,18,23,38,31,3,0,False,False,False,True
2,p1,User Initiated,Career,HOME_VAN,1,WASH_PWVan,1.0,Wellbeing,705.0,21.0,...,18,23,39,24,3,0,False,False,False,True
3,p1,System Initiated,Career,RESIDENTIALSMALL_BACKYARD,6,WASH_PWVan,0.168267,Wellbeing,817.0,33.0,...,18,23,45,1,3,0,False,False,False,True
4,p1,System Initiated,Career,RESIDENTIALSMALL_BACKYARD,13,WASH_PWVan,0.429364,Wellbeing,810.0,33.0,...,18,23,51,22,3,0,False,False,False,True


In [8]:
# Save the transformation and encoding for 'TimeUtc' to the 'train_data_knn_imputed.csv' 
train_output_file_path = 'train_data_knn_imputed.csv'
train_data.to_csv(train_output_file_path, index=False)

### LevelProgressionAmount-KNN

In [9]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read the CSV file
file_path = 'train_data_knn_imputed.csv'
df = pd.read_csv(file_path)

# Select the column to encode and impute
column_to_encode_and_impute = 'LevelProgressionAmount'

# Encode the column
label_encoder = LabelEncoder()
df[column_to_encode_and_impute] = label_encoder.fit_transform(df[column_to_encode_and_impute].astype(str))

# Create KNNImputer instance
imputer = KNNImputer(n_neighbors=5)

# Perform KNN imputation on the selected column
df[[column_to_encode_and_impute]] = imputer.fit_transform(df[[column_to_encode_and_impute]])

# Standardize the imputed column
scaler = StandardScaler()
df[[column_to_encode_and_impute]] = scaler.fit_transform(df[[column_to_encode_and_impute]])

# Save the modified data back to the original file
df.to_csv(file_path, index=False)

print(f'Successfully encoded and performed KNN imputation on {column_to_encode_and_impute}. The result has been saved back to the original file {file_path}')


Successfully encoded and performed KNN imputation on LevelProgressionAmount. The result has been saved back to the original file train_data_knn_imputed.csv


### QuestionTiming-Dummy

In [10]:
import pandas as pd

file_path = 'train_data_knn_imputed.csv'
df = pd.read_csv(file_path)

df_encoded = pd.get_dummies(df, columns=['QuestionTiming'])

df_encoded.to_csv(file_path, index=False)

print(f'{file_path}')


train_data_knn_imputed.csv


### Mapping UserID with ResponseValue

In [11]:
file_path = 'train_data_knn_imputed.csv'
df = pd.read_csv(file_path)

# average responsevalue per user
user_means = df.groupby('UserID')['ResponseValue'].mean()

# overall average responsevalue
global_mean = df['ResponseValue'].mean()

# Creating new feature using average responsevalue per user
df['UserAvgResponse'] = df['UserID'].map(user_means)

# imputing missing value
df['UserAvgResponse'].fillna(global_mean, inplace=True)

# save new feature to file
df.to_csv(file_path, index=False)

print(f'Successfully updated UserAvgResponse and saved the data back to the file {file_path}')

Successfully updated UserAvgResponse and saved the data back to the file train_data_knn_imputed.csv


## Test_data

In [12]:
test_data= pd.read_csv('data/test_data.csv')

### CurrentTask, LastTaskCompleted-Encoding 

In [13]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

test_data_path = 'data/test_data.csv'
test_df = pd.read_csv(test_data_path)

label_encoder = LabelEncoder()

if test_df['CurrentTask'].dtype == 'object':
    test_df['CurrentTask_TargetEncoded'] = label_encoder.fit_transform(test_df['CurrentTask'].astype(str))

if test_df['LastTaskCompleted'].dtype == 'object':
    test_df['LastTaskCompleted_TargetEncoded'] = label_encoder.fit_transform(test_df['LastTaskCompleted'].astype(str))

output_file_path = 'test_data_knn_imputed.csv'
test_df.to_csv(output_file_path, index=False)


### CurrentTask, LastTaskCompleted-KNN

In [14]:
from sklearn.impute import KNNImputer

test_data = pd.read_csv('test_data_knn_imputed.csv')

# Select the two features 'CurrentTask_TargetEncoded' and 'LastTaskCompleted_TargetEncoded'
features_to_impute = ['CurrentTask_TargetEncoded', 'LastTaskCompleted_TargetEncoded']

# Initialize KNN imputer
imputer = KNNImputer(n_neighbors=5)

# Impute the missing values
imputed_data = imputer.fit_transform(test_data[features_to_impute])

# Transfer the filled data back to the DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=features_to_impute, index=test_data.index)

# Re-assign the filled features back to the original dataset
test_data[features_to_impute] = imputed_df[features_to_impute]

print("Processed testing data with imputed values:")
print(test_data.head())

# Save the imputed data to a new csv file
test_output_file_path = 'test_data_knn_imputed.csv'
test_data.to_csv(test_output_file_path, index=False)

Processed testing data with imputed values:
  UserID    QuestionTiming              TimeUtc CurrentGameMode  \
0     p1  System Initiated  2022-08-28 15:50:22             NaN   
1     p1    User Initiated  2022-08-28 16:05:02          Career   
2     p1    User Initiated  2022-09-07 03:31:50             NaN   
3     p1  System Initiated  2022-09-08 01:30:05             NaN   
4     p1  System Initiated  2022-09-08 01:43:45          Career   

                  CurrentTask  CurrentSessionLength LastTaskCompleted  \
0                         NaN                     0               NaN   
1  RECREATIONGROUND_SKATEPARK                    14               NaN   
2                         NaN                     0               NaN   
3                         NaN                     0               NaN   
4  RECREATIONGROUND_SKATEPARK                    13               NaN   

   LevelProgressionAmount QuestionType  CurrentTask_TargetEncoded  \
0                     NaN    Wellbeing       

### CurrentGameMode-Encoding

In [15]:
from sklearn.preprocessing import LabelEncoder

# load the data
test_data = pd.read_csv('test_data_knn_imputed.csv')
print(test_data['CurrentGameMode'].unique())

# Initialize LabelEncoder
le = LabelEncoder()

# Encoding 'CurrentGameMode' using Label Encoding
test_data['CurrentGameMode_LabelEncoded'] = le.fit_transform(test_data['CurrentGameMode'])

[nan 'Career' 'FreePlay' 'Special' 'Challenge']


### CurrentGameMode-KNN

In [16]:
# Initialize KNN Imputer
imputer = KNNImputer(n_neighbors=5)

# Select feature 'CurrentGameMode_LabelEncoded' to be imputed
features_to_impute = ['CurrentGameMode_LabelEncoded', 'CurrentTask_TargetEncoded']

# Impute the missing values
imputed_data = imputer.fit_transform(test_data[features_to_impute])

# Transfer the filled data back to the DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=features_to_impute, index=test_data.index)

# Re-new the original 'CurrentGameMode_LabelEncoded' data 
test_data['CurrentGameMode_LabelEncoded'] = imputed_df['CurrentGameMode_LabelEncoded']

print("Processed testing data with imputed CurrentGameMode:")
print(test_data.head())

# Save and renew the file
test_output_file_path = 'test_data_knn_imputed.csv'
test_data.to_csv(test_output_file_path, index=False)

Processed testing data with imputed CurrentGameMode:
  UserID    QuestionTiming              TimeUtc CurrentGameMode  \
0     p1  System Initiated  2022-08-28 15:50:22             NaN   
1     p1    User Initiated  2022-08-28 16:05:02          Career   
2     p1    User Initiated  2022-09-07 03:31:50             NaN   
3     p1  System Initiated  2022-09-08 01:30:05             NaN   
4     p1  System Initiated  2022-09-08 01:43:45          Career   

                  CurrentTask  CurrentSessionLength LastTaskCompleted  \
0                         NaN                     0               NaN   
1  RECREATIONGROUND_SKATEPARK                    14               NaN   
2                         NaN                     0               NaN   
3                         NaN                     0               NaN   
4  RECREATIONGROUND_SKATEPARK                    13               NaN   

   LevelProgressionAmount QuestionType  CurrentTask_TargetEncoded  \
0                     NaN    Wellbei

### TimeUtc-Convert Format

In [17]:
test_data = pd.read_csv("test_data_knn_imputed.csv")

# Ensure TimeUtc is a datetime type
test_data['TimeUtc'] = pd.to_datetime(test_data['TimeUtc'])

# Extract components
test_data['Year'] = test_data['TimeUtc'].dt.year
test_data['Month'] = test_data['TimeUtc'].dt.month
test_data['Day'] = test_data['TimeUtc'].dt.day
test_data['Hour'] = test_data['TimeUtc'].dt.hour
test_data['Minute'] = test_data['TimeUtc'].dt.minute
test_data['Second'] = test_data['TimeUtc'].dt.second

# Separate weekday and weekend days
test_data['Weekday'] = test_data['TimeUtc'].dt.dayofweek
test_data['WeekendFlag'] = (test_data['TimeUtc'].dt.weekday >= 5).astype(int)

# Categorize different times of day into periods
time_bins = [0, 6, 12, 18, 24]  # Define time_bins as 0-6, 6-12, 12-18, 18-24
time_labels = ['Night', 'Morning', 'Afternoon', 'Evening']
test_data['PeriodOfDay'] = pd.cut(test_data['TimeUtc'].dt.hour, bins=time_bins, labels=time_labels, right=False)

# Remove the original Timestamp column to avoid redundancy
test_data.drop('TimeUtc', axis=1, inplace=True)


### TimeUtc-Encoding

In [18]:
#encoding for TimeOfDay.
test_data = pd.get_dummies(test_data, columns=['PeriodOfDay'], drop_first=False)

test_data.head()

Unnamed: 0,UserID,QuestionTiming,CurrentGameMode,CurrentTask,CurrentSessionLength,LastTaskCompleted,LevelProgressionAmount,QuestionType,CurrentTask_TargetEncoded,LastTaskCompleted_TargetEncoded,...,Day,Hour,Minute,Second,Weekday,WeekendFlag,PeriodOfDay_Night,PeriodOfDay_Morning,PeriodOfDay_Afternoon,PeriodOfDay_Evening
0,p1,System Initiated,,,0,,,Wellbeing,50.0,51.0,...,28,15,50,22,6,1,False,False,True,False
1,p1,User Initiated,Career,RECREATIONGROUND_SKATEPARK,14,,0.563458,Wellbeing,32.0,51.0,...,28,16,5,2,6,1,False,False,True,False
2,p1,User Initiated,,,0,,,Wellbeing,50.0,51.0,...,7,3,31,50,2,0,True,False,False,False
3,p1,System Initiated,,,0,,,Wellbeing,50.0,51.0,...,8,1,30,5,3,0,True,False,False,False
4,p1,System Initiated,Career,RECREATIONGROUND_SKATEPARK,13,,0.829827,Wellbeing,32.0,51.0,...,8,1,43,45,3,0,True,False,False,False


In [19]:
# Save the transformation and encoding for 'TimeUtc' to the 'train_data_knn_imputed.csv' 
test_output_file_path = 'test_data_knn_imputed.csv'
test_data.to_csv(test_output_file_path, index=False)

### LevelProgressionAmount-KNN

In [20]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read the CSV file
file_path = 'test_data_knn_imputed.csv'
df = pd.read_csv(file_path)

# Select the column to encode and impute
column_to_encode_and_impute = 'LevelProgressionAmount'

# Encode the column
label_encoder = LabelEncoder()
df[column_to_encode_and_impute] = label_encoder.fit_transform(df[column_to_encode_and_impute].astype(str))

# Create KNNImputer instance
imputer = KNNImputer(n_neighbors=5)

# Perform KNN imputation on the selected column
df[[column_to_encode_and_impute]] = imputer.fit_transform(df[[column_to_encode_and_impute]])

# Standardize the imputed column
scaler = StandardScaler()
df[[column_to_encode_and_impute]] = scaler.fit_transform(df[[column_to_encode_and_impute]])

# Save the modified data back to the original file
df.to_csv(file_path, index=False)

print(f'Successfully encoded and performed KNN imputation on {column_to_encode_and_impute}. The result has been saved back to the original file {file_path}')


Successfully encoded and performed KNN imputation on LevelProgressionAmount. The result has been saved back to the original file test_data_knn_imputed.csv


### QuestionTiming-Dummy

In [21]:
import pandas as pd

file_path = 'test_data_knn_imputed.csv'
df = pd.read_csv(file_path)

df_encoded = pd.get_dummies(df, columns=['QuestionTiming'])

df_encoded.to_csv(file_path, index=False)

print(f'{file_path}')

test_data_knn_imputed.csv


### Adding UserAvgResponse

In [22]:
import pandas as pd

train_data_path = 'data/train_data.csv'
train_df = pd.read_csv(train_data_path)

user_means = train_df.groupby('UserID')['ResponseValue'].mean()

global_mean = train_df['ResponseValue'].mean()

test_data_path = 'test_data_knn_imputed.csv'
test_df = pd.read_csv(test_data_path)

test_df['UserAvgResponse'] = test_df['UserID'].map(user_means)

test_df['UserAvgResponse'].fillna(global_mean, inplace=True)

test_df.to_csv(test_data_path, index=False)

print(f'successfully add {test_data_path}')

successfully add test_data_knn_imputed.csv


## Modeling

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import lightgbm as lgb

# Load the data
data_path = 'train_data_knn_imputed.csv'
data = pd.read_csv(data_path)

# Selecting features and target variable
features = [
    "CurrentSessionLength", "CurrentGameMode_LabelEncoded", "CurrentTask_TargetEncoded",
    "LastTaskCompleted_TargetEncoded", "LevelProgressionAmount", "Month",
    "WeekendFlag", "PeriodOfDay_Night", "QuestionTiming_System Initiated","UserAvgResponse"
]
X = data[features]
y = data['ResponseValue']

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating and training the LightGBM model
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Setting parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l2', 'mae'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# Training the model
num_round = 100
bst = lgb.train(params, train_data, num_round, valid_sets=[test_data], callbacks=[lgb.early_stopping(stopping_rounds=10)])

# Predicting and evaluating the model
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Print the evaluation results
print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")
print(f"Mean Absolute Error: {mae}")

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 16650.8	valid_0's l1: 88.267
Mean Squared Error: 16650.827745675826
R-squared Score: 0.631057152155859
Mean Absolute Error: 88.26697410140719


In [25]:
import pandas as pd
import lightgbm as lgb


test_data_path = 'test_data_knn_imputed.csv'  
test_data = pd.read_csv(test_data_path)

features = [
    "CurrentSessionLength", "CurrentGameMode_LabelEncoded", "CurrentTask_TargetEncoded",
    "LastTaskCompleted_TargetEncoded", "LevelProgressionAmount", "Month",
    "WeekendFlag", "PeriodOfDay_Night", "QuestionTiming_System Initiated","UserAvgResponse"
]
X_test = test_data[features]

y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)

predicted_data = pd.DataFrame(y_pred)

output_file_path = 'predicted.csv'
predicted_data.to_csv(output_file_path, index=False, header=False)

print(f"Predicted data saved to: {output_file_path}")

Predicted data saved to: predicted.csv
