In [61]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt

base_path = "/kaggle/input/trends-assessment-prediction/"

In [62]:
# read non .mat data into dataframes
fnc = pd.read_csv(base_path + "fnc.csv")
loading = pd.read_csv(base_path + "loading.csv")
train_scores = pd.read_csv(base_path + "train_scores.csv")

In [63]:
# basic preproccessing

data_list = [loading, train_scores]

for data in data_list:
    for col in data.columns:
        #mean = np.mean(data[col])
        data[col].fillna(data[col].mean(), inplace=True)

In [64]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5877 entries, 0 to 5876
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            5877 non-null   int64  
 1   age           5877 non-null   float64
 2   domain1_var1  5877 non-null   float64
 3   domain1_var2  5877 non-null   float64
 4   domain2_var1  5877 non-null   float64
 5   domain2_var2  5877 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 275.6 KB


In [65]:
# splitting into train and pred without .mat files
loading_train_test = loading[loading.Id.isin(train_scores.Id)]
loading_for_pred = loading[~loading.Id.isin(train_scores.Id)]
print(loading_train_test.shape)
print(loading_for_pred.shape)

# fnc_train_test = fnc[fnc.Id.isin(train_scores.Id)]
# fnc_for_pred = fnc[~fnc.Id.isin(train_scores.Id)]
# print(fnc_train.shape)
# print(fnc_test.shape)

(5877, 27)
(5877, 27)


In [66]:
# splitting data into further train and test
# can be improved by training several models on several folds eg. Stratified shuffle split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(loading_train_test, train_scores, test_size=0.2,random_state=42)

# first row coltains int64 which cannot be handled by regressor

X_train = X_train.drop(columns=['Id']).dropna()
y_train = y_train.drop(columns=['Id']).dropna()

In [67]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4701 entries, 5044 to 860
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age           4701 non-null   float64
 1   domain1_var1  4701 non-null   float64
 2   domain1_var2  4701 non-null   float64
 3   domain2_var1  4701 non-null   float64
 4   domain2_var2  4701 non-null   float64
dtypes: float64(5)
memory usage: 220.4 KB


In [68]:
print(y_train.isnull().sum())


age             0
domain1_var1    0
domain1_var2    0
domain2_var1    0
domain2_var2    0
dtype: int64


In [69]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4701 entries, 10128 to 1744
Data columns (total 26 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   IC_01   4701 non-null   float64
 1   IC_07   4701 non-null   float64
 2   IC_05   4701 non-null   float64
 3   IC_16   4701 non-null   float64
 4   IC_26   4701 non-null   float64
 5   IC_06   4701 non-null   float64
 6   IC_10   4701 non-null   float64
 7   IC_09   4701 non-null   float64
 8   IC_18   4701 non-null   float64
 9   IC_04   4701 non-null   float64
 10  IC_12   4701 non-null   float64
 11  IC_24   4701 non-null   float64
 12  IC_15   4701 non-null   float64
 13  IC_13   4701 non-null   float64
 14  IC_17   4701 non-null   float64
 15  IC_02   4701 non-null   float64
 16  IC_08   4701 non-null   float64
 17  IC_03   4701 non-null   float64
 18  IC_21   4701 non-null   float64
 19  IC_28   4701 non-null   float64
 20  IC_11   4701 non-null   float64
 21  IC_20   4701 non-null   float64
 

In [70]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV

features = ('age', 'domain1_var1', 'domain1_var2','domain2_var1','domain2_var2')

model = RandomForestRegressor(
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5
)
cv = KFold(n_splits = 5, shuffle=True, random_state=29)
grid = {'n_estimators':[5,10,20,100]}
gs = GridSearchCV(model, grid, n_jobs=-1, cv=cv, verbose=1, scoring='neg_mean_absolute_error')

In [71]:

best_models = {}
for col in features:
    gs.fit(X_train, y_train[col])
    best_models[col] = gs.best_estimator_
    print(gs.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   17.3s finished


-8.122770630441561
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   17.1s finished


-7.448719079202265
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   18.3s finished


-8.341537428765125
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   17.4s finished


-8.660151743492072
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   18.1s finished


-9.26527616909421


In [72]:
y_train.head()

Unnamed: 0,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2
5044,59.580851,69.21256,69.657537,50.947179,53.575847
3826,64.203107,64.619363,52.109189,41.002507,60.413587
4828,55.456978,64.130126,42.995646,46.26066,43.965746
5475,53.583805,39.386597,56.733228,52.841295,58.998451
5269,55.456978,54.092357,64.068458,45.002598,30.172197


In [75]:
predictions = pd.DataFrame(loading_for_pred, columns=['Id'], dtype=str)

In [79]:
for col in features:
    predictions[col] = best_models[col].predict(loading_for_pred.drop(columns=['Id']))

In [82]:
def make_sub(predictions):
    features = ('age', 'domain1_var1', 'domain1_var2','domain2_var1','domain2_var2')
    _columns = (0,1,2,3,4)
    tests = predictions.rename(columns=dict(zip(features, _columns)))
    tests = tests.melt(id_vars='Id',value_vars=_columns,value_name='Predicted')
    tests['target'] = tests.variable.map(dict(zip(_columns, features)))
    tests['Id_'] = tests[['Id', 'target']].apply(lambda x: '_'.join((str(x[0]), str(x[1]))), axis=1)
  
    return tests.sort_values(by=['Id', 'variable'])\
              .drop(['Id', 'variable', 'target'],axis=1)\
              .rename(columns={'Id_':'Id'})\
              .reset_index(drop=True)\
              [['Id', 'Predicted']]

In [83]:
sub = make_sub(predictions)
sub.head()

Unnamed: 0,Id,Predicted
0,10003_age,44.995419
1,10003_domain1_var1,49.333563
2,10003_domain1_var2,59.177037
3,10003_domain2_var1,48.042128
4,10003_domain2_var2,54.314531


In [None]:
sub.to_csv('firstmodel', index=False)