In [1]:
# https://www.kaggle.com/code/matanivanov/lgbm-with-fourier-transform

In [2]:
import pandas as pd
# Read the three competition tables (train sensors, train labels, test sensors)
# from the same Kaggle input directory.
train, labels, test = (
    pd.read_csv(f'../input/tabular-playground-series-apr-2022/{table}.csv')
    for table in ('train', 'train_labels', 'test')
)

In [3]:
# Sanity check: count the distinct `step` values in every (subject, sequence)
# group, then tally how many groups share each count.
(
    train
    .groupby(['subject', 'sequence'])['step']
    .nunique()
    .value_counts()
)

60    25968
Name: step, dtype: int64

In [4]:
# Preview the first five rows of the raw training frame.
train.head(n=5)

Unnamed: 0,sequence,subject,step,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09,sensor_10,sensor_11,sensor_12
0,0,47,0,-0.196291,0.112395,1.0,0.329204,-1.00466,-0.131638,-0.127505,0.368702,-0.1,-0.963873,-0.985069,0.531893,4.751492
1,0,47,1,-0.44745,0.134454,1.0,-0.658407,0.162495,0.340314,-0.209472,-0.867176,0.2,-0.301301,0.082733,-0.231481,0.45439
2,0,47,2,0.326893,-0.694328,1.0,0.330088,0.473678,1.280479,-0.094718,0.535878,1.4,1.002168,0.449221,-0.58642,-4.736147
3,0,47,3,0.523184,0.75105,1.0,0.976991,-0.563287,-0.720269,0.79326,0.951145,-0.3,-0.995665,-0.43429,1.34465,0.429241
4,0,47,4,0.272025,1.07458,1.0,-0.136283,0.398579,0.044877,0.560109,-0.541985,-0.9,1.055636,0.812631,0.123457,-0.223359


In [5]:
# Make frequency features with Fourier transform

from scipy.fft import rfft
# computes the 1-D n-point discrete Fourier Transform (DFT) 
# of a real-valued array by means of an efficient algorithm 
# called the Fast Fourier Transform (FFT).
import numpy as np

def make_fft_features(group, skip_cols=('sequence', 'subject', 'step')):
    """Turn one group's signal columns into a flat Series of FFT magnitudes.

    For every column of ``group`` that is not listed in ``skip_cols`` the
    real-input FFT of the column values is computed and the absolute value of
    each complex coefficient is kept. The magnitudes of all columns are laid
    out one after another, in column order, and labelled
    ``'<column>_freq_<i>'`` where ``i`` is the position of the coefficient in
    the rFFT output.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group, already ordered the way the signal should be
        read (the function does not sort).
    skip_cols : collection of str, optional
        Column names that are identifiers rather than signals and must not be
        transformed. Defaults to ``('sequence', 'subject', 'step')``.

    Returns
    -------
    pandas.Series
        Magnitudes indexed by ``'<column>_freq_<i>'``. An empty float Series
        is returned when ``group`` has no signal columns.
    """
    labels = []
    magnitudes = []
    for col in group.columns:
        if col in skip_cols:
            continue
        spectrum = np.abs(rfft(group[col].values))
        # Take the number of bins from the transform itself instead of
        # hard-coding 31, so groups with a length other than 60 still work.
        labels.extend(f'{col}_freq_{i}' for i in range(len(spectrum)))
        magnitudes.append(spectrum)

    if not magnitudes:
        # Explicit dtype: a bare pd.Series() is deprecated and is object-typed
        # on recent pandas versions.
        return pd.Series(dtype='float64')

    # Assemble the result once rather than growing a Series with pd.concat on
    # every iteration.
    return pd.Series(np.concatenate(magnitudes), index=labels)

# Order the rows so each sequence is read step by step, then collapse every
# (sequence, subject) group into a single row of FFT-magnitude features.
train_df = (
    train
    .sort_values(['subject', 'sequence', 'step'])
    .groupby(['sequence', 'subject'])
    .apply(make_fft_features)
)
train_df.head()

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0_level_0,Unnamed: 1_level_0,sensor_00_freq_0,sensor_00_freq_1,sensor_00_freq_2,sensor_00_freq_3,sensor_00_freq_4,sensor_00_freq_5,sensor_00_freq_6,sensor_00_freq_7,sensor_00_freq_8,sensor_00_freq_9,...,sensor_12_freq_21,sensor_12_freq_22,sensor_12_freq_23,sensor_12_freq_24,sensor_12_freq_25,sensor_12_freq_26,sensor_12_freq_27,sensor_12_freq_28,sensor_12_freq_29,sensor_12_freq_30
sequence,subject,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,47,2.504637,2.595997,2.94897,2.906888,2.510662,3.610021,6.967675,5.406874,5.151776,5.505501,...,56.505338,25.495893,45.878652,32.379605,32.416223,41.594143,34.651699,39.046842,31.815038,27.425405
1,66,4.157651,4.200879,4.329791,4.91533,4.735261,4.744518,6.233446,7.827523,2.507145,7.286906,...,153.04273,220.163202,312.540744,186.496505,134.465946,256.337778,176.815707,163.391993,181.534795,84.116368
2,66,0.092736,1.069003,0.969934,2.543404,1.580156,0.734563,5.417217,5.094902,5.865716,5.179019,...,0.538722,1.898862,2.276829,1.216976,3.7936,2.116767,1.104315,2.443818,3.374227,5.546036
3,542,1.792117,2.226034,2.929472,1.060576,2.625198,4.091992,3.41815,2.407677,6.570212,7.279072,...,2.908066,2.280553,2.176175,1.438037,1.131137,3.959018,0.655095,3.020249,2.022848,5.278772
4,437,3.547913,3.386796,2.926308,4.566577,5.807639,0.868438,14.607558,5.334614,6.761871,22.263778,...,0.299693,0.616626,0.288449,0.361617,0.672647,0.051619,0.522704,0.215556,0.369711,0.339301


In [6]:
# Number of distinct sequences recorded for each subject.
n_sequence = (
    train
    .groupby('subject')['sequence']
    .nunique()
)
n_sequence.head()

subject
0     20
1    175
2     38
3     36
4     26
Name: sequence, dtype: int64

In [7]:
# Convert each subject's sequence count into a percentile-style score:
# rank the counts (ties take the highest rank of the tie), shift so the lowest
# possible rank maps to 0, and scale by the number of subjects to get 0-100.
sequence_rank = n_sequence.rank(method='max')
perc_sequence = (100.0 * (sequence_rank - 1) / len(n_sequence)).rename('n_sequence_percentile')
perc_sequence.head()

subject
0    14.285714
1    99.702381
2    62.351190
3    58.482143
4    29.910714
Name: n_sequence_percentile, dtype: float64

In [8]:
# Move the (sequence, subject) index levels back into columns, then attach the
# per-subject percentile (joined on 'subject') and the labels table (joined on
# 'sequence').
train_df = (
    train_df
    .reset_index()
    .merge(perc_sequence, on='subject')
    .merge(labels, on='sequence')
)
train_df.head()

Unnamed: 0,sequence,subject,sensor_00_freq_0,sensor_00_freq_1,sensor_00_freq_2,sensor_00_freq_3,sensor_00_freq_4,sensor_00_freq_5,sensor_00_freq_6,sensor_00_freq_7,...,sensor_12_freq_23,sensor_12_freq_24,sensor_12_freq_25,sensor_12_freq_26,sensor_12_freq_27,sensor_12_freq_28,sensor_12_freq_29,sensor_12_freq_30,n_sequence_percentile,state
0,0,47,2.504637,2.595997,2.94897,2.906888,2.510662,3.610021,6.967675,5.406874,...,45.878652,32.379605,32.416223,41.594143,34.651699,39.046842,31.815038,27.425405,98.065476,0
1,121,47,2.015456,2.29653,1.811023,2.609596,5.933611,5.177809,3.282827,6.65246,...,16.436525,22.797195,37.939752,20.02553,20.446737,11.834383,0.510406,15.83376,98.065476,1
2,156,47,9.068006,13.507249,26.911191,45.104476,66.001563,87.674315,101.652947,109.029241,...,100.342807,113.971795,138.361735,59.606223,100.369674,19.717507,61.806974,31.247656,98.065476,1
3,356,47,0.209428,0.663546,0.330469,0.387669,0.57911,3.711075,3.386412,8.453457,...,23.868128,36.819744,2.430822,18.272264,9.867578,24.285402,26.592332,23.478687,98.065476,1
4,660,47,0.863988,0.508295,1.358652,0.830933,3.506715,2.506207,2.704869,5.660748,...,37.664327,58.851105,49.871422,31.984101,47.527202,20.673605,38.302775,42.216539,98.065476,1


In [9]:
# Test-set transformation: same steps as for train, computed on the test
# frame only (the percentile is ranked among test subjects).

# 1. FFT-magnitude features, one row per (sequence, subject) group.
test_df = (
    test
    .sort_values(['subject', 'sequence', 'step'])
    .groupby(['sequence', 'subject'])
    .apply(make_fft_features)
)

# 2. Per-subject sequence count turned into a 0-100 percentile-style score.
n_sequence_test = test.groupby('subject')['sequence'].nunique()
test_sequence_rank = n_sequence_test.rank(method='max')
perc_sequence_test = 100.0 * (test_sequence_rank - 1) / len(n_sequence_test)
perc_sequence_test.name = 'n_sequence_percentile'

# 3. Restore the group keys as columns and attach the percentile by subject.
test_df = (
    test_df
    .reset_index()
    .merge(perc_sequence_test, on='subject')
)
test_df.head()

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,sequence,subject,sensor_00_freq_0,sensor_00_freq_1,sensor_00_freq_2,sensor_00_freq_3,sensor_00_freq_4,sensor_00_freq_5,sensor_00_freq_6,sensor_00_freq_7,...,sensor_12_freq_22,sensor_12_freq_23,sensor_12_freq_24,sensor_12_freq_25,sensor_12_freq_26,sensor_12_freq_27,sensor_12_freq_28,sensor_12_freq_29,sensor_12_freq_30,n_sequence_percentile
0,25968,684,0.156105,0.408389,0.118233,0.788171,0.426651,1.71525,0.829751,7.209688,...,1.862775,3.424352,2.261977,0.812867,1.857509,6.876121,2.524938,1.24863,0.856351,93.416928
1,26059,684,10.459042,10.605043,12.863984,14.817426,12.581004,9.93024,10.904557,7.857821,...,0.16891,1.781986,1.86813,4.907799,2.758934,1.122004,2.163796,3.331648,0.147911,93.416928
2,26176,684,0.653014,1.134623,2.58503,3.271294,9.145892,9.724072,2.781895,10.673508,...,4.899978,3.716107,3.699582,6.736932,3.70902,4.37985,1.112741,11.97787,3.142796,93.416928
3,26249,684,1.063369,0.258448,2.002965,0.64873,1.456648,2.24459,1.853217,7.909807,...,1.481531,1.516283,4.258846,6.816363,6.703618,5.741161,5.995649,7.602047,10.799659,93.416928
4,26258,684,0.536321,1.027116,1.361058,0.881242,3.053,2.949359,2.289942,7.792656,...,3.139098,3.54911,4.545558,2.716155,1.021928,1.168896,3.154242,2.033291,3.535379,93.416928


In [10]:
# Display the column labels of the training feature table to confirm which
# columns are present before exporting.
train_df.columns

Index(['sequence', 'subject', 'sensor_00_freq_0', 'sensor_00_freq_1',
       'sensor_00_freq_2', 'sensor_00_freq_3', 'sensor_00_freq_4',
       'sensor_00_freq_5', 'sensor_00_freq_6', 'sensor_00_freq_7',
       ...
       'sensor_12_freq_23', 'sensor_12_freq_24', 'sensor_12_freq_25',
       'sensor_12_freq_26', 'sensor_12_freq_27', 'sensor_12_freq_28',
       'sensor_12_freq_29', 'sensor_12_freq_30', 'n_sequence_percentile',
       'state'],
      dtype='object', length=407)

In [11]:
# Persist both feature tables as CSV in the working directory.
# index=True writes the row index as an extra leading column in each file.
train_df.to_csv(path_or_buf='train_tps_apr_22_fev2.csv', index=True)
test_df.to_csv(path_or_buf='test_tps_apr_22_fev2.csv', index=True)