In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from sklearn.metrics import mean_absolute_error

In [2]:
train_df = pd.read_csv('train.csv')
train_df

Unnamed: 0,id,predicted
0,2_trans_497.csv,550
1,2_trans_483.csv,1093
2,2_trans_2396.csv,861
3,2_trans_1847.csv,1093
4,2_trans_2382.csv,488
...,...,...
2095,2_trans_1679.csv,1093
2096,2_trans_2370.csv,805
2097,2_trans_1692.csv,476
2098,2_trans_1876.csv,550


In [3]:
import scipy.integrate as it

In [4]:
norms = {
    'H2': 0.005,
    'CH4': 0.008,
    'C2H6': 0.006,
    'C2H2': 0.0008,
    'CO': 0.053,
    'CO2': 0.52
}
data_train_path = './data_train/data_train/'

def prepocess_timeseries(data_df):
    c2h6, c2h2, ch4 = [], [], []
    for i, row in data_df.iterrows():
        c2h6.append(row['H2'] + row['C2H4'])
        c2h2.append(row['H2'] + row['CO']*2)
        ch4.append((row['H2'] + row['CO'])*2)        
    data_df['C2H6'] = c2h6
    data_df['C2H2'] = c2h2
    data_df['CH4'] = ch4


    for name in data_df.columns:
        integr = np.concatenate(([0], it.cumtrapz(data_df[name])))
        inv_integr = np.flip(np.concatenate(([0], it.cumtrapz(data_df[name].iloc[::-1]))))
        data_df[name + "_integ"] = integr - inv_integr
        data_df[name + "_delta"] = np.concatenate(([0], np.diff(data_df[name])))

    for norm in norms:
        if norm in data_df.columns:
            data_df[norm + "_norm"] = data_df[norm] - norms[norm]
    
    return data_df
    

In [5]:
for file in tqdm(os.listdir(data_train_path)):
    data_df = pd.read_csv(data_train_path + file)
    data_df = prepocess_timeseries(data_df)
    data_df.to_csv('./data_train/data_train_i/'+ file, index=None)

NameError: name 'os' is not defined

In [5]:
def flatten_data(
                base_columns_names,
                columns_repeat_n, 
                data_path, 
                file_names, 
                categories=None,):
    columns = []
    for i in range(columns_repeat_n):
        for name in base_columns_names:
            columns.append(str(i) + "_" + name)
    if categories != None:
        columns.append("predicted")

    data = []
    for i in tqdm(range(len(file_names))):
        new_row = pd.read_csv(data_path + file_names[i]).values.flatten()
        if categories != None:
            new_row = np.append(new_row, categories[i])
        data.append(dict(zip(columns, new_row)))
    return pd.DataFrame(data)

In [6]:
data_train_i_path = './data_train/data_train_i/'
example_df = pd.read_csv(data_train_i_path + '2_trans_2.csv')
out_df = flatten_data(example_df.columns, len(example_df), data_train_i_path, train_df['id'].to_list(), train_df['predicted'].to_list())
out_df

100%|██████████| 2100/2100 [00:11<00:00, 179.05it/s]


Unnamed: 0,0_H2,0_CO,0_C2H4,0_C2H2,0_C2H6,0_CH4,0_H2_integ,0_H2_delta,0_CO_integ,0_CO_delta,...,419_C2H6_integ,419_C2H6_delta,419_CH4_integ,419_CH4_delta,419_H2_norm,419_CH4_norm,419_C2H6_norm,419_C2H2_norm,419_CO_norm,predicted
0,0.001202,0.029565,0.001069,0.060332,0.002271,0.061533,-0.664703,0.0,-14.043494,0.0,...,1.524587,0.000025,29.416394,0.000159,-0.002706,0.080787,0.000924,0.085693,-0.010901,550.0
1,0.001875,0.030855,0.002613,0.063585,0.004487,0.065459,-0.923330,0.0,-13.703315,0.0,...,2.395344,0.000039,29.253291,0.000119,-0.002236,0.071784,0.003284,0.076220,-0.015872,1093.0
2,0.000947,0.021001,0.001025,0.042949,0.001973,0.043896,-0.474773,0.0,-10.968896,0.0,...,1.066013,0.000028,22.887337,0.000188,-0.003184,0.069402,-0.001184,0.074786,-0.016115,861.0
3,0.000720,0.017019,0.004584,0.034759,0.005304,0.035479,-0.398885,0.0,-8.558554,0.0,...,2.877564,0.000024,17.914879,0.000136,-0.003341,0.051405,0.004263,0.056946,-0.024956,1093.0
4,0.001791,0.009544,0.007192,0.020879,0.008983,0.022670,-0.809888,0.0,-4.542676,0.0,...,4.671153,0.000033,10.705129,0.000097,-0.002909,0.024430,0.009342,0.029538,-0.038876,488.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2095,0.001043,0.009424,0.002751,0.019891,0.003793,0.020934,-0.589296,0.0,-4.551730,0.0,...,2.200328,0.000033,10.282052,0.000055,-0.002851,0.020077,0.002855,0.025128,-0.041111,1093.0
2096,0.000631,0.023220,0.003757,0.047072,0.004388,0.047703,-0.329963,0.0,-10.964632,0.0,...,2.290317,0.000028,22.589191,0.000163,-0.003895,0.065828,0.002627,0.071923,-0.017191,805.0
2097,0.002005,0.020167,0.002409,0.042339,0.004415,0.044345,-1.156283,0.0,-9.822446,0.0,...,2.491536,0.000035,21.957459,0.000243,-0.000661,0.064187,0.003042,0.067047,-0.021246,476.0
2098,0.002933,0.008451,0.000209,0.019834,0.003141,0.022767,-1.296014,0.0,-4.914527,0.0,...,1.786803,0.000030,12.421083,0.000152,-0.001238,0.043733,0.002299,0.047171,-0.030896,550.0


In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [8]:
 X, y = out_df.drop('predicted', axis=1), out_df['predicted']
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)



In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

In [11]:
reg = RandomForestRegressor(n_jobs=-1)
reg.fit(X_train, y_train)
#clf.fit(out_df.drop('category', axis=1), out_df['category'])

RandomForestRegressor(n_jobs=-1)

In [12]:
mean_absolute_error(y_test, reg.predict(X_test))

125.40023088023088

In [26]:
reg.score(X_test, y_test)

0.5456910347561887

# Тенсорка входит в здание

In [11]:
import tensorflow as tf

In [12]:
target = out_df['predicted']
dataset = tf.data.Dataset.from_tensor_slices((out_df.drop('predicted', axis=1).values, target.values-1))

In [19]:
train_dataset = dataset.shuffle(len(out_df)).batch(1)
def get_compiled_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(3000, activation='relu'),
        tf.keras.layers.Dense(500, activation='relu'),
        tf.keras.layers.Dense(100, activation='relu'),
        tf.keras.layers.Dense(30, activation='relu'),
        tf.keras.layers.Dense(1)
    ])
    model.compile(loss='mean_absolute_error',
                optimizer=tf.keras.optimizers.Adam(0.001))
    return model


In [20]:
model = get_compiled_model()
model.fit(train_dataset, epochs=50)

Epoch 1/50
  67/2100 [..............................] - ETA: 1:19:32 - loss: 444.4544

KeyboardInterrupt: 

In [15]:
data_test_dir = './data_test/data_test/'

In [17]:
for file in tqdm(os.listdir(data_test_dir)):
    data_df = pd.read_csv(data_test_dir + file)
    data_df = prepocess_timeseries(data_df)
    data_df.to_csv('./data_test/data_test_i/'+ file, index=None)
example_test_df = pd.read_csv(data_test_dir + os.listdir(data_test_dir)[0])

100%|██████████| 900/900 [00:46<00:00, 19.45it/s]


In [18]:
data_test_i_dir = './data_test/data_test_i/'
example_test_df = pd.read_csv(data_test_i_dir + os.listdir(data_test_dir)[0])
test_file_names = os.listdir(data_test_i_dir)
test_complete_df = flatten_data(example_test_df.columns, len(example_test_df), data_test_i_dir, test_file_names)
    


100%|██████████| 900/900 [00:04<00:00, 205.31it/s]


In [20]:
predicts = reg.predict(test_complete_df)

In [22]:
pd.DataFrame({'id': os.listdir(data_test_dir), 'predicted': predicts.astype('int')}, index=None).to_csv('test.csv', index=None)