In [328]:
import pandas as pd
import numpy as np
import json
import os
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import pyarrow.parquet as pa
import tensorflow as tf
import pyspark as ps
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import arrays_zip, explode, col

In [329]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

In [330]:
spark = SparkSession.builder.getOrCreate()

In [331]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [332]:
spark

In [333]:
df = spark.read.parquet('train.parquet')

In [334]:
df.show()

+-----+--------------------+--------------------+-----+
|   id|               dates|              values|label|
+-----+--------------------+--------------------+-----+
|19114|[2016-01-01, 2016...|[-1.86, 0.79, 1.4...|  0.0|
|22769|[2016-05-01, 2016...|[-1.04, -3.48, 0....|  1.0|
|76935|[2017-03-01, 2017...|[0.28, 0.63, 0.06...|  0.0|
|66297|[2016-01-01, 2016...|[-0.33, 0.58, 1.1...|  0.0|
| 2191|[2016-01-01, 2016...|[1.31, 0.5, -0.54...|  0.0|
|59504|[2016-03-01, 2016...|[0.08, 0.88, 1.46...|  0.0|
|49554|[2016-04-01, 2016...|[1.05, -0.28, 1.0...|  0.0|
|58344|[2016-12-01, 2017...|[-0.36, -0.45, -0...|  0.0|
|87449|[2016-01-01, 2016...|[2.92, 4.11, 3.39...|  1.0|
|43415|[2016-02-01, 2016...|[-0.86, -0.05, -0...|  1.0|
|76676|[2016-01-01, 2016...|[2.24, 0.46, 1.41...|  0.0|
| 4321|[2016-01-01, 2016...|[1.1, 1.73, 2.04,...|  0.0|
|17921|[2016-01-01, 2016...|[1.06, 1.04, 0.62...|  0.0|
|60176|[2016-01-01, 2016...|[1.41, -0.62, -1....|  0.0|
|61864|[2016-01-01, 2016...|[0.24, 0.37, 0.97...

In [335]:
df.printSchema

<bound method DataFrame.printSchema of DataFrame[id: bigint, dates: array<date>, values: array<double>, label: double]>

In [336]:
df.pandas_api().isna().mean()

id        0.0
dates     0.0
values    0.0
label     0.0
dtype: float64

In [337]:
df_zip = df.withColumn("zip_date_val", arrays_zip("dates", "values"))

In [338]:
df_exp = df_zip.withColumn("explod_date_val", explode("zip_date_val"))

In [347]:
df_new = df_exp.select(
    col("id"),
    col("explod_date_val.dates").alias("date"),
    col("explod_date_val.values").alias("value"),
    col("label")
)

In [348]:
df_new.printSchema

<bound method DataFrame.printSchema of DataFrame[id: bigint, date: date, value: double, label: double]>

In [349]:
df_new = df_new.toPandas()

In [350]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5155412 entries, 0 to 5155411
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      int64  
 1   date    object 
 2   value   float64
 3   label   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 157.3+ MB


In [351]:
df_new.isna().sum()

id          0
date        0
value    4582
label       0
dtype: int64

In [352]:
df_new = df_new.dropna()

In [353]:
df_new.isna().sum()

id       0
date     0
value    0
label    0
dtype: int64

In [354]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5150830 entries, 0 to 5155411
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      int64  
 1   date    object 
 2   value   float64
 3   label   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 196.5+ MB


In [397]:
df_new['date'] = df_new['date'].values.astype("float64")

In [398]:
df_new['value'] = df_new['value'].values.astype("float64")

In [399]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5150830 entries, 0 to 5155411
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      int64  
 1   date    float64
 2   value   float64
 3   label   int64  
dtypes: float64(2), int64(2)
memory usage: 196.5 MB


In [380]:
#df_new['date'] = df_new['date'].as  , format='%Y-%m-%d')

In [356]:
#df_new['value'] = pd.to_numeric(df_new['value'], errors='coerce')

In [361]:
# df_new['value'] = df_new['value'].astype(str)

In [366]:
# df_new['label'] = df_new['label'].astype('int64')

In [373]:
# df_new['date'] = df_new['date'].astype(str)

In [386]:
# df_new['value'] = df_new['value'].astype(str).str.replace(' ', '')
# df_new['value'] = df_new['value'].str.replace('$', '')
# df_new['value'] = pd.to_numeric(df_new['value'], errors='coerce')

In [400]:
df_new.head()

Unnamed: 0,id,date,value,label
0,19114,1.451606e+18,-1.86,0
1,19114,1.454285e+18,0.79,0
2,19114,1.45679e+18,1.4,0
3,19114,1.459469e+18,0.15,0
4,19114,1.462061e+18,0.0,0


In [401]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5150830 entries, 0 to 5155411
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      int64  
 1   date    float64
 2   value   float64
 3   label   int64  
dtypes: float64(2), int64(2)
memory usage: 196.5 MB


In [402]:
x = df_new.drop(['label'], axis= 1)
y = df_new['label']
x.shape
y.shape

(5150830, 3)

(5150830,)

In [407]:
params_grid = {
    'n_estimators' : [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)],
    'max_depth' : [int(x) for x in np.linspace(start = 1, stop = 15, num = 15)],
    'min_samples_split' : [int(x) for x in np.linspace(start = 2, stop = 50, num = 10)],
    'min_samples_leaf' : [int(x) for x in np.linspace(start = 2, stop = 50, num = 10)]
}

In [404]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 12345)

In [408]:
dt_model = RandomForestClassifier()
grid_search = RandomizedSearchCV(dt_model, params_grid, error_score='raise', cv= 5)

In [None]:
grid_search.fit(x_train, y_train)

In [None]:
print(12)

In [385]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5150830 entries, 0 to 5155411
Data columns (total 4 columns):
 #   Column  Dtype         
---  ------  -----         
 0   id      int64         
 1   date    datetime64[ns]
 2   value   object        
 3   label   int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 196.5+ MB


In [313]:
df_new = df_new.dropna()

In [315]:
df_new['value'].isna().sum()

0

In [321]:
df_new['label'].isna().sum()

0

In [76]:
dt = pd.read_parquet('train.parquet')
# dt['label'].unique()
# dt['label'] = dt['label'].apply(int)
# dt['values'] = dt['values'].apply(eval)
dt['values'] = dt['values'].apply(np.array)
dt['values'] = dt['values'].apply(lambda x: ','.join(map(str, x)))
dt['values'] = dt['values'].apply(lambda x: list(map(float, x.split(','))))
# dt['dates'] = dt['dates'].apply(np.array)
# dt['values'][0]

In [155]:
dt_model = RandomForestClassifier()
grid_search = GridSearchCV(dt_model, params_grid, error_score='raise', cv= 5)
# grid_search.fit(x_train, y_train)

In [156]:
grid_search.fit(x_train, y_train)

In [157]:
grid_search.best_params_

{'n_estimators': 70, 'min_samples_leaf': 1, 'max_depth': 10}

In [29]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      80000 non-null  int64  
 1   dates   80000 non-null  object 
 2   values  80000 non-null  object 
 3   label   80000 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 2.4+ MB


In [16]:
dt['dates'][0]

array([datetime.date(2016, 1, 1), datetime.date(2016, 2, 1),
       datetime.date(2016, 3, 1), datetime.date(2016, 4, 1),
       datetime.date(2016, 5, 1), datetime.date(2016, 6, 1),
       datetime.date(2016, 7, 1), datetime.date(2016, 8, 1),
       datetime.date(2016, 9, 1), datetime.date(2016, 10, 1),
       datetime.date(2016, 11, 1), datetime.date(2016, 12, 1),
       datetime.date(2017, 1, 1), datetime.date(2017, 2, 1),
       datetime.date(2017, 3, 1), datetime.date(2017, 4, 1),
       datetime.date(2017, 5, 1), datetime.date(2017, 6, 1),
       datetime.date(2017, 7, 1), datetime.date(2017, 8, 1),
       datetime.date(2017, 9, 1), datetime.date(2017, 10, 1),
       datetime.date(2017, 11, 1), datetime.date(2017, 12, 1),
       datetime.date(2018, 1, 1), datetime.date(2018, 2, 1),
       datetime.date(2018, 3, 1), datetime.date(2018, 4, 1),
       datetime.date(2018, 5, 1), datetime.date(2018, 6, 1),
       datetime.date(2018, 7, 1), datetime.date(2018, 8, 1),
       datetime.da

In [99]:
dat = pd.read_csv('sample_submission.csv')
dat

Unnamed: 0,id,score
0,6125.0,0.757097
1,26781.0,0.346173
2,13333.0,0.431305
3,53218.0,0.847472
4,84204.0,0.065298
...,...,...
19996,5891.0,0.995766
19997,29091.0,0.629173
19998,85877.0,0.477870
19999,73528.0,0.477469


In [292]:
df = pd.read_parquet('train.parquet')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      80000 non-null  int64  
 1   dates   80000 non-null  object 
 2   values  80000 non-null  object 
 3   label   80000 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 2.4+ MB


In [135]:
all_unique_dates = sorted(set([date for sublist in df['dates'] for date in sublist]))
for date in all_unique_dates:
    df[date] = np.nan  # Инициализация столбцов с NaN
for index, row in df.iterrows():
    for date, value in zip(row['dates'], row['values']):
        df.at[index, date] = value


In [138]:
df = df.drop(columns=['dates', 'values'])

In [143]:
df.columns = df.columns.astype(str)

In [149]:
df.fillna(df.mean(), inplace= True)