# Synthetic feature filtering and DTW trajectory clustering

In [1]:
import random
from copy import deepcopy

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from dtaidistance import dtw
from plotly.subplots import make_subplots
from scipy import interpolate
from scipy.stats import boxcox, pearsonr

In [2]:
# Shape of the synthetic feature matrix: NUM_OF_ROWS observations of
# NUM_OF_COLUMNS features each.
NUM_OF_ROWS, NUM_OF_COLUMNS = 100, 1000

# Cut-off consulted by the trajectory-merging loop further down: merging stops
# once the closest pair of clusters is farther apart than this value.
THRESHOLD = 0.75

In [5]:
# Build the synthetic feature matrix from a seeded generator so that
# "Restart Kernel -> Run All" reproduces the same values and the same
# missing-value pattern on every run.
DATA_SEED = 42
rng = np.random.default_rng(DATA_SEED)

# Uniform values in [0, 1), one column per feature, named feature_1 .. feature_N.
data = pd.DataFrame(rng.random((NUM_OF_ROWS, NUM_OF_COLUMNS)),
                    columns=['feature_' + str(value) for value in range(1, NUM_OF_COLUMNS + 1)])

# Replace roughly 2.5% of the cells (chosen independently) with NaN.
data = data.mask(rng.random(data.shape) < .025)

In [6]:
# Preview the first rows of the synthetic frame (rich display of the last expression).
data.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_991,feature_992,feature_993,feature_994,feature_995,feature_996,feature_997,feature_998,feature_999,feature_1000
0,0.563548,0.201872,0.659316,0.656748,0.390159,0.486311,0.967334,0.645527,0.176102,0.661272,...,0.995825,0.828239,0.838296,0.015447,0.697379,0.578543,0.739551,0.343945,0.457727,0.977714
1,0.730295,0.72754,0.824477,0.082026,0.217196,0.189307,0.108888,0.558801,0.828666,0.275883,...,0.470842,0.302836,0.608771,0.289481,0.760974,0.961347,0.203498,0.741968,0.208265,0.796162
2,0.67452,0.283768,0.840303,0.558951,0.719173,0.90679,0.785114,0.949444,0.128103,0.740189,...,0.504587,0.153616,0.704466,0.140358,0.090582,0.393918,0.84485,0.49184,0.662839,0.232395
3,0.535347,0.833189,0.735505,0.334448,0.738695,0.406646,0.012724,0.493413,0.126388,0.191989,...,0.274331,0.753807,0.623712,0.668669,0.459143,0.813362,0.918134,0.612684,0.448552,0.079616
4,0.930346,0.173799,0.906326,0.477065,0.156778,0.787097,0.099542,0.506019,0.233473,0.507512,...,0.842741,0.704238,0.341839,0.144537,0.599778,0.138034,0.972311,0.291828,0.722333,0.014098


In [7]:
# Index the frame two ways:
#   columnValues   : column name -> raw values of that column
#   maskDictionary : missing-value pattern -> names of the columns sharing it
# The pattern is a string with one character per row: '1' where the row is
# null in that column, '0' where a value is present.
columnValues = {}
maskDictionary = {}
for name in data.columns:
    series = data[name]
    columnValues[name] = series.values

    pattern = ''.join('1' if missing else '0' for missing in series.isna())

    # Start a new group on first sight of a pattern, otherwise join the existing one.
    maskDictionary.setdefault(pattern, []).append(name)

In [8]:
# Inspect the grouping: each key is a per-row null pattern ('1' = null),
# each value the list of columns that share that pattern.
maskDictionary

{'0000010000000000000001000000000000000000000000000000000000010000000000000000000000000000001000000000': ['feature_1'],
 '0000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000100000000': ['feature_2'],
 '0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000': ['feature_3',
  'feature_11',
  'feature_14',
  'feature_17',
  'feature_21',
  'feature_35',
  'feature_36',
  'feature_74',
  'feature_77',
  'feature_78',
  'feature_82',
  'feature_92',
  'feature_95',
  'feature_118',
  'feature_119',
  'feature_138',
  'feature_140',
  'feature_161',
  'feature_167',
  'feature_188',
  'feature_201',
  'feature_206',
  'feature_220',
  'feature_285',
  'feature_305',
  'feature_314',
  'feature_331',
  'feature_369',
  'feature_377',
  'feature_385',
  'feature_432',
  'feature_454',
  'feature_458',
  'feature_477',
  'feature_485',
  'feature_487',
  'feature_491',
  'feature_494',
  'feature_501',
  'feat

In [9]:
# Names of the columns whose missing-value pattern is shared with no other
# column. Each entry is the column NAME itself, not the one-element list stored
# in maskDictionary, so that a later `key in columnsToPass` test against string
# column names can actually match.
columnsToPass = [columns[0] for columns in maskDictionary.values() if len(columns) == 1]

In [10]:
# Number of mask groups that contain exactly one column.
len(columnsToPass)

714

In [11]:
# Rebuild columnValues, keeping only the entries whose key is not an element
# of columnsToPass (membership is tested against its elements as they are).
keptValues = {}
for name, series in columnValues.items():
    if name in columnsToPass:
        continue
    keptValues[name] = series

columnValues = keptValues

In [12]:
# Box-Cox transform every remaining column in place of its raw values.
# Non-finite entries (the NaNs) are dropped first, so a transformed array can
# be shorter than the original column. The remaining values are shifted by
# |column minimum| + 1 so that the input handed to boxcox is strictly positive.
for name, series in columnValues.items():
    lowest = np.nanmin(series)
    observed = series[np.isfinite(series)]
    shifted = observed + np.abs(lowest) + 1

    # boxcox returns (transformed values, fitted lambda); only the values are kept.
    transformed, _ = boxcox(shifted)

    columnValues[name] = transformed

In [None]:
# Greedy correlation filter, applied separately inside every mask group.
#
# For each group of columns that share a missing-value pattern:
#   1. compute the Pearson correlation of every pair of columns,
#   2. while the largest correlation is >= THRESHOLD, drop the earlier column
#      of the most correlated pair,
#   3. append the surviving column names to columnsToUse.
#
# columnValues and maskDictionary are only read here; neither is modified.
columnsToUse = []
for mainIndex, (_, maskColumns) in enumerate(maskDictionary.items()):
    # Restrict the group to columns that are still present in columnValues.
    groupColumns = [column for column in maskColumns if column in columnValues]

    if not groupColumns:
        continue

    # Progress: group index, total columns available, columns in this group.
    print(mainIndex, len(columnValues), len(groupColumns))

    # Only the upper triangle is filled; the diagonal and lower triangle stay
    # NaN so that they are ignored when searching for the maximum.
    groupSize = len(groupColumns)
    dm = np.full((groupSize, groupSize), np.nan)
    for index1 in range(groupSize):
        item1 = columnValues[groupColumns[index1]]

        for index2 in range(index1 + 1, groupSize):
            item2 = columnValues[groupColumns[index2]]

            # Any error (e.g. arrays of different length) is allowed to
            # surface instead of storing a stale or undefined value.
            dm[index1, index2] = pearsonr(item1, item2)[0]

    # Stop as soon as no finite correlation is left (also covers 1x1 and
    # empty matrices without triggering an all-NaN warning).
    while np.isfinite(dm).any():
        maxValue = np.nanmax(dm)

        if maxValue < THRESHOLD:
            break

        # Row index of the first cell holding the maximum = column to drop.
        dropIndex = int(np.argwhere(dm == maxValue)[0][0])

        dm = np.delete(dm, dropIndex, axis=0)
        dm = np.delete(dm, dropIndex, axis=1)

        # Keep the list of names aligned with the rows/columns of dm.
        del groupColumns[dropIndex]

    columnsToUse.extend(groupColumns)
    print(len(columnsToUse))

In [None]:
# Keep a column of convertedDFAggregated when its name appears in any of the
# listed collections or is one of the fixed bookkeeping names.
# NOTE(review): convertedDFAggregated, binaryColumns, categoricalColumns and
# columnsNotUsed are not defined in the cells visible here - confirm where
# they come from before running this cell on a fresh kernel.
keptColumns = []
for name in list(convertedDFAggregated):
    if (name in binaryColumns
            or name in categoricalColumns
            or name in columnsToUse
            or name in columnsNotUsed
            or name in ('filter', 'target', 'index', 'train_test', 'monthIndex')):
        keptColumns.append(name)

convertedDFAggregatedFiltered = convertedDFAggregated[keptColumns]

In [None]:
# Shape of the underlying value array of the filtered frame.
convertedDFAggregatedFiltered.values.shape

In [4]:
# Agglomerative merging of trajectory groups by DTW distance.
#
# NOTE(review): trajectoriesSet is not defined in the cells visible here.
# Judging by the printed keys it presumably maps a tuple of ids to a list of
# sequences - confirm against the cell that builds it.
#
# Every pass:
#   1. fills the upper triangle of distanceMatrix with the distance between
#      each pair of groups, taken as the LARGEST pairwise DTW distance between
#      their members,
#   2. stops if the closest pair is farther apart than THRESHOLD,
#   3. otherwise merges the closest pair into one group whose key is the
#      concatenation of both keys.
#
# distanceMatrixDictionary caches pair distances across passes. Its key is the
# plain concatenation filter1 + filter2, always taken in dict order.
trajectories = deepcopy(trajectoriesSet)
distanceMatrixDictionary = {}

iteration = 1
# With fewer than two groups there is no pair to compare, so nothing is done.
while len(trajectories) > 1:
    filters = list(trajectories.keys())
    distanceMatrix = np.full((len(filters), len(filters)), np.nan)

    for index1, filter1 in enumerate(filters):

        for index2 in range(index1 + 1, len(filters)):
            filter2 = filters[index2]
            unionFilter = filter1 + filter2

            # Compute the group-to-group distance only if it is not cached yet.
            if unionFilter not in distanceMatrixDictionary:
                distanceMatrixDictionary[unionFilter] = max(
                    dtw.distance(subItem1, subItem2, psi=1)
                    for subItem1 in trajectories[filter1]
                    for subItem2 in trajectories[filter2])

            distanceMatrix[index1][index2] = distanceMatrixDictionary[unionFilter]

    minValue = np.min(list(distanceMatrixDictionary.values()))

    if minValue > THRESHOLD:
        break

    # First cell (row-major order) holding the minimum identifies the pair to merge.
    minIndex = np.argwhere(distanceMatrix == minValue)[0]

    filter1 = filters[minIndex[0]]
    filter2 = filters[minIndex[1]]

    unionFilter = filter1 + filter2
    trajectoryGroup = trajectories[filter1] + trajectories[filter2]

    # Drop both merged groups, and every cached distance that involves any of
    # their members, before registering the merged group.
    trajectories = {key: value for key, value in trajectories.items()
                    if all(member not in unionFilter for member in key)}

    distanceMatrixDictionary = {key: value for key, value in distanceMatrixDictionary.items()
                                if all(member not in unionFilter for member in key)}

    trajectories[unionFilter] = trajectoryGroup

    print(iteration, 'finished!')
    iteration += 1

1 finished!
2 finished!
3 finished!
4 finished!
5 finished!
6 finished!
7 finished!
8 finished!
9 finished!
10 finished!
11 finished!
12 finished!
13 finished!
14 finished!
15 finished!
16 finished!
17 finished!
18 finished!
19 finished!
20 finished!
21 finished!
22 finished!
23 finished!
24 finished!
25 finished!
26 finished!
27 finished!
28 finished!
29 finished!
30 finished!
31 finished!
32 finished!
33 finished!
34 finished!
35 finished!
36 finished!
37 finished!
38 finished!
39 finished!
40 finished!
41 finished!
42 finished!
43 finished!
44 finished!
45 finished!
46 finished!
47 finished!
48 finished!
49 finished!
50 finished!
51 finished!
52 finished!
53 finished!
54 finished!
55 finished!
56 finished!
57 finished!
58 finished!
59 finished!
60 finished!
61 finished!
62 finished!
63 finished!
64 finished!
65 finished!
66 finished!
67 finished!
68 finished!
69 finished!
70 finished!
71 finished!
72 finished!
73 finished!
74 finished!
75 finished!
76 finished!
77 finished!
78 finis

In [5]:
# List the key of every group left after merging, one per line.
for clusterKey in trajectories:
    print(clusterKey)

('65',)
('74',)
('88',)
('108',)
('113',)
('114',)
('118',)
('122',)
('170',)
('175',)
('194',)
('48', '157')
('166', '173')
('57', '156')
('190', '5', '16')
('43', '100')
('56', '44', '158')
('151', '182')
('94', '174')
('67', '161')
('68', '53', '106')
('31', '86', '115')
('24', '176')
('91', '103')
('112', '172', '193')
('19', '178', '60', '96')
('188', '83', '177')
('141', '154')
('7', '69')
('45', '129')
('78', '192')
('23', '117', '135', '184')
('98', '85', '196')
('79', '92', '71', '137')
('138', '15', '136')
('11', '17')
('36', '171', '18', '197')
('75', '126')
('28', '51', '186')
('33', '54', '143')
('47', '155')
('0', '50', '6', '72')
('120', '1', '2', '124')
('62', '13', '145', '99', '195')
('9', '104', '183')
('152', '144', '179', '42', '89', '125')
('66', '97', '163')
('87', '119')
('169', '90', '191')
('133', '180', '128', '164')
('148', '102', '107', '30', '64', '70')
('34', '134', '41', '198', '82', '55', '140')
('3', '40', '73', '80', '127')
('8', '116')
('27', '111', 

In [12]:
# Draw one figure per merged group, with every member sequence as a smoothed
# line in its own random colour. Groups whose key has a single element are
# skipped.
for key, value in trajectories.items():

    if len(key) == 1:
        continue

    figure = make_subplots(rows=1, cols=1)

    # One random "#RRGGBB" colour per sequence in the group.
    colors = ['#' + ''.join(random.choice('0123456789ABCDEF') for _ in range(6))
              for _ in value]

    for index, subValue in enumerate(value):
        figure.add_trace(
            go.Scatter(x=list(range(len(subValue))), y=subValue,
                       mode='lines', marker_color=colors[index],
                       line=dict(width=4), line_shape='spline'),
            row=1, col=1)

    # Title the figure with the group's key so each plot can be identified
    # when the notebook is skimmed.
    figure.update_layout(title_text='Group ' + ', '.join(str(member) for member in key),
                         showlegend=False, height=600, width=900)
    figure.show()