In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import gc 
import tqdm

% matplotlib inline

In [None]:
# Load the date features, then attach the label column.
# Column 0 of both files is used as the index; column 969 of the numeric file
# is the only other column read (presumably 'Response' -- later cells refer to
# the joined column by that name).
df = pd.read_csv('data/train_date.csv.zip', index_col=0)
response = pd.read_csv(
    'data/train_numeric.csv.zip',
    usecols=[0, 969],
    index_col=0,
)
df = df.join(response)

In [None]:
# The label frame has been joined into df, so release it.
del response
gc.collect()

# Number of parts (rows) and number of feature columns (all columns but the label).
n_parts = df.shape[0]
n_columns = df.shape[1] - 1
print(n_parts, n_columns)

In [None]:
# Stations and the features that belong to each station.
# Feature names are split on '_' and the second token is taken as the station id.
# Group on that token exactly: a substring test such as `'S1' in name` would
# also match the columns of S10, S11, ... and inflate the short station ids.
station_features = {}
for col in df.columns.tolist():
    if col == 'Response':
        continue
    station_features.setdefault(col.split('_')[1], []).append(col)

stations = list(station_features)

In [None]:
# Count the non-missing samples of every feature, to see whether there is any
# station that every (or almost every) part flows through.
columns = df.columns.tolist()
feature_samples = pd.Series(
    {col: df[col].count() for col in tqdm.tqdm_notebook(columns)}
)

# Most populated features first; the label is not a feature, so leave it out.
feature_samples = feature_samples.sort_values(ascending=False).drop('Response')

### The number of segments in production
Productions with three or more segments tend to have a different error distribution.

In [None]:
# Group features by their number of non-missing samples.
#   s[k] -> (sample count, names of the features having exactly that count)
#   p[k] -> (sample count, distinct station tokens among those features)
s, p = {}, {}
for group_id, count in tqdm.tqdm_notebook(enumerate(feature_samples.unique())):
    members = feature_samples[feature_samples == count].index.tolist()
    s[group_id] = (count, members)
    p[group_id] = (count, list({name.split('_')[1] for name in members}))

In [None]:
def segments(part_in, threshold=0.05):
    '''Calculate number of segments in a part.

    A new segment is counted every time the difference between two
    consecutive non-missing values exceeds ``threshold``; the result is that
    count plus one, so a part with no such jump (or no values at all) has
    exactly one segment.

    Parameters
    ----------
    part_in : pandas.Series
        One row of the data. A 'Response' entry, if present, is ignored.
        The Series is not modified.
    threshold : float, default 0.05
        Minimum increase between consecutive non-missing values that starts
        a new segment (same units as the values in ``part_in``).

    Returns
    -------
    int
        Number of segments (always >= 1).
    '''
    # Work on a new Series: the in-place drops used previously altered the
    # caller's row, and dropping 'Response' after dropna() raised KeyError
    # whenever the label was missing or NaN.
    values = part_in.drop('Response', errors='ignore').dropna()
    # The first diff is NaN; fillna(0) keeps it from being counted as a jump.
    jumps = values.diff().fillna(0) > threshold
    return int(jumps.sum()) + 1

# df['segment'] = df.apply(segments, axis=1)

In [None]:
# Count the segments of every part, one chunk of N parts at a time to keep the
# working set small. A segment boundary is a jump larger than `threshold`
# between two consecutive non-missing values of a part.
threshold = 0.05
N = 1000
segment = []
# Step through the chunk start offsets directly: range(n_parts//N + 1) produced
# an empty trailing chunk whenever n_parts was an exact multiple of N, and a
# row-wise apply on an empty frame returns a DataFrame that breaks the concat.
for start in tqdm.tqdm_notebook(range(0, n_parts, N)):
    # Transpose so that each part is a column and the features run down the rows.
    dates = df.iloc[start:start + N].drop(columns='Response').T
    # For every cell, the last non-missing value strictly before it in the part.
    previous = dates.ffill().shift()
    # NaN differences (missing value, or no earlier value) compare False, which
    # matches dropna().diff().fillna(0) > threshold for a non-negative threshold.
    segment.append((dates - previous).gt(threshold).sum() + 1)

In [None]:
# segment = pd.DataFrame.from_dict(segment, orient='index')
# df.join(segment)

# segment = [segment[k] for k in segment.keys()]

# Stitch the per-chunk results into one named Series and attach it to df
# as the 'segment' column (joined on the index).
segment = pd.concat(segment).rename('segment')
df = df.join(segment)

In [None]:
# Computing the segment count is slow, so persist it for later reuse.
segment = df['segment'].rename('segment')
segment.to_csv('train_segment.csv')

In [None]:
# Preview the first rows of df (by this point it carries the joined 'segment' column).
df.head()

In [None]:
# Response statistics per segment count: mean (failure rate), sum and count,
# plus a 1.96-sigma normal-approximation interval around the mean.
a = (
    df.groupby('segment')['Response']
      .agg(['mean', 'sum', 'count'])
      .assign(
          confidence=lambda t: 1.96 * np.sqrt(t['mean'] * (1 - t['mean']) / t['count']),
          mean_plus=lambda t: t['mean'] + t['confidence'],
          mean_minus=lambda t: t['mean'] - t['confidence'],
      )
)
a

In [None]:
# Response statistics for every (Response, segment) pair. 'segment' is moved
# out of the index into a regular column, leaving Response as the only index level.
b = (
    df.groupby(['Response', 'segment'])['Response']
      .agg(['mean', 'sum', 'count'])
      .reset_index(level=1)
)

In [None]:
# Normalised distribution of the segment count, drawn separately for
# Response == 0 (left axis, red) and Response == 1 (right axis, blue).
fig, ax1 = plt.subplots(figsize=(14, 7))
ax1.set_xlabel('#segments')

# Response == 0: share of those parts at each segment count.
color = 'tab:red'
negative_counts = b.loc[0, 'count']
ax1.set_ylabel('negative distribution', color=color)
ax1.bar(b.loc[0, 'segment'], negative_counts / negative_counts.sum(),
        alpha=0.8, color=color)
ax1.tick_params(axis='y', labelcolor=color)

# Response == 1 on a second y-axis sharing the same x-axis (x-label set above).
ax2 = ax1.twinx()
color = 'tab:blue'
positive_counts = b.loc[1, 'count']
ax2.set_ylabel('positive distribution', color=color)
ax2.bar(b.loc[1, 'segment'], positive_counts / positive_counts.sum(),
        alpha=0.6, color=color)
ax2.tick_params(axis='y', labelcolor=color)

# Without this the right-hand y-label is slightly clipped.
fig.tight_layout()
plt.show()

In [None]:
# Take the first part that has exactly four segments and plot its recorded values.
idx = df.loc[df.segment==4].index[0]

# Remove the two non-feature columns by label rather than with iloc[:-2] after
# dropna(): the positional slice only hits 'Response' and 'segment' when both
# are non-null and are the last two columns.
plt.plot(df.loc[idx].drop(['Response', 'segment']).dropna(), '.')

In [None]:
# Distinct first name tokens (the part before the first '_') among the
# features recorded for the example part; the last two columns are skipped.
{name.split('_')[0] for name in df.loc[idx].iloc[:-2].dropna().index.tolist()}