In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset from ./data/bycountry_ds.json into a DataFrame and show
# its (rows, columns) shape as the cell output.
df = pd.read_json('./data/bycountry_ds.json')
df.shape

(17403, 5)

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,category,input,output_low,output_medium,output_high
http://dbpedia.org/resource/...All_the_Marbles,[United States],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1,0
http://dbpedia.org/resource/An_Eye_for_an_Eye_(1981_film),[United States],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1,0
http://dbpedia.org/resource/Brain_Dead_(1990_film),[United States],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1,0
http://dbpedia.org/resource/For_Pete's_Sake_(film),[United States],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1,0
http://dbpedia.org/resource/Hannah_Montana:_The_Movie,[United States],"[25, 6677, 48, 2, 138, 5, 331, 1, 7, 25, 6983,...",0,0,1


In [4]:
from tensorflow import keras

In [5]:
# Restore the saved Keras model stored under ./models/bycountry_model.
model = keras.models.load_model('./models/bycountry_model')

In [6]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 64)           640000    
                                                                 
 lstm_3 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 3)                 99        
                                                                 
Total params: 652,515
Trainable params: 652,515
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Print each layer's index and name, followed by its activation function when
# the layer exposes one. Checking for the attribute (instead of assuming that
# only layer 0 lacks it) keeps the cell working for models that contain other
# activation-less layers.
for i, layer in enumerate(model.layers):
    print(i, layer.name)
    if hasattr(layer, "activation"):  # e.g. Embedding has no activation function
        print(layer.activation)

0 embedding
1 lstm_3
<function tanh at 0x0000015A275DC820>
2 dense
<function softmax at 0x0000015A275D7A60>


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout

In [9]:
# Index into model.layers of the last layer copied into the truncated model
# built in the next cell (layers 0..layer_to_explore are kept).
layer_to_explore = 1

In [10]:
# Truncated copy of `model`: layers 0..layer_to_explore are appended to a new
# Sequential container. The very same layer objects are added (no cloning),
# so model2's output is the output of layer `layer_to_explore`.
model2 = Sequential()

n_layers_kept = min(len(model.layers), layer_to_explore + 1)
for position in range(n_layers_kept):
    print(f"Adding layer {position}")
    model2.add(model.layers[position])

model2.summary()

Adding layer 0
Adding layer 1
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 64)           640000    
                                                                 
 lstm_3 (LSTM)               (None, 32)                12416     
                                                                 
Total params: 652,416
Trainable params: 652,416
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Report the shape/dtype of model2's input and output tensors, then the name,
# input shape and dtype of each layer. Plain loops are used because the calls
# are made only for their printed output; a list comprehension would also
# echo a list of None values as the cell result.
for tensor in model2.inputs:
    print(tensor.shape, tensor.dtype)
for tensor in model2.outputs:
    print(tensor.shape, tensor.dtype)
for layer in model2.layers:
    print(layer.name, layer.input_shape, layer.dtype)

(None, 100) <dtype: 'float32'>
(None, 32) <dtype: 'float32'>
embedding (None, 100) float32
lstm_3 (None, 100, 64) float32


[None, None]

In [12]:
# Inspect one encoded input (the row at position 10) as a NumPy array.
inp = np.array(df.input.iloc[10])
inp

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  410,
       1514, 4369, 1284,   11,    1,    6, 2243, 4947, 7330, 2080, 8139,
       3909,    2, 2024,   14, 7714, 7714, 7714,   10, 2024,   19, 2963,
         48])

In [13]:
# Convert every entry of the `input` column to a NumPy array and stack them
# into a single array (one row per sample); the cell displays its shape.
inputs = np.array([np.array(sequence) for sequence in df.input])
inputs.shape

(17403, 100)

In [14]:
# Run the truncated model on every sample; the returned activation array is
# shown as the cell output.
model2.predict(inputs)



array([[ 0.07156485,  0.292121  , -0.36352202, ...,  0.24505086,
         0.0075122 , -0.06348532],
       [-0.02730321,  0.11424701, -0.10647444, ...,  0.2577309 ,
         0.08961219,  0.19460133],
       [ 0.17597727,  0.23239158, -0.28580275, ...,  0.04924995,
        -0.09155545, -0.3179046 ],
       ...,
       [ 0.45771903,  0.4332017 , -0.50327903, ...,  0.06189189,
        -0.1883936 , -0.6567339 ],
       [ 0.2850686 ,  0.39111367, -0.44341838, ...,  0.12254363,
        -0.08124002, -0.5056254 ],
       [-0.13035586, -0.09148991,  0.04466944, ...,  0.27199036,
         0.19974534,  0.25408548]], dtype=float32)

In [15]:
# Need to check if the country is real
def get_unique_categories(df):
    """Return the distinct, non-empty category labels in first-seen order.

    Args:
        df: DataFrame with a ``category`` column whose cells are iterables of
            labels (lists of strings in this notebook).

    Returns:
        list: each label exactly once, ordered by first appearance. Empty
        strings are skipped. Labels are compared verbatim, so a composite
        string such as ``'United States, Mexico'`` counts as its own label.

    The column is iterated directly rather than looked up as
    ``df.category[i]``: that form is a *label* lookup, which fails on any
    non-default integer index and only works on a string index through
    pandas' deprecated integer-as-position fallback.
    """
    # A dict keeps insertion order and gives O(1) membership tests.
    # Labels are assumed to be hashable (they are strings here).
    seen = {}
    for labels in df.category:
        for label in labels:
            if label != '':
                seen.setdefault(label, None)
    return list(seen)

In [16]:
# List the distinct category labels found in df.
get_unique_categories(df)

['United States',
 'France',
 'Australia',
 'United Kingdom',
 'Mexico',
 'Argentina',
 'Italy',
 'Spain',
 'United States, Mexico',
 'Belgium',
 'Germany',
 'Brazil',
 'Japan',
 'Lebanon',
 'Ireland',
 'Canada',
 'Luxembourg',
 'West Germany',
 'USA',
 'Russia',
 'China',
 'Serbia',
 'Indonesia',
 'Hong Kong',
 'Switzerland',
 'Armenia',
 'Austria',
 'India',
 'Iran',
 'Taiwan',
 'Soviet Union',
 'South Africa',
 'Romania',
 'South Korea',
 'Cambodia',
 'Netherlands',
 'Sweden',
 'Singapore',
 'Poland',
 'Norway',
 'Mali',
 'Czech Republic',
 'Denmark',
 'New Zealand',
 'Lithuania',
 'Jordan',
 'Hungary',
 'United States/Mexico',
 'Yugoslavia',
 'Italy, France',
 'Colombia',
 'Monaco',
 'Syria',
 'Slovakia',
 'Ukraine',
 'Morocco',
 'Pakistan',
 'Thailand',
 'Israel',
 'Mainland China',
 'Bulgaria',
 'Portugal',
 'Nigeria',
 'Chile',
 'Peru',
 'Iceland',
 'Algeria',
 'Tunisia',
 'Greece',
 'Malaysia',
 'Cameroon',
 'Kenya',
 'Vietnam',
 'Japan, United States',
 'Puerto Rico',
 'Finlan

In [17]:
def get_inputs_for_cat(category, df):
    """Collect the encoded inputs of every row tagged with ``category``.

    Args:
        category: label to look for; a row matches when ``category in`` its
            ``category`` cell is true.
        df: DataFrame with ``category`` and ``input`` columns.

    Returns:
        numpy.ndarray: the matching ``input`` values stacked in row order
        (an empty array when nothing matches).
    """
    matches = df.category.apply(lambda labels: category in labels)
    raw_inputs = df[matches].input
    # tolist() walks the filtered rows by position. Indexing the filtered
    # Series with 0..n-1 would be a label lookup and raise KeyError as soon
    # as the surviving index labels are not exactly 0..n-1.
    return np.array(raw_inputs.tolist())

In [18]:
# Example: collect the inputs of every row whose category list contains 'France'.
get_inputs_for_cat('France', df)

array([[   0,    0,    0, ...,  108,   23,  192],
       [   0,    0,    0, ..., 1372,  287,  121],
       [   0,    0,    0, ...,    9,  141, 2048],
       ...,
       [   0,    0,    0, ...,   22,  608, 1051],
       [   0,    0,    0, ...,   99,   24,   26],
       [   0,    0,    0, ...,  101, 3619, 1568]])

In [25]:
def get_activation_for_cat(category, df, model):
    """Return ``model``'s outputs for every row of ``df`` tagged ``category``.

    Args:
        category: label passed on to ``get_inputs_for_cat``.
        df: source DataFrame (``category`` and ``input`` columns).
        model: object with a ``predict`` method returning a 2-D array of
            shape (samples, neurons).

    Returns:
        pandas.DataFrame: one row per matching sample and one column per
        output unit, named ``neuron_1`` .. ``neuron_N``.
    """
    inputs_cat = get_inputs_for_cat(category, df)
    activations = model.predict(inputs_cat)

    # Build the frame in one call: this avoids rebinding the `df` parameter
    # to a fresh frame and growing it one column at a time.
    neuron_names = [f"neuron_{k + 1}" for k in range(activations.shape[1])]
    return pd.DataFrame(activations, columns=neuron_names)

In [26]:
# Example: activations produced by model2 for the rows tagged 'France'.
get_activation_for_cat('France', df, model2)



Unnamed: 0,neuron_1,neuron_2,neuron_3,neuron_4,neuron_5,neuron_6,neuron_7,neuron_8,neuron_9,neuron_10,...,neuron_23,neuron_24,neuron_25,neuron_26,neuron_27,neuron_28,neuron_29,neuron_30,neuron_31,neuron_32
0,0.012802,0.255834,-0.285478,-0.076880,0.140202,-0.346347,-0.148518,0.152835,0.131516,0.143724,...,0.208108,0.012171,-0.138187,-0.037002,-0.168760,-0.044743,0.403420,0.275540,0.044003,0.060025
1,-0.114197,-0.236791,0.224753,0.249885,0.075467,0.161859,0.087310,-0.324186,-0.251396,-0.237267,...,-0.296594,-0.314252,0.178881,-0.277235,0.139951,0.272246,0.201364,-0.078311,0.046053,0.157695
2,0.054977,0.308612,-0.353052,-0.089603,0.165846,-0.364087,-0.136671,0.181550,0.108126,0.176852,...,0.259418,0.052278,-0.146345,-0.035713,-0.175631,-0.071884,0.427689,0.329623,0.024659,0.046147
3,-0.049788,0.162460,-0.110153,0.028377,0.217178,-0.211474,-0.159018,-0.007721,0.065437,0.063451,...,0.038543,-0.122706,-0.136662,-0.092948,-0.175322,0.036790,0.434149,0.284049,0.112804,0.165367
4,-0.000226,0.276868,-0.293031,-0.013333,0.265783,-0.364789,-0.171447,0.084312,0.042497,0.168486,...,0.214612,-0.014227,-0.139410,-0.062072,-0.187610,-0.020756,0.509852,0.400162,0.093375,0.206464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720,0.138037,0.280349,-0.319895,-0.165824,0.012688,-0.364730,-0.071092,0.347884,0.202159,0.181130,...,0.302172,0.099229,-0.242265,0.037693,-0.137511,-0.125141,-0.079256,0.148247,-0.061676,-0.256358
1721,0.262991,0.294213,-0.361224,-0.249750,-0.034318,-0.445598,-0.018810,0.463182,0.320930,0.158977,...,0.322438,0.159250,-0.369232,0.079485,-0.058370,-0.155533,-0.540884,0.045912,-0.062865,-0.497376
1722,0.022598,0.001793,-0.057717,-0.063079,-0.026253,-0.019234,0.019303,0.055918,0.071769,0.000055,...,0.045134,0.028913,0.010005,-0.013924,0.003687,-0.027862,0.048013,-0.019877,-0.004528,-0.025721
1723,0.119124,0.183692,-0.269196,-0.158095,-0.032961,-0.281528,-0.024299,0.276412,0.213922,0.111553,...,0.235743,0.061021,-0.124880,0.022642,-0.069663,-0.113029,-0.072037,0.086475,-0.036006,-0.145382


In [34]:
# Activations of the truncated model for every sample in the dataset.
# tolist() reads the column by position; `df.input[i]` would be a label
# lookup that only works through pandas' deprecated positional fallback.
all_inputs = np.array(df.input.tolist())

all_activations = model2.predict(all_inputs)

# One column per output unit, named neuron_1 .. neuron_N.
df_activations = pd.DataFrame(
    all_activations,
    columns=[f"neuron_{k + 1}" for k in range(all_activations.shape[1])],
)

# Z-score every neuron column (subtract the column mean, divide by the
# column's sample standard deviation) in one vectorised expression.
df_s = (df_activations - df_activations.mean()) / df_activations.std()

df_s.describe()



Unnamed: 0,neuron_1,neuron_2,neuron_3,neuron_4,neuron_5,neuron_6,neuron_7,neuron_8,neuron_9,neuron_10,...,neuron_23,neuron_24,neuron_25,neuron_26,neuron_27,neuron_28,neuron_29,neuron_30,neuron_31,neuron_32
count,17403.0,17403.0,17403.0,17403.0,17403.0,17403.0,17403.0,17403.0,17403.0,17403.0,...,17403.0,17403.0,17403.0,17403.0,17403.0,17403.0,17403.0,17403.0,17403.0,17403.0
mean,-2.16629e-07,4.1066e-07,1.922261e-07,9.753954e-08,2.67839e-07,1e-06,-8.60964e-07,-4.885026e-08,6.14096e-07,4.933455e-07,...,-2.811176e-07,-7.57465e-07,-2.661505e-07,-2e-06,9.82605e-08,8.576452e-07,-4.258052e-07,2.202594e-07,-2.523239e-07,-4.771591e-07
std,0.999998,0.999999,1.000001,0.9999996,0.9999989,1.000001,0.9999992,0.9999999,1.000001,0.9999989,...,0.999997,1.000001,1.000001,1.0,0.9999992,1.000002,1.000002,0.9999989,0.9999984,0.9999998
min,-2.211555,-2.020194,-1.714673,-1.755788,-2.752283,-1.814397,-1.397847,-1.714309,-2.180992,-2.395678,...,-1.854136,-2.288537,-1.969375,-2.89871,-1.262259,-1.715092,-2.617186,-2.601265,-2.904822,-2.691693
25%,-0.8048492,-0.8729309,-0.8808089,-0.8414666,-0.225224,-0.930823,-0.5829255,-0.9644973,-0.7081182,-0.6040406,...,-0.9768961,-0.6356609,-0.7935221,-0.304798,-0.6357913,-0.7550378,-0.5390507,-0.3846351,-0.6541336,-0.7586678
50%,-0.120699,0.3547014,-0.26955,-0.2316105,0.009577132,-0.187949,-0.3949216,0.141302,0.2724463,0.4656679,...,0.3263607,0.3247794,-0.1972756,0.369341,-0.4346038,-0.3517961,0.1100078,0.2108623,0.02043862,0.4500338
75%,0.7125897,0.8450011,0.9893752,0.7910589,0.5172332,1.170981,0.0616249,0.9221584,0.848168,0.7555107,...,0.8669542,0.7921521,0.8179525,0.731764,0.2116831,0.5866399,0.6185674,0.6166881,0.7172172,0.7190064
max,3.313614,1.590294,1.740926,1.990605,3.430601,1.464705,2.788922,1.66645,1.469774,1.188456,...,1.615086,1.623206,2.238333,1.130751,2.852166,2.470067,2.439595,2.099051,2.844593,2.039747


In [31]:
from scipy import stats

# Category label bound to the module-level name `c`; the commented-out lines
# are alternative labels that were tried.
# NOTE(review): the `for c in ...` loop at the end of this cell rebinds `c`,
# and none of these labels appear in the country list printed by
# get_unique_categories(df) earlier (that output is truncated) — confirm
# whether this default is still needed.
# c = "French_films"
# c = "British_films"
# c = "American_black-and-white_films"
c = "Film_scores_by_composer"


def getNotCatDF(cat, df):
    """Return the rows of ``df`` that are NOT tagged with ``cat``.

    Args:
        cat: label to exclude; a row is kept when ``cat not in`` its label
            cell is true.
        df: DataFrame holding the labels in a ``category`` column (the name
            used everywhere else in this notebook). A legacy ``categories``
            column is accepted as a fallback. The original author's note
            says a standardized frame is expected here — TODO confirm.

    Returns:
        pandas.DataFrame: the filtered rows with all columns preserved.
    """
    label_column = "category" if "category" in df.columns else "categories"
    return df[df[label_column].apply(lambda labels: cat not in labels)]


def get_activ_for_not_cat(category, df, n_meta_cols=3):
    """Return the activation columns of the rows not tagged ``category``.

    Args:
        category: label to exclude (see ``getNotCatDF``).
        df: DataFrame whose first ``n_meta_cols`` columns are metadata and
            whose remaining columns are taken as activations.
        n_meta_cols: number of leading columns to drop; defaults to 3, the
            offset that was previously hard-coded.

    Returns:
        pandas.DataFrame: the filtered rows restricted to the columns from
        position ``n_meta_cols`` onwards.
    """
    return getNotCatDF(category, df).iloc[:, n_meta_cols:]



def find_pv(category, n_iter=1000, random_state=None):
    """Average per-neuron Wilcoxon p-value: ``category`` rows vs. the rest.

    The activations of the rows tagged ``category`` are compared, neuron by
    neuron, with an equally sized random sample of the remaining rows. The
    comparison is repeated ``n_iter`` times and the p-values are averaged.

    Args:
        category: label to analyse. It is now actually used; previously the
            function ignored it and read the module-level ``c`` instead.
        n_iter: number of resampling rounds (default 1000, the value that
            was previously hard-coded).
        random_state: optional seed for reproducible sampling. ``None``
            keeps the previous unseeded behaviour.

    Returns:
        pandas.Series: mean p-value per neuron, indexed 0..N-1 in the column
        order of the category activations.

    Relies on the module-level ``df`` and ``model2``.
    """
    actc = get_activation_for_cat(category, df, model2)
    actnc = get_activ_for_not_cat(category, df)

    n_cat = len(actc)
    # DataFrame.sample raises when asked for more rows than exist, which
    # happens for categories covering more than half of the data; fall back
    # to sampling with replacement in that case only.
    with_replacement = n_cat > len(actnc)
    # A single RandomState is advanced across rounds so every round draws a
    # different sample even when a seed is given.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # The category-side arrays do not change between rounds: build them once.
    cat_values = {col: np.array(actc[col]) for col in actc}

    reses = []
    for _ in range(n_iter):
        actncs = actnc.sample(n_cat, replace=with_replacement, random_state=rng)
        # NOTE(review): scipy's wilcoxon is the signed-rank test for *paired*
        # samples, while the two groups here are unrelated rows — confirm
        # that this (rather than e.g. Mann-Whitney U) is the intended test.
        reses.append([
            stats.wilcoxon(np.array(actncs[col]), y=values).pvalue
            for col, values in cat_values.items()
        ])
    return pd.DataFrame(np.array(reses)).mean()

# p-values at or below this threshold are drawn as "significant" (1) in the
# second, binarised heatmap.
P_VALUE_THRESHOLD = 0.01


def _show_heatmap(z, colorscale):
    """Render ``z`` as a one-off heatmap on a fixed 0..1 colour range."""
    # `go` is used by this cell but no visible cell imports it; import it
    # here so the cell also runs on a fresh kernel.
    import plotly.graph_objects as go

    data = [go.Heatmap(z=z, zmin=0, zmax=1,
            colorscale=colorscale,
            reversescale=False)]
    layout = go.Layout(template='none', height=300)
    fig = go.Figure(data=data, layout=layout)
    fig.show()


# For the first three categories: print the label, then show the mean
# p-value per neuron followed by its thresholded version.
for c in get_unique_categories(df)[0:3]:
    print(c)
    r = find_pv(c)
    rdf = pd.DataFrame(r)

    # Continuous view: p = 0 is white, p = 1 is black.
    _show_heatmap(rdf.T, ['rgb(255, 255, 255)', 'rgb(0, 0, 0)'])

    # Binary view: 1 (white) where p <= threshold, 0 (black) otherwise.
    rdf[0] = rdf[0].apply(lambda x: 0 if x > P_VALUE_THRESHOLD else 1)
    _show_heatmap(rdf.T, ['rgb(0, 0, 0)', 'rgb(255, 255, 255)'])

United States


NameError: name 'df_activations_s' is not defined