In [4]:
import pandas as pd
import numpy as np

In [3]:
csv = '~/SageMaker/CreatingIPProfiles/data/log20101010.csv'

In [4]:
df = pd.read_csv(csv,usecols = ['ip','date','time','accession'])

In [5]:
df['datetime'] = df['date'] + ' ' + df['time']
df['datetime'] = pd.to_datetime(df['datetime'])

In [6]:
df = df.set_index('datetime')

In [7]:
df = df.drop(['date','time'],axis=1)

In [8]:
df.head()

Unnamed: 0_level_0,ip,accession
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-10-10,108.5.109.gch,0000950137-01-501552
2010-10-10,108.5.109.gch,0000950137-01-503045
2010-10-10,108.5.109.gch,0000950137-01-501552
2010-10-10,209.223.222.jdd,0001209191-10-049878
2010-10-10,209.223.222.jdd,0001144204-10-053151


In [9]:
topdocs = df.groupby('accession').size().nlargest(500)

In [10]:
topdocs = topdocs.reset_index()

In [11]:
dips = {k:g['accession'].tolist() for k,g in df.groupby('ip')}

In [12]:
df_filtered = pd.merge(topdocs, df, how='left',on='accession')

In [13]:
df_filtered['count'] = df_filtered.iloc[:,1]

In [14]:
df_filtered = df_filtered.drop(0,axis=1)

In [15]:
len(df_filtered.ip.unique())

1851

In [16]:
df1 = df_filtered.drop_duplicates()

In [17]:
sparse_matrix = pd.crosstab(df1.ip, df1.accession)

In [19]:
sparse_matrix.head()

accession,0000000000-10-017697,0000004904-09-000040,0000004904-10-000018,0000018230-07-000272,0000018230-10-000092,0000039911-10-000061,0000039911-10-000062,0000040545-10-000010,0000060667-10-000159,0000066382-10-000010,...,0001503180-10-000003,0001503180-10-000004,0001503180-10-000005,0001503180-10-000006,0001503180-10-000007,0001503180-10-000008,0001503180-10-000009,0001503180-10-000010,0001503180-10-000011,9999999997-10-018298
ip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.23.215.eef,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1.52.249.iji,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
108.11.131.ech,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
108.11.139.dcg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
108.110.44.jig,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
def sparsity_of_matrix(mat):
    return (sum(np.sum(mat)))/(np.dot(mat.shape[0], mat.shape[1]))

In [21]:
sparsity_of_matrix(sparse_matrix)

0.009967585089141005

## Cosine Similarity Between IP Addresses

In [22]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist
import matplotlib.pyplot as plt

%matplotlib inline


def distance_matrix(df,metric):
    # create our pairwise distance matrix
    pairwise = pd.DataFrame(squareform(pdist(df, metric=metric)),
                            columns = df.index,index = df.index)

    # move to long form
    long_form = pairwise.unstack()

    # rename columns and turn into a dataframe
    long_form.index.rename(['ip', 'toIp'], inplace=True)
    return long_form.to_frame('distance metric').reset_index()

def count_metric_variations(matrix):
    df = pd.DataFrame(matrix)
    d_cosines = {}
    for cosine in df['distance metric']:
        if cosine in d_cosines:
            d_cosines[cosine]+=1
        else:
            d_cosines[cosine] = 1
    return d_cosines

def plot_distr_metric(dic):
    keys = dic.keys()
    vals = dic.values()

    plt.scatter(keys, np.divide(list(vals), sum(vals)), label="Real distribution")

    plt.ylim(0,1)
    plt.ylabel ('Percentage')
    plt.xlabel ('Significant number')
    plt.xticks(np.arange(0, 1, step=0.2))
    plt.title("Normalized Distribution of Cosine Similarities")
    plt.show()

In [23]:
testing = distance_matrix(sparse_matrix,'cosine')

In [24]:
d_cosines = count_metric_variations(testing)

## SVD User - Item Collaborative Filtering

In [25]:
from scipy.linalg import svd
from scipy.sparse.linalg import svds

In [26]:
U,s,VT = svd(sparse_matrix)

In [27]:
sigma = np.diag(s)

In [28]:
print(U.shape, sigma.shape, VT.shape)

(1851, 1851) (500, 500) (500, 500)


In [29]:
sparse_matrix.head()

accession,0000000000-10-017697,0000004904-09-000040,0000004904-10-000018,0000018230-07-000272,0000018230-10-000092,0000039911-10-000061,0000039911-10-000062,0000040545-10-000010,0000060667-10-000159,0000066382-10-000010,...,0001503180-10-000003,0001503180-10-000004,0001503180-10-000005,0001503180-10-000006,0001503180-10-000007,0001503180-10-000008,0001503180-10-000009,0001503180-10-000010,0001503180-10-000011,9999999997-10-018298
ip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.23.215.eef,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1.52.249.iji,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
108.11.131.ech,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
108.11.139.dcg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
108.110.44.jig,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Zero out all but the first two singular values
sigma_reduced = np.zeros(sparse_matrix.shape)
np.fill_diagonal(sigma_reduced, sigma)
sigma_reduced[:, 2:] = 0

# Reoconstruct the ratings matrix
docs_reconstructed = U @ sigma_reduced @ VT

sparse_reconstructed_df = pd.DataFrame(docs_reconstructed, index=ips, columns=docs)
#print(sparse_reconstructed_df.head())
#print(ratings_df)


NameError: name 'ips' is not defined

### NMF Model

In [10]:
from scipy.sparse.linalg import svds
from sklearn.decomposition import NMF

In [242]:
modelnmf = NMF(50)

In [243]:
mat = modelnmf.fit_transform(sparse_matrix)

In [244]:
H = modelnmf.components_

In [245]:
H.shape

(50, 500)

In [246]:
W = mat@H

In [345]:
labels = sparse_matrix.columns.to_list()
ips = sparse_matrix.index.to_list()

In [77]:
def build_NMF(df):
    sparse_matrix = pd.crosstab(df.ip, df.accession)
    model_nmf = NMF(n_components = 50
               , init = 'random'
               , random_state = 0)
    m = model_nmf.fit_transform(sparse_matrix)
    h = model_nmf.components_
    nmf_matrix = m @ h
    x_labels = list(sparse_matrix.index)
    y_labels = list(sparse_matrix.columns)
    reconsruct_err = modelnmf.reconstruction_err_
  
    return x_labels, y_labels, nmf_matrix

# Top 5 most similar documents' indices recommended for document at index 0
def top_10(idx):
    top10 = nmf_matrix[idx].argsort()[-11:][::-1][1:]
    return list(np.array(y_labels)[top10])

def predictions_NMF(nmf_matrix):
    d_predicted = {}
    keys = [list(x_labels)[i] for i in range(nmf_matrix.shape[0])]
    values = [top_10(i) for i in range(len(keys))]
    return dict(zip(keys, values))

#Classification Accuracy: Precision Score
def score(df_test, d_predicted):
    count = 0
    test = {k:g['accession'].tolist() for k,g in df_test.groupby('ip')}
    for ip,lst in test.items():
        for doc in lst:
            if doc in d_predicted[ip]:
                count+=1
             
            #True Positive: We predicted Document in list, and it is
            #True Negative: We predicted Document not in list, and it isn't.
            #False Negative: We predicted document not in list, and it is
            #False Positive: We predicted document in lst, but it isn't
    size = df_test.shape[0]        
    return abs((size-count) / size)

In [78]:
df_train = pd.read_csv('training_log.csv')
x_labels, y_labels, nmf_matrix = build_NMF(df_train)

In [79]:
d_predicted = predictions_NMF(nmf_matrix)

In [80]:
df_test = pd.read_csv('testing_log.csv')

In [81]:
score(df_test, d_predicted)

0.829810298102981