# User Oriented Dataset - 3D UMAP
- UDD/MDS18/PP4
- Final Project: 'Studying Twitter User Accounts: Spotting Suspicious Social Bot Behavior'
- Master Candidate: Marcelo Rovai
- Professors:	   
    - Eduardo Graells-Garrido (Supervisor)
    - Loreto Bravo
    - Leo Ferres

## Main Libraries & Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split
import umap

In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper
from bokeh.palettes import Spectral10

# Send Bokeh plots to the notebook output instead of a standalone HTML file.
output_notebook()

In [None]:
import plotly
import plotly.express as px
import plotly.graph_objs as go

# Initialise Plotly's offline mode so figures render inline in the notebook.
plotly.offline.init_notebook_mode()

In [None]:
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Render floats in DataFrame output with zero decimal places ('%.f').
pd.set_option('display.float_format', lambda x: '%.f' % x)

# Font sizes are genuine seaborn "context" parameters, so they go through
# set_context.
sns.set_context("paper",
                rc={
                    "font.size": 12,
                    "axes.titlesize": 12,
                    "axes.labelsize": 12,
                })

# 'figure.figsize' is not a context parameter: set_context() silently drops
# it, so the default figure size has to be set through matplotlib's rcParams.
plt.rcParams['figure.figsize'] = (14, 10)

## Main Functions

In [None]:
def plot_feature(df, feat, kde=True, prt=False, save_file_name=None, x_max=None):
    """Plot the distribution of one column as a shaded density curve.

    Parameters
    ----------
    df : DataFrame
        Source data; ``df[feat]`` is handed to ``sns.distplot``.
    feat : str
        Column to plot; also used for the title and the x-axis label.
    kde : bool
        Forwarded to ``sns.distplot`` (the histogram is always disabled).
    prt : bool
        When truthy, the figure is saved under ``../images/``.
    save_file_name : str or None
        File name used when ``prt`` is set. Defaults to ``"<feat>.png"`` so
        that ``prt=True`` no longer fails when no name is given.
    x_max : number or None
        Upper x-axis limit; ``None`` leaves it autoscaled. The lower limit
        is always 0.
    """
    _, axes = plt.subplots(ncols=1, figsize=(10, 4))
    sns.despine(left=True)
    sns.distplot(df[feat],
                 color="m",
                 hist=False,
                 kde=kde,
                 kde_kws={
                     'shade': True,
                     'linewidth': 3
                 })
    plt.suptitle(feat + " distribution", fontsize=20)
    plt.xlabel(feat, fontsize=15)
    plt.setp(axes, yticks=[])
    plt.xlim(0, x_max)

    if prt:
        if save_file_name is None:
            # Previously "../images/" + None raised TypeError.
            save_file_name = feat + ".png"
        plt.savefig("../images/" + save_file_name, bbox_inches='tight')

In [None]:
def get_df_name(df):
    """Return the first module-level name that is bound to ``df`` itself.

    The lookup is by identity (``is``), not equality. An ``IndexError`` is
    raised when no global refers to the object.
    """
    namespace = globals()
    matches = []
    for candidate in namespace:
        if namespace[candidate] is df:
            matches.append(candidate)
    return matches[0]

In [None]:
def plot_feat_comp(df1,
                   df2,
                   feat,
                   color1='m',
                   color2='g',
                   prt=False,
                   x_max=None,
                   log=False):
    """Overlay the density curves of one column from two DataFrames.

    Parameters
    ----------
    df1, df2 : DataFrame
        Frames to compare. Their legend labels are the names of the global
        variables that hold them (see ``get_df_name``).
    feat : str
        Column plotted from both frames.
    color1, color2 : str
        Curve colours for ``df1`` and ``df2``.
    prt : bool
        When truthy, save the figure as
        ``../images/<label_1>-<label_2>-<feat>.png``.
    x_max : number, (left, right) pair or None
        A number is used as the upper x limit (lower limit 0, or left
        untouched on a log axis). A tuple/list is passed through unchanged as
        explicit limits. ``None`` keeps the autoscaled range.
    log : bool
        Use a logarithmic x axis.
    """
    feat1 = df1[feat]
    feat2 = df2[feat]
    label_1 = get_df_name(df1)
    label_2 = get_df_name(df2)
    fig, axes = plt.subplots(ncols=1, figsize=(10, 5))

    # Both curves are drawn on the same (current) axes.
    for values, color, label in ((feat1, color1, label_1),
                                 (feat2, color2, label_2)):
        ax = sns.distplot(values,
                          color=color,
                          label=label,
                          hist=False,
                          kde=True,
                          kde_kws={
                              'shade': True,
                              'linewidth': 3
                          })

    if log:
        ax.set_xscale('log')

    if x_max is not None:
        if isinstance(x_max, (tuple, list)):
            plt.xlim(x_max)
        else:
            # plt.xlim(x_max) would set the LEFT limit; x_max is the right
            # one. A log axis cannot start at 0, so only cap the right side.
            plt.xlim(None if log else 0, x_max)

    plt.suptitle(label_1 + "-" + label_2 + "  " + feat + ' distribution',
                 fontsize=16)
    sns.despine(left=True)
    plt.setp(axes, yticks=[])

    if prt:
        plt.savefig("../images/" + label_1 + "-" + label_2 + "-" + feat + ".png",
                    bbox_inches='tight')

In [None]:
def plot_feat_comp_2_graph(df1,
                           df2,
                           feat,
                           color1='m',
                           color2='g',
                           prt=False,
                           log=False):
    """Draw the density of ``feat`` for two DataFrames in side-by-side panels.

    ``df1`` goes in the left panel and ``df2`` in the right one. Labels are
    the global variable names of the frames (via ``get_df_name``). With
    ``log`` set, both x axes are logarithmic. With ``prt == True`` the figure
    is written to ``../images/<label_1>-<label_2>-<feat>.png``.
    """
    series_left, series_right = df1[feat], df2[feat]
    name_left = get_df_name(df1)
    name_right = get_df_name(df2)
    _, axes = plt.subplots(ncols=2, figsize=(10, 4))
    sns.despine(left=True)

    # One (series, colour, label, panel) tuple per subplot, left then right.
    panels = zip((series_left, series_right),
                 (color1, color2),
                 (name_left, name_right),
                 axes)
    for series, colour, name, panel in panels:
        drawn = sns.distplot(series,
                             color=colour,
                             label=name,
                             hist=False,
                             kde=True,
                             kde_kws={
                                 'shade': True,
                                 'linewidth': 3
                             },
                             ax=panel)
        if log:
            drawn.set_xscale('log')

    plt.suptitle(name_left + "-" + name_right + "  " + feat + " distribution",
                 fontsize=16)
    plt.setp(axes, yticks=[])

    if prt == True:
        plt.savefig("../images/" + name_left + "-" + name_right + "-" + feat+".png",
                    bbox_inches='tight')

## Import Dataset

In [None]:
# IPython shell shortcut (not plain Python): list the files in ../data/.
ls ../data/

In [None]:
# Load the labelled user-metrics CSV; rows are split on '\n' only, and
# low_memory=False makes pandas read the file in one pass.
df = pd.read_csv("../data/60_user_metrics_label.csv", lineterminator='\n', low_memory=False)
# Display (rows, columns).
df.shape

In [None]:
# Peek at the first two rows.
df.head(2)

In [None]:
# Drop the 'Unnamed: 0' column (presumably a saved index column - verify).
del df['Unnamed: 0']

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Label distribution among rows where acc_verif == 1.
df[df.acc_verif == 1].label.value_counts()

## Prepare data

Use all data

In [None]:
# Work on a copy so the loaded frame `df` stays untouched.
data = df.copy()
data.shape

In [None]:
# Move the 'user' column into the index so it is excluded from data.values.
data.set_index("user", inplace = True)
data.head(2)

In [None]:
# Number of rows per label value.
data.label.value_counts()

In [None]:
# Feature matrix: every column except the last one.
# NOTE(review): assumes the label is the last column of `data` - confirm.
X_train = data.values[:,0:(data.shape[1]-1)]

In [None]:
# Display the feature matrix.
X_train

In [None]:
#X_train = np.log(X_train + 1)
# Report the number of samples, then take the last column of `data` as the
# target vector and print the shapes of both arrays.
n = len(X_train)
print("\nThis data set contains " + str(n) + " samples")
y_train = data.values[:, -1]
print("\nDimensions of the  data set: ")
print(X_train.shape, y_train.shape)

## Apply 3D Model

### Default parameters:

In [None]:
# UMAP with three output dimensions and otherwise default parameters.
# NOTE(review): no random_state is passed here, unlike plot_UMAP_3D below,
# so this embedding may differ between runs.
reducer = umap.UMAP(n_components=3,)

In [None]:
# Fit UMAP on the feature matrix and keep the 3-component embedding.
embedding = reducer.fit_transform(X_train)
embedding.shape

In [None]:
# Display the raw embedding coordinates.
embedding

In [None]:
def plt_color(lst):
    """Map each label in ``lst`` to a marker colour and a marker size.

    'low_freq' -> ('gray', .1), 'high_freq' -> ('red', 50), anything else ->
    ('blue', 25). Returns two parallel lists ``(colours, sizes)`` in input
    order.
    """
    def style_for(label):
        if label == 'low_freq':
            return 'gray', .1
        if label == 'high_freq':
            return 'red', 50
        return 'blue', 25

    styles = [style_for(label) for label in lst]
    cols = [colour for colour, _ in styles]
    size = [marker for _, marker in styles]
    return cols, size

In [None]:
# 2D view of embedding components 0 and 1, coloured and sized by label.
cols, size = plt_color(y_train)
plt.figure(figsize=(15,15))
plt.scatter(embedding[:, 0], embedding[:, 1], c=cols, s=size)
plt.gca().set_aspect('equal', 'datalim')
# The trailing ';' suppresses the notebook's text output for this line.
plt.title('UMAP projection of the User dataset', fontsize=15);

In [None]:
# 2D view of embedding components 0 and 2, coloured and sized by label.
cols, size = plt_color(y_train)
plt.figure(figsize=(15,15))
plt.scatter(embedding[:, 0], embedding[:, 2], c=cols, s=size)
plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP projection of the User dataset', fontsize=15);

In [None]:
# 2D view of embedding components 1 and 2, coloured and sized by label.
cols, size = plt_color(y_train)
plt.figure(figsize=(15,15))
plt.scatter(embedding[:, 1], embedding[:, 2], c=cols, s=size)
plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP projection of the User dataset', fontsize=15);

In [None]:
# Static 3D scatter of the default-parameter embedding; `cols` and `size`
# come from the previous cells.
fig = plt.figure(figsize=(15,15))
# Axes3D(fig) no longer attaches itself to the figure in current Matplotlib
# (the plot comes out empty). add_axes with the same full-figure rectangle
# gives the original layout on old and new versions alike.
ax = fig.add_axes([0, 0, 1, 1], projection='3d')
ax.scatter(embedding[:, 0], embedding[:, 1], embedding[:, 2], c=cols, s=size)
plt.suptitle('UMAP 3D projection of the User dataset', fontsize=15);
plt.savefig("../images/3D/UMAP_3D_proj_Use_dataset", bbox_inches='tight')

---

### UMAP enhanced clustering

In [None]:
def plot_UMAP_3D(X_train, n_neighbors, min_dist, save=False, labels=None):
    """Fit a 3-component UMAP embedding and draw it as a static 3D scatter.

    Parameters
    ----------
    X_train : array-like
        Feature matrix passed to ``umap.UMAP(...).fit_transform``.
    n_neighbors, min_dist
        Forwarded to ``umap.UMAP``; also shown in the title and file name.
    save : bool
        When truthy, the figure is written to ``../images/3D/``.
    labels : sequence or None
        One label per row, used by ``plt_color`` for colours and sizes.
        ``None`` (the default) falls back to the module-level ``y_train``,
        which is what the function always used before.

    Returns
    -------
    The embedding returned by ``fit_transform`` (random_state fixed to 42).
    """
    if labels is None:
        labels = y_train

    clusterable_embedding = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=3,
        random_state=42,
    ).fit_transform(X_train)

    cols, size = plt_color(labels)
    fig = plt.figure(figsize=(15, 15))
    # Axes3D(fig) is not added to the figure automatically in current
    # Matplotlib; add_axes over the whole figure keeps the former layout.
    ax = fig.add_axes([0, 0, 1, 1], projection='3d')
    ax.scatter(clusterable_embedding[:, 0],
               clusterable_embedding[:, 1],
               clusterable_embedding[:, 2],
               c=cols,
               s=size)
    plt.suptitle('3D UMAP Projection Mid-High-Freq- n_neighbors=' +
                 str(n_neighbors) + ' - min_dist=' + str(min_dist),
                 fontsize=15)
    if save:
        plt.savefig('../images/3D/3D_UMAP-Susp_User_Dataset_Result_n_neighbors=' +
                    str(n_neighbors) + '_min_dist=' + str(min_dist)+'.png',
                    bbox_inches='tight')
    return clusterable_embedding

In [None]:
# n_neighbors=3 with min_dist=0.1; the figure is saved.
n_neighbors=3
min_dist=0.1
clusterable_embedding = plot_UMAP_3D(X_train, n_neighbors, min_dist, save=True)

In [None]:
# Display the embedding coordinates returned by the previous cell.
clusterable_embedding

In [None]:
# n_neighbors=3 with min_dist=0.0; the figure is saved.
n_neighbors=3
min_dist=0.0
clusterable_embedding_3 = plot_UMAP_3D(X_train, n_neighbors, min_dist, save=True)

In [None]:
# n_neighbors=5 with min_dist=0.0; the figure is saved.
n_neighbors=5
min_dist=0.0
clusterable_embedding_5 = plot_UMAP_3D(X_train, n_neighbors, min_dist, save=True)

In [None]:
# n_neighbors=10 with min_dist=0.0; the figure is saved.
n_neighbors=10
min_dist=0.0
clusterable_embedding_10 = plot_UMAP_3D(X_train, n_neighbors, min_dist, save=True)

In [None]:
# n_neighbors=15 with min_dist=0.0; the figure is saved.
n_neighbors=15
min_dist=0.0
clusterable_embedding_15 = plot_UMAP_3D(X_train, n_neighbors, min_dist, save=True)

In [None]:
# n_neighbors=30 with min_dist=0.0; the figure is saved. This embedding is
# the one used to build user_df further down.
n_neighbors=30
min_dist=0.0
clusterable_embedding_30 = plot_UMAP_3D(X_train, n_neighbors, min_dist, save=True)

In [None]:
# n_neighbors=50 with min_dist=0.0; the figure is shown but not saved.
n_neighbors=50
min_dist=0.0
clusterable_embedding_50 = plot_UMAP_3D(X_train, n_neighbors, min_dist, save=False)

In [None]:
# n_neighbors=100 with min_dist=0.0; the figure is shown but not saved.
n_neighbors=100
min_dist=0.0
clusterable_embedding_100 = plot_UMAP_3D(X_train, n_neighbors, min_dist, save=False)

## Interactive 3D Manipulation with Plotly

In [None]:
def Interactive_3D_UMAP(clusterable_embedding):
    """Show the first three embedding columns as an interactive Plotly scatter.

    Every point is drawn as a red, half-transparent marker of size 5 on a
    figure without margins. The figure is rendered inline via
    ``plotly.offline.iplot``; nothing is returned.
    """
    xs, ys, zs = (clusterable_embedding[:, axis] for axis in range(3))
    marker_style = {
        'size': 5,
        'opacity': 0.5,
        'color':'red'
    }
    points = go.Scatter3d(x=xs, y=ys, z=zs, mode='markers', marker=marker_style)
    no_margins = go.Layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})

    plotly.offline.iplot(go.Figure(data=[points], layout=no_margins))

In [None]:
# Interactive view of the n_neighbors=5 embedding.
Interactive_3D_UMAP(clusterable_embedding_5)

In [None]:
# Interactive view of the n_neighbors=10 embedding.
Interactive_3D_UMAP(clusterable_embedding_10)

In [None]:
# Interactive view of the n_neighbors=100 embedding.
Interactive_3D_UMAP(clusterable_embedding_100)

## Creating and exploring a DataFrame with 2D and 3D data

Working with:
- n_neighbors = 30
- min_dist = 0.0

In [None]:
# Number of embedded rows.
len(clusterable_embedding_30)

In [None]:
# Wrap the three embedding components in a frame with columns x, y, z.
user_df = pd.DataFrame(clusterable_embedding_30, columns=('x', 'y', 'z'))

In [None]:
# First rows of the coordinate frame.
user_df.head()

In [None]:
# Put the original columns (with 'user' restored as a column) side by side
# with the x/y/z coordinates. Rows are matched on the default integer index,
# and re-running this cell appends the data columns again.
user_df = pd.concat([data.reset_index(), user_df], axis=1)

In [None]:
# Check the combined frame.
user_df.head(2)

### Exploring 3D

In [None]:
def explore_Umap_3D(user_df,
                    list_users,
                    showbackground=True,
                    showticklabels=True):
    """Render the x/y/z embedding as an interactive Plotly 3D scatter.

    Parameters
    ----------
    user_df : DataFrame
        Must provide the columns 'user', 'acc_verif', 'label', 'x', 'y', 'z'.
    list_users : collection of str
        User names to highlight in orange. The argument itself is used;
        previously the colouring silently read the global ``list_users``.
    showbackground, showticklabels : bool
        Applied to all three scene axes.

    The figure is written to 'user_UMAP_3D.html' and then shown.

    Colour priority (same rules as ``add_color_size``): highlighted user ->
    orange, truthy acc_verif -> yellow, label 'low_freq' -> gray,
    'high_freq' -> red, anything else -> blue.
    """
    # Vectorised colouring: the first matching condition wins, which
    # replaces the much slower row-by-row DataFrame.apply.
    conditions = [
        user_df['user'].isin(list_users).values,
        user_df['acc_verif'].astype(bool).values,
        (user_df['label'] == 'low_freq').values,
        (user_df['label'] == 'high_freq').values,
    ]
    colors = np.select(conditions,
                       ['orange', 'yellow', 'gray', 'red'],
                       default='blue')

    fig = go.Figure(data=[
        go.Scatter3d(x=user_df.x,
                     y=user_df.y,
                     z=user_df.z,
                     mode='markers',
                     text = 'User: @'+ user_df['user'],
                     hoverinfo='text+x+y+z',
                     marker=dict(color=colors,
                                 sizemode='diameter',
                                 sizeref=1,
                                 size=.5,
                                 opacity=0.9))
    ])
    fig.update_layout(margin=dict(l=0, r=0, b=0, t=0),
                      scene=dict(xaxis=dict(showbackground=showbackground,
                                            showticklabels=showticklabels),
                                 yaxis=dict(showbackground=showbackground,
                                            showticklabels=showticklabels),
                                 zaxis=dict(showbackground=showbackground,
                                            showticklabels=showticklabels)))
    plotly.offline.plot(fig, filename='user_UMAP_3D.html', auto_open=False)
    fig.show()

In [None]:
def add_color_size(row):
    """Attach 'color' and 'size' entries to ``row`` and return it.

    Priority: user listed in the module-level ``list_users`` -> orange/50;
    truthy 'acc_verif' -> yellow/5; otherwise by 'label': 'low_freq' ->
    gray/.1, 'high_freq' -> red/5, any other value -> blue/3.
    The row is modified in place.
    """
    by_label = {'low_freq': ('gray', .1), 'high_freq': ('red', 5)}

    if row['user'] in list_users:
        style = ('orange', 50)
    elif row['acc_verif']:
        style = ('yellow', 5)
    else:
        style = by_label.get(row['label'], ('blue', 3))

    row['color'], row['size'] = style
    return row

In [None]:
# No highlighted users: plain 3D exploration of the embedding.
list_users = []
explore_Umap_3D(user_df, list_users)

### Exploring 2D

In [None]:
def explore_Umap_2D(user_df, list_users, x='x', y='y', title = '2D UMAP projection of the User dataset'):
    """Show two embedding axes as an interactive Bokeh scatter plot.

    Parameters
    ----------
    user_df : DataFrame
        Must provide 'user', 'acc_verif', 'label', 'fols_frs_ratio' and the
        columns named by ``x`` and ``y``. It is not modified.
    list_users : collection of str
        User names to highlight (orange, size 50). The argument itself is
        used; previously the styling silently read the global ``list_users``.
    x, y : str
        Column names plotted on the horizontal and vertical axes.
    title : str
        Text appended to the "(x,y)" prefix in the plot title.

    Marker style priority (same rules as ``add_color_size``): highlighted
    user -> orange/50, truthy acc_verif -> yellow/5, label 'low_freq' ->
    gray/.1, 'high_freq' -> red/5, anything else -> blue/3.
    """
    # Vectorised styling: the first matching condition wins, which replaces
    # the much slower row-by-row DataFrame.apply.
    conditions = [
        user_df['user'].isin(list_users).values,
        user_df['acc_verif'].astype(bool).values,
        (user_df['label'] == 'low_freq').values,
        (user_df['label'] == 'high_freq').values,
    ]
    styled_df = user_df.assign(
        color=np.select(conditions,
                        ['orange', 'yellow', 'gray', 'red'],
                        default='blue'),
        size=np.select(conditions, [50, 5, .1, 5], default=3),
    )
    datasource = ColumnDataSource(styled_df)

    p = figure(title='(' + x + ',' + y +') ' + title,
               plot_width=800,
               plot_height=800,
               tools=('crosshair, pan, wheel_zoom, reset, save, box_select'))
    p.xaxis.axis_label = x.upper()
    p.yaxis.axis_label = y.upper()

    p.add_tools(
        HoverTool(tooltips=[("user", "@user"), (
            "(x,y)",
            "($x, $y)"), ("acc_verif",
                          "@acc_verif"), ("fols_frs_ratio",
                                          "@fols_frs_ratio")]))
    p.circle(x,
             y,
             source=datasource,
             color='color',
             size='size',
             fill_alpha=0.6)

    show(p)

In [None]:
# No highlighted users: plain (x, y) view.
list_users = []
explore_Umap_2D(user_df, list_users, 'x', 'y')

### Verifying Simple Bots

In [None]:
# Highlight three accounts in the (x, y) plane.
list_users = ['fedoraletelier', 'Aliciacarafipl3', 'Dolores09072598']
explore_Umap_2D(user_df, list_users, 'x', 'y')

In [None]:
# Same three accounts, (x, z) plane.
list_users = ['fedoraletelier', 'Aliciacarafipl3', 'Dolores09072598']
explore_Umap_2D(user_df, list_users, 'x', 'z')

In [None]:
# Same three accounts, (y, z) plane.
list_users = ['fedoraletelier', 'Aliciacarafipl3', 'Dolores09072598']
explore_Umap_2D(user_df, list_users, 'y', 'z')

### Verifying Top High-Frequency users

In [None]:
# Highlight three accounts in the default (x, y) view.
list_users = ['AlbertoMayol','Tomaskovacic', 'NelsonCL28']
explore_Umap_2D(user_df, list_users)

### Verifying new suspicious bot users

In [None]:
# Ten accounts to highlight; this list is reused by the next two cells.
list_users = [
    'EncuestaExpress', 'RResponsablecl', 'cazadorandino90', 'Piagutierrezs',
    'NathalySeplved3', 'ElCentinelaMPE', 'AShumman', 'PamelaSoler3',
    'Sumate_Guillier', 'jav_ast'
]

# Default (x, y) view.
explore_Umap_2D(user_df, list_users)

In [None]:
# Same highlighted users, (x, z) plane.
explore_Umap_2D(user_df, list_users, 'x', 'z')

In [None]:
# Same highlighted users, (y, z) plane.
explore_Umap_2D(user_df, list_users, 'y', 'z')

### Verifying - Suspicious Users Cluster 0

In [None]:
# Accounts to highlight in the three "Cluster 0" plots that follow.
list_users = [
    'Aptimate', 'EncuestaExpress', 'jaimeantonio67', 'comandoguillie4',
    'RResponsablecl', 'mas_estudiantil', 'arqmneira', 'Maria_sepul66',
    'cazadorandino90', 'DulceViborita', 'FresiaPavez1', 'Sirius4321',
    'viejofasho', 'Ignacio90415476', 'EstudioOsorno', 'NathalySeplved3',
    'MSP_LasCondes', 'ElCentinelaMPE', 'caco_sanmartin', 'maasma58',
    'Dolores09072598', 'SoleAitkenP', 'JuanManuelCorn5', 'Trab_vXguillier',
    'c_alvan_0207', 'Santiag87306226', 'Edelarosacris', 'andres20der',
    'mrgrtgautier', 'BassaRiveros', 'Conapro_jjcc', 'SebastianFont20',
    'ChristianPolo7', 'almquin', 'chunchitomauro', 'Atletadelgol32',
    'laviedej', 'Ange_GossowF', 'Sumate_Guillier', 'solgensen',
    'FelipeAlegreJ', 'BarbManriquez', 'jav_ast'
]

In [None]:
# Cluster 0 list, (x, y) plane.
explore_Umap_2D(user_df, list_users, 'x', 'y', '3D UMAP Projection - Suspicious Users Cluster 0')

In [None]:
# Cluster 0 list, (x, z) plane.
explore_Umap_2D(user_df, list_users, 'x', 'z', '3D UMAP Projection - Suspicious Users Cluster 0')

In [None]:
# Cluster 0 list, (y, z) plane.
explore_Umap_2D(user_df, list_users, 'y', 'z', '3D UMAP Projection - Suspicious Users Cluster 0')

In [None]:
# Union of every user list used above, with duplicates removed.
# Two commas were missing (after the first 'Dolores09072598' and the first
# 'jav_ast'), so Python glued adjacent literals into the bogus names
# 'Dolores09072598AlbertoMayol' and 'jav_astAptimate', and 'AlbertoMayol'
# and 'Aptimate' were never highlighted. Sorting makes the order stable.
list_users = sorted({
    'fedoraletelier', 'Aliciacarafipl3', 'Dolores09072598',
    'AlbertoMayol', 'Tomaskovacic', 'NelsonCL28',
    'EncuestaExpress', 'RResponsablecl', 'cazadorandino90', 'Piagutierrezs',
    'NathalySeplved3', 'ElCentinelaMPE', 'AShumman', 'PamelaSoler3',
    'Sumate_Guillier', 'jav_ast',
    'Aptimate', 'EncuestaExpress', 'jaimeantonio67', 'comandoguillie4',
    'RResponsablecl', 'mas_estudiantil', 'arqmneira', 'Maria_sepul66',
    'cazadorandino90', 'DulceViborita', 'FresiaPavez1', 'Sirius4321',
    'viejofasho', 'Ignacio90415476', 'EstudioOsorno', 'NathalySeplved3',
    'MSP_LasCondes', 'ElCentinelaMPE', 'caco_sanmartin', 'maasma58',
    'Dolores09072598', 'SoleAitkenP', 'JuanManuelCorn5', 'Trab_vXguillier',
    'c_alvan_0207', 'Santiag87306226', 'Edelarosacris', 'andres20der',
    'mrgrtgautier', 'BassaRiveros', 'Conapro_jjcc', 'SebastianFont20',
    'ChristianPolo7', 'almquin', 'chunchitomauro', 'Atletadelgol32',
    'laviedej', 'Ange_GossowF', 'Sumate_Guillier', 'solgensen',
    'FelipeAlegreJ', 'BarbManriquez', 'jav_ast',
})

In [None]:
# Combined list, (x, y) plane.
explore_Umap_2D(user_df, list_users, 'x', 'y', '3D UMAP Projection - All Suspicious Users')

In [None]:
# Combined list, (x, z) plane.
explore_Umap_2D(user_df, list_users, 'x', 'z', '3D UMAP Projection - All Suspicious Users')

In [None]:
# Combined list, (y, z) plane.
explore_Umap_2D(user_df, list_users, 'y', 'z', '3D UMAP Projection - All Suspicious Users')