# User Oriented Dataset - UMAP
- UDD/MDS18/PP4
- Final Project: 'Studying Twitter User Accounts: Spotting Suspicious Social Bot Behavior'
- Master Candidate: Marcelo Rovai
- Professors:	   
    - Eduardo Graells-Garrido (Supervisor)
    - Loreto Bravo
    - Leo Ferres

## Main Libraries & Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split
import umap

In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper
from bokeh.palettes import Spectral10

# Make Bokeh's show() render its plots inline in the notebook.
output_notebook()

In [None]:
import warnings
# Silence every warning for the rest of the session; note that this also
# hides deprecation notices emitted by the plotting libraries.
warnings.filterwarnings('ignore')

In [None]:
# Print floats without decimals in DataFrame output.
pd.set_option('display.float_format', lambda x: '%.f' % x)
# seaborn's set_context only applies the rc keys that belong to its context
# definition (font and line sizes); any other key, such as 'figure.figsize',
# is silently dropped. The default figure size is therefore set directly on
# matplotlib's rcParams.
sns.set_context("paper",
                rc={
                    "font.size": 12,
                    "axes.titlesize": 12,
                    "axes.labelsize": 12,
                })
plt.rcParams['figure.figsize'] = (14, 10)

## Main Functions

In [None]:
def plot_feature(df, feat, kde=True, prt=False, save_file_name=None, x_max = None):
    """Plot the distribution of one column as a filled density curve.

    Parameters
    ----------
    df : table indexable by column name (``df[feat]``).
    feat : str
        Column to plot; also used in the title and as the x label.
    kde : bool
        Forwarded to ``sns.distplot``. The histogram is always disabled, so
        ``kde=False`` draws no curve at all.
    prt : bool
        When truthy, the figure is saved under ``../images/``.
    save_file_name : str or None
        File name (including extension) used when saving. When omitted, the
        name defaults to ``feat + ".png"`` instead of failing on
        ``str + None``.
    x_max : number or None
        Upper x limit; ``None`` leaves the upper limit automatic. The lower
        limit is always pinned at 0.
    """
    _, ax = plt.subplots(ncols=1, figsize=(10, 4))
    sns.despine(left=True)
    sns.distplot(df[feat],
                 color="m",
                 hist=False,
                 kde=kde,
                 kde_kws={
                     'shade': True,
                     'linewidth': 3
                 },
                 ax=ax)
    plt.suptitle(feat + " distribution", fontsize=20)
    plt.xlabel(feat, fontsize=15)
    # Only the shape of the curve matters here, so the y ticks are removed.
    plt.setp(ax, yticks=[])
    plt.xlim(0, x_max)

    if prt:
        if save_file_name is None:
            save_file_name = feat + ".png"
        plt.savefig("../images/" + save_file_name, bbox_inches='tight')

In [None]:
def get_df_name(df):
    """Return the name of a module-level variable bound to ``df``.

    The lookup is by identity (``is``) over this module's globals, so the
    object must be stored in a notebook/module-level variable.

    Names starting with an underscore are used only when no public name is
    bound to the object. This keeps IPython's output-cache variables
    (``_``, ``__``, ``_<n>``) from being picked as the plot label when the
    same object was displayed earlier in the session.

    Raises
    ------
    IndexError
        If no global variable is bound to ``df``.
    """
    # Snapshot the items so the scan is not affected by concurrent rebinding.
    bound = [name for name, value in list(globals().items()) if value is df]
    if not bound:
        raise IndexError("no global variable is bound to the given object")
    public = [name for name in bound if not name.startswith('_')]
    return (public or bound)[0]

In [None]:
def plot_feat_comp(df1,
                   df2,
                   feat,
                   color1='m',
                   color2='g',
                   prt=False,
                   x_max=None,
                  logx=False,
                  logy=False):
    """Overlay the density curves of one feature for two frames on one axes.

    Parameters
    ----------
    df1, df2 : tables indexable by column name. Both must be bound to
        module-level variables, because their variable names (resolved with
        ``get_df_name``) are used as labels, in the title and in the saved
        file name.
    feat : str
        Column to compare.
    color1, color2 : colors of the first and second curve.
    prt : bool
        When truthy, save the figure as
        ``../images/<name1>-<name2>-<feat>.png``.
    x_max : number or None
        Upper x limit. ``None`` leaves the axis fully autoscaled.
    logx, logy : bool
        Switch the corresponding axis to a logarithmic scale.
    """
    feat1 = df1[feat]
    feat2 = df2[feat]
    label_1 = get_df_name(df1)
    label_2 = get_df_name(df2)
    fig, ax = plt.subplots(ncols=1, figsize=(10, 5))

    for values, color, label in ((feat1, color1, label_1),
                                 (feat2, color2, label_2)):
        sns.distplot(values,
                     color=color,
                     label=label,
                     hist=False,
                     kde=True,
                     kde_kws={
                         'shade': True,
                         'linewidth': 3
                     },
                     ax=ax)

    if logx:
        ax.set_xscale('log')
    if logy:
        ax.set_yscale('log')

    # A single positional argument to xlim/set_xlim is the LEFT limit, and
    # calling it between the two curves freezes autoscaling before the second
    # one is drawn. The upper bound is therefore set by keyword, once, after
    # both curves exist, and only when the caller asked for one.
    if x_max is not None:
        ax.set_xlim(right=x_max)

    plt.suptitle(label_1 + "-" + label_2 + "  " + feat + ' distribution',
                 fontsize=16)
    sns.despine(left=True)
    plt.setp(ax, yticks=[])

    if prt:
        plt.savefig("../images/" + label_1 + "-" + label_2 + "-" + feat+".png",
                    bbox_inches='tight')

In [None]:
def plot_feat_comp_2_graph(df1,
                           df2,
                           feat,
                           color1='m',
                           color2='g',
                           prt=False,
                           logx=False,
                           logy=False):
    """Draw the density of one feature for two frames in side-by-side panels.

    ``df1`` goes to the left panel and ``df2`` to the right one. The frames'
    variable names (looked up with ``get_df_name``) are used as labels, in
    the title and in the file name written when ``prt`` is True.
    ``logx`` / ``logy`` switch the respective axis of both panels to a
    logarithmic scale.
    """
    series_left = df1[feat]
    series_right = df2[feat]
    name_left = get_df_name(df1)
    name_right = get_df_name(df2)

    fig, panels = plt.subplots(ncols=2, figsize=(10, 4))
    sns.despine(left=True)

    density_style = {'shade': True, 'linewidth': 3}
    layers = (
        (series_left, color1, name_left, panels[0]),
        (series_right, color2, name_right, panels[1]),
    )
    for values, tint, legend_name, panel in layers:
        drawn = sns.distplot(values,
                             color=tint,
                             label=legend_name,
                             hist=False,
                             kde=True,
                             kde_kws=dict(density_style),
                             ax=panel)
        if logx:
            drawn.set_xscale('log')
        if logy:
            drawn.set_yscale('log')

    heading = "".join([name_left, "-", name_right, "  ", feat, " distribution"])
    plt.suptitle(heading, fontsize=16)
    # Hide the y ticks on both panels.
    plt.setp(panels, yticks=[])

    should_save = (prt == True)
    if should_save:
        target = "".join(["../images/", name_left, "-", name_right, "-", feat, ".png"])
        plt.savefig(target, bbox_inches='tight')

## Import Dataset

In [None]:
# IPython shell shortcut (not Python syntax): list the contents of ../data/.
ls ../data/

In [None]:
# Load the labelled user-metrics table; the cell output shows (rows, columns).
df = pd.read_csv("../data/60_user_metrics_label.csv", lineterminator='\n', low_memory=False)
df.shape

In [None]:
# Preview the first two rows.
df.head(2)

In [None]:
# Drop the 'Unnamed: 0' column (presumably an index written by an earlier
# export — confirm). errors='ignore' keeps the cell re-runnable: a plain
# `del` raises KeyError once the column is already gone.
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Label distribution among the rows whose acc_verif flag equals 1.
df[df.acc_verif == 1].label.value_counts()

## Prepare data

In [None]:
# Work on a copy so `df` keeps its `user` column; the copy is indexed by it.
data = df.copy()
data.set_index("user", inplace = True)
data.head(2)

In [None]:
# Feature matrix: every column except the last one (the last column is used
# as the target further down). `.values` on a frame that also holds the label
# column returns a single array typed to fit all columns, so the feature
# slice is cast to float explicitly.
X_train = data.values[:, :-1].astype(float)

In [None]:
# Display the feature matrix.
X_train

In [None]:
# Log-scaling of the features is left disabled:
#X_train = np.log(X_train + 1)
n = X_train.shape[0]
print("\nThis data set contains " + str(n) + " samples")
# Target vector: the last column of `data`.
y_train = data.values[:,data.shape[1]-1]
print("\nDimensions of the  data set: ")
print(X_train.shape, y_train.shape)

## Apply Model

### Default parameters:

In [None]:
# Default hyper-parameters, with the same fixed seed that plot_UMAP uses so
# that this embedding is reproducible from run to run as well.
reducer = umap.UMAP(random_state=42)

In [None]:
# Fit the reducer on the features and project every sample; the cell output
# shows the shape of the result.
embedding = reducer.fit_transform(X_train)
embedding.shape

In [None]:
# Display the raw embedding coordinates.
embedding

In [None]:
def plt_color(lst):
    """Map each label to a scatter color and marker size.

    'low_freq' -> ('gray', .1), 'high_freq' -> ('red', 50), anything else
    -> ('blue', 25). Returns two lists (colors, sizes) in the order of
    ``lst``.
    """
    def style_for(label):
        if label == 'low_freq':
            return 'gray', .1
        if label == 'high_freq':
            return 'red', 50
        return 'blue', 25

    styles = [style_for(label) for label in lst]
    cols = [color for color, _ in styles]
    size = [marker for _, marker in styles]
    return cols, size

In [None]:
# Color and size every point by its label, then draw the default embedding.
cols, size = plt_color(y_train)
plt.figure(figsize=(15,15))
plt.scatter(embedding[:, 0], embedding[:, 1], c=cols, s=size)
# Same scale on both axes, adjusting the data limits to achieve it.
plt.gca().set_aspect('equal', 'datalim')
# The trailing semicolon suppresses the text repr of the title in the output.
plt.title('UMAP projection of the User dataset', fontsize=15);

---

### UMAP enhanced clustering

In [None]:
def plot_UMAP(X_train, n_neighbors=15, min_dist=0.1, save=False, labels=None,
              random_state=42):
    """Fit a 2-component UMAP on ``X_train``, scatter-plot it and return it.

    Parameters
    ----------
    X_train : feature matrix passed to ``umap.UMAP(...).fit_transform``.
    n_neighbors, min_dist : UMAP hyper-parameters; also shown in the title
        and embedded in the saved file name.
    save : bool
        When truthy, write the figure to
        ``../images/UMAP-User_Dataset_Result_n_neighbors=<n>_min_dist=<d>.png``.
    labels : sequence or None
        One label per row of ``X_train``, turned into point colors and sizes
        by ``plt_color``. Defaults to the notebook-level ``y_train``, which
        is what the function always used before this parameter existed.
    random_state : seed forwarded to UMAP (42 by default, as before).

    Returns
    -------
    The embedding returned by ``fit_transform``; its first two columns are
    the plotted x and y coordinates.
    """
    if labels is None:
        labels = y_train

    clusterable_embedding = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=2,
        random_state=random_state,
    ).fit_transform(X_train)

    cols, size = plt_color(labels)
    plt.figure(figsize=(15, 15))
    plt.scatter(clusterable_embedding[:, 0],
                clusterable_embedding[:, 1],
                c=cols,
                s=size)
    plt.gca().set_aspect('equal', 'datalim')
    plt.title('2D UMAP Projection - n_neighbors=' + str(n_neighbors) +
              ' - min_dist=' + str(min_dist),
              fontsize=15)
    if save:
        plt.savefig('../images/UMAP-User_Dataset_Result_n_neighbors=' +
                    str(n_neighbors) + '_min_dist=' + str(min_dist)+'.png',
                    bbox_inches='tight')
    return clusterable_embedding

In [None]:
# Parameter sweep: each cell below fits, plots and saves one projection.
# First run: n_neighbors=3 with min_dist=0.1.
n_neighbors=3
min_dist=0.1
clusterable_embedding = plot_UMAP(X_train, n_neighbors, min_dist, save=True)

In [None]:
# Remaining runs use min_dist=0.0; each result is kept in its own variable.
n_neighbors=3
min_dist=0.0
clusterable_embedding_3 = plot_UMAP(X_train, n_neighbors, min_dist, save=True)

In [None]:
n_neighbors=5
min_dist=0.0
clusterable_embedding_5 = plot_UMAP(X_train, n_neighbors, min_dist, save=True)

In [None]:
n_neighbors=10
min_dist=0.0
clusterable_embedding_10 = plot_UMAP(X_train, n_neighbors, min_dist, save=True)

In [None]:
n_neighbors=15
min_dist=0.0
clusterable_embedding_15 = plot_UMAP(X_train, n_neighbors, min_dist, save=True)

In [None]:
# This embedding (n_neighbors=30) is the one turned into `user_df` below.
n_neighbors=30
min_dist=0.0
clusterable_embedding_30 = plot_UMAP(X_train, n_neighbors, min_dist, save=True)

In [None]:
n_neighbors=50
min_dist=0.0
clusterable_embedding_50 = plot_UMAP(X_train, n_neighbors, min_dist, save=True)

In [None]:
n_neighbors=100
min_dist=0.0
clusterable_embedding_100 = plot_UMAP(X_train, n_neighbors, min_dist, save=True)

### Creation and exploration: DataFrame with 2D data

Working with:
- n_neighbors = 30
- min_dist = 0.0

In [None]:
# Put the two embedding coordinates of the n_neighbors=30 run in a frame.
user_df = pd.DataFrame(clusterable_embedding_30, columns=('x', 'y'))

In [None]:
# Append the coordinates to the original columns. concat(axis=1) aligns on
# the index, so this relies on `df` and the embedding sharing the same row
# index — presumably the default positional one; verify if `df` is filtered
# or re-indexed upstream.
user_df = pd.concat([df, user_df], axis=1)

In [None]:
user_df.head(2)

In [None]:
def add_color_size(row, users=None):
    """Attach the plotting ``color`` and ``size`` to one user row.

    Precedence of the rules:
      1. user listed in ``users``      -> orange, size 50
      2. truthy ``acc_verif``          -> yellow, size 5
      3. label 'low_freq'              -> gray,   size .1
      4. label 'high_freq'             -> red,    size 5
      5. any other label               -> blue,   size 3

    Parameters
    ----------
    row : mapping-like with 'user', 'acc_verif' and 'label' entries that
        supports item assignment (for example a row handed over by
        ``DataFrame.apply(..., axis=1)``).
    users : collection of user names to highlight. When omitted, the
        notebook-level ``list_users`` is used, which was the only behaviour
        before this parameter existed. With ``DataFrame.apply`` it can be
        supplied as a keyword: ``df.apply(add_color_size, axis=1, users=...)``.

    Returns
    -------
    The same ``row`` object, with 'color' and 'size' set.
    """
    if users is None:
        users = list_users

    if row['user'] in users:
        color, size = 'orange', 50
    elif row['acc_verif']:
        color, size = 'yellow', 5
    else:
        freq = row['label']
        if freq == 'low_freq':
            color, size = 'gray', .1
        elif freq == 'high_freq':
            color, size = 'red', 5
        else:
            color, size = 'blue', 3

    row['color'] = color
    row['size'] = size

    return row

In [None]:
def explore_Umap(user_df, list_users, title = 'UMAP projection of the User dataset',
                 return_source=False):
    """Show an interactive Bokeh scatter of the 2D projection.

    Point styling, in order of precedence:
      user in ``list_users`` -> orange/50, truthy ``acc_verif`` -> yellow/5,
      label 'low_freq' -> gray/.1, label 'high_freq' -> red/5,
      otherwise blue/3.

    The styling is computed here from the ``list_users`` argument. It used
    to be delegated to a row-wise helper that read a notebook-level
    variable, so the list passed in by the caller had no effect on which
    users were highlighted.

    Parameters
    ----------
    user_df : DataFrame with at least the columns 'user', 'acc_verif',
        'label', 'x', 'y' and 'fols_frs_ratio' (the last one is shown in the
        hover tooltip). It is not modified.
    list_users : iterable of user names to highlight.
    title : plot title.
    return_source : bool
        When True, return the ``ColumnDataSource`` behind the plot so the
        caller can inspect it afterwards. The default returns None, as
        before.
    """
    highlighted = user_df['user'].isin(list(list_users)).values
    verified = user_df['acc_verif'].astype(bool).values
    label = user_df['label']
    # np.select applies the first matching rule, mirroring an if/elif chain.
    rules = [
        highlighted,
        verified,
        (label == 'low_freq').values,
        (label == 'high_freq').values,
    ]
    styled = user_df.assign(
        color=np.select(rules, ['orange', 'yellow', 'gray', 'red'],
                        default='blue'),
        size=np.select(rules, [50, 5, .1, 5], default=3),
    )
    datasource = ColumnDataSource(styled)

    p = figure(title=title,
               plot_width=800,
               plot_height=800,
               tools=('crosshair, pan, wheel_zoom, reset, save, box_select'))
    p.xaxis.axis_label = 'X'
    p.yaxis.axis_label = 'Y'

    p.add_tools(
        HoverTool(tooltips=[
            ("user", "@user"),
            ("(x,y)", "($x, $y)"),
            ("acc_verif", "@acc_verif"),
            ("fols_frs_ratio", "@fols_frs_ratio")
        ]))
    p.circle('x',
             'y',
             source=datasource,
             color='color',
             size='size',
             fill_alpha=0.6)

    show(p)

    if return_source:
        return datasource

### Verifying Simple Bots

In [None]:
# Accounts to highlight in the projection.
list_users = ['fedoraletelier', 'Aliciacarafipl3', 'Dolores09072598']
explore_Umap(user_df, list_users)

### Verifying Top High-Frequency users

In [None]:
# Accounts to highlight in the projection.
list_users = ['AlbertoMayol','Tomaskovacic', 'NelsonCL28']
explore_Umap(user_df, list_users)

### Verifying new users suspected of being bots

In [None]:
# Accounts to highlight in the projection.
list_users = [
    'EncuestaExpress', 'RResponsablecl', 'cazadorandino90', 'Piagutierrezs',
    'NathalySeplved3', 'ElCentinelaMPE', 'AShumman', 'PamelaSoler3',
    'Sumate_Guillier', 'jav_ast'
]

explore_Umap(user_df, list_users, 'Suspitious New Users - 2D UMAP Projection')

### Verifying all users suspected of being bots

In [None]:
# Union of the three lists used in the previous sections. Every entry needs
# its own trailing comma: adjacent string literals are concatenated, which
# had merged 'Dolores09072598' and 'AlbertoMayol' into one unknown name.
list_users = [
    'fedoraletelier', 'Aliciacarafipl3', 'Dolores09072598',
    'AlbertoMayol', 'Tomaskovacic', 'NelsonCL28',
    'EncuestaExpress', 'RResponsablecl', 'cazadorandino90', 'Piagutierrezs',
    'NathalySeplved3', 'ElCentinelaMPE', 'AShumman', 'PamelaSoler3',
    'Sumate_Guillier', 'jav_ast',
]

explore_Umap(user_df, list_users, 'Suspitious Users - 2D UMAP Projection')

### Verifying - Suspicious Users Cluster 0

In [None]:
# Accounts of "cluster 0" to highlight; kept in their own variable rather
# than in `list_users`.
bots_users_lst = [
    'Aptimate', 'EncuestaExpress', 'jaimeantonio67', 'comandoguillie4',
    'RResponsablecl', 'mas_estudiantil', 'arqmneira', 'Maria_sepul66',
    'cazadorandino90', 'DulceViborita', 'FresiaPavez1', 'Sirius4321',
    'viejofasho', 'Ignacio90415476', 'EstudioOsorno', 'NathalySeplved3',
    'MSP_LasCondes', 'ElCentinelaMPE', 'caco_sanmartin', 'maasma58',
    'Dolores09072598', 'SoleAitkenP', 'JuanManuelCorn5', 'Trab_vXguillier',
    'c_alvan_0207', 'Santiag87306226', 'Edelarosacris', 'andres20der',
    'mrgrtgautier', 'BassaRiveros', 'Conapro_jjcc', 'SebastianFont20',
    'ChristianPolo7', 'almquin', 'chunchitomauro', 'Atletadelgol32',
    'laviedej', 'Ange_GossowF', 'Sumate_Guillier', 'solgensen',
    'FelipeAlegreJ', 'BarbManriquez', 'jav_ast'
]
explore_Umap(user_df, bots_users_lst, 'Suspitious Users Cluster 0 - 2D UMAP Projection')

### Creating selected Clusters

<img src="UMAP_clusters.png">

In [None]:
# NOTE(review): in the visible code `datasource` only exists as a local
# variable inside explore_Umap and is never bound at notebook level —
# confirm where this name comes from; otherwise this cell raises NameError.
datasource.selected.indices

In [None]:
# Cluster 1: points strictly inside the box 7 < x < 12, -10 < y < -5.
# The variable names matter: the comparison plots label their curves with
# the name of the variable each frame is bound to.
user_cluster_1 = user_df[(user_df.x > 7) 
                         & (user_df.x < 12) 
                         & (user_df.y < -5)
                         & (user_df.y > -10)]
user_cluster_1.shape

In [None]:
# Cluster 2: points strictly inside the box 9 < x < 13, -4 < y < -2.2.
user_cluster_2 = user_df[(user_df.x > 9) 
                         & (user_df.x < 13) 
                         & (user_df.y < -2.2)
                         & (user_df.y > -4)]
user_cluster_2.shape

In [None]:
# Cluster 3: points strictly inside the box -3 < x < 0.5, -13 < y < -10.
user_cluster_3 = user_df[(user_df.x > -3) 
                         & (user_df.x < 0.5) 
                         & (user_df.y < -10)
                         & (user_df.y > -13)]
user_cluster_3.shape

In [None]:
# Summary statistics per column, transposed to one row per feature.
user_cluster_1.describe().T

In [None]:
user_cluster_3.describe().T

In [None]:
# Column names plotted one by one in the next cell.
features = ['default_prof_image', 'acc_verif', 'local_bol',
       'default_profile', 'descr_bol', 'active_acc', 'tws_cnt', 'frs_cnt',
       'fols_cnt', 'fav_cnt', 'tweets_analysed', 'account_age_days',
       'ave_acc_tw_day', 'ave_recent_tw_day', 'max_tweet_day',
       'ment_tweets_cnt', 'hash_tweets_cnt', 'unique_mentions_cnt',
       'unique_hashtags_cnt', 'ment_idx', 'hash_idx', 'rt_ratio', 'mean_urls',
       'mean_lenght', 'recent_age_days', 'user_name_len', 'name_len',
       'user_name_len_num', 'simil_name', 'descr_len', 'fols_frs_ratio']

In [None]:
# One density plot per feature for cluster 1; nothing is saved to disk.
for feat in features:
    plot_feature(user_cluster_1, feat, prt=False, save_file_name=None)

In [None]:
# Cluster 1 vs. cluster 3, one feature per cell. Calls with prt=True also
# save the figure under ../images/.
feat = 'max_tweet_day'
plot_feat_comp_2_graph(user_cluster_1, user_cluster_3, feat, logx=False, prt=True)

In [None]:
# NOTE: both plotting helpers build the same file name from the two frame
# names and the feature, so this save replaces the file written by the
# previous cell.
feat = 'max_tweet_day'
plot_feat_comp(user_cluster_1, user_cluster_3, feat, logx=True, logy=False, prt=True)

In [None]:
plot_feat_comp(user_cluster_1, user_cluster_3,'descr_len', prt=True) 

In [None]:
plot_feat_comp(user_cluster_1, user_cluster_3,'simil_name', prt=True) 

In [None]:
plot_feat_comp(user_cluster_1, user_cluster_3,'user_name_len_num') 

In [None]:
plot_feat_comp(user_cluster_1, user_cluster_3,'name_len') 

In [None]:
plot_feat_comp(user_cluster_1, user_cluster_3,'user_name_len',prt=True) 

In [None]:
plot_feat_comp(user_cluster_1, user_cluster_3,'mean_lenght', prt=False ) 

In [None]:
plot_feat_comp(user_cluster_1, user_cluster_3,'mean_urls') 

In [None]:
plot_feat_comp(user_cluster_1, user_cluster_3,'rt_ratio', prt=True ) # (rt_tweets_cnt/tweets_analysed)*100

In [None]:
plot_feat_comp(user_cluster_1, user_cluster_3,'hash_idx') # unique_hashtags_cnt/hash_tweets_cnt

In [None]:
plot_feat_comp(user_cluster_1, user_cluster_3, 'ment_idx', prt=False,
               x_max=10)  # unique_mentions_cnt/ment_tweets_cnt

In [None]:
plot_feat_comp(user_cluster_1, user_cluster_3,'unique_hashtags_cnt', prt=False, x_max=100)

In [None]:
# The count features below are drawn in two separate panels, one per cluster.
plot_feat_comp_2_graph(user_cluster_1, user_cluster_3,'unique_mentions_cnt')

In [None]:
plot_feat_comp_2_graph(user_cluster_1, user_cluster_3,'hash_tweets_cnt')

In [None]:
plot_feat_comp_2_graph(user_cluster_1, user_cluster_3,'ment_tweets_cnt')

In [None]:
plot_feat_comp_2_graph(user_cluster_1, user_cluster_3,'ave_recent_tw_day')

In [None]:
plot_feat_comp_2_graph(user_cluster_1, user_cluster_3,'ave_acc_tw_day')

In [None]:
plot_feat_comp(user_cluster_1, user_cluster_3,'account_age_days', prt=True)

In [None]:
plot_feat_comp(user_cluster_1, user_cluster_3,'recent_age_days', prt=False)

In [None]:
plot_feat_comp_2_graph(user_cluster_1, user_cluster_3,'tws_cnt')

In [None]:
plot_feat_comp(user_cluster_1, user_cluster_3,'frs_cnt', prt=False, x_max=5_000)

In [None]:
plot_feat_comp_2_graph(user_cluster_1, user_cluster_3,'fols_cnt', prt=False)

In [None]:
plot_feat_comp_2_graph(user_cluster_1, user_cluster_3,'fols_frs_ratio') # followers / following

In [None]:
plot_feat_comp_2_graph(user_cluster_1, user_cluster_3,'fav_cnt') # likes

In [None]:
plot_feat_comp_2_graph(user_cluster_1, user_cluster_3,'tweets_analysed')