## Install dependencies

In [None]:
!pip install tweepy
!pip install pymongo
!pip install vaderSentiment
!pip install emoji

## Import some needed scripts

In [None]:
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from sklearn import preprocessing

from api import API
from dataset import getFeaturesFromDBData, Database
import ml

## Define some constants needed

In [None]:
# Connection string for the MongoDB instance: a local server on port 27017,
# direct connection, SSL disabled, primary read preference.
MONGO_DB_URI = "mongodb://localhost:27017/?readPreference=primary&appname=MongoDB%20Compass&directConnection=true&ssl=false"
# CSV file name passed to the data-gathering step together with MONGO_DB_URI.
# NOTE(review): presumably the list of user accounts to collect - confirm in api.py.
USER_DATASET_FILE = "users_dataset.csv"

## Initialize the Twitter API

In [None]:
# Create the project's API helper (imported from the local `api` module).
# NOTE(review): presumably this sets up the Twitter client/credentials - confirm in api.py.
api = API();

## Gather data using the Twitter API and populate MongoDB

We gather the data so as to keep a roughly 50/50 split between bots and humans.

In [None]:
# Gather tweet data using the Twitter API and store it in MongoDB.
# Takes the MongoDB connection string and the users CSV file name defined above.
# NOTE(review): presumably slow and network-bound - confirm before re-running.
api.getDataFromTweeterInDB(MONGO_DB_URI,USER_DATASET_FILE)

## Create our features dataset using the data we have in the database

In [None]:
# Load the data from MongoDB, calculate the features and return the feature dataset.
# NOTE(review): assumes a pandas DataFrame with a "label" column holding
# "HUMAN"/"BOT" strings - confirm in dataset.py.
dataset = getFeaturesFromDBData(MONGO_DB_URI)

# fillna() returns a new object instead of modifying in place, so the result
# has to be assigned back; otherwise the null values would still be present.
dataset = dataset.fillna(value=0)

# Save the unscaled features (with the original text labels) to CSV.
dataset.to_csv("dataset.csv")

# Encode the target before scaling: HUMAN -> 0, BOT -> 1.
dataset["label"] = dataset["label"].map({"HUMAN": 0, "BOT": 1})

# Standardize the feature columns only. preprocessing.scale() returns a plain
# NumPy array, so the scaled values are written back into the DataFrame to keep
# the column names and the positional .iloc indexing used by the plots below.
# The label column is excluded so that it stays 0/1.
feature_columns = [column for column in dataset.columns if column != "label"]
dataset[feature_columns] = preprocessing.scale(dataset[feature_columns])
pprint('Done!')

## Train supervised models and get metrics

In [None]:
# Run the supervised training helper on the feature dataset and report each
# returned entry: its 'name' followed by its pretty-printed 'metrics'.
results = ml.trainSupervisedModels(dataset)
for result in results:
    header = 'Model : ' + result['name']
    print(header)
    pprint(result['metrics'])
    print('-----')  # visual separator between models

## Compare supervised algorithms

In [None]:
# Compare the supervised models using the per-model results returned by
# ml.trainSupervisedModels in the previous cell.
# NOTE(review): presumably renders a comparison chart - confirm in ml.py.
ml.supervisedModelComparison(results)

## Unsupervised Algorithms

### Find the appropriate cluster number

In [None]:
# Helper from the local `ml` module for choosing the K-Means cluster count.
# NOTE(review): presumably plots an elbow-style curve - confirm in ml.py.
# The trailing ';' suppresses the cell's return-value output.
ml.findeKmeansClusterNumber(dataset);

### Train K-Means model

In [None]:
# Fit the unsupervised model on the feature dataset. The returned value is used
# in the next cell as the per-point colour of the K-Means scatter plot.
# NOTE(review): presumably one cluster id per dataset row - confirm in ml.py.
cluster = ml.trainUnsupervisedModels(dataset)

### K-Means Clustering based on Activity

In [None]:
# Scatter two feature columns (selected by position) coloured by K-Means cluster.
# Both axes read from `dataset`: the y-values previously came from `df_std`,
# a name that is never defined in this notebook and fails on a fresh kernel.
# NOTE(review): the hierarchical plot further down uses column 25 on the x-axis
# while this one uses 21 - confirm which "activity" feature is intended.
kmeans_x_col, kmeans_y_col = 21, 31
plt.figure(figsize=(10, 8))
plt.scatter(
    dataset.iloc[:, kmeans_x_col],
    dataset.iloc[:, kmeans_y_col],
    c=cluster,       # plot points with cluster dependent colors
    cmap='prism',
)
plt.xlabel(str(dataset.columns[kmeans_x_col]))
plt.ylabel(str(dataset.columns[kmeans_y_col]))
plt.title('K-Means Clustering based on Activity')
plt.show()

### Hierarchical clustering for the same dataset

In [None]:
# creating a dataset for hierarchical clustering
dataset2_standardized = dataset.copy()
# needed imports
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
import numpy as np
# some setting for this notebook to actually show the graphs inline
# you probably won't need this
%matplotlib inline
np.set_printoptions(precision=5, suppress=True)  # suppress scientific float notation
#creating the linkage matrix
H_cluster = linkage(dataset2_standardized,'ward')
plt.title('Hierarchical Clustering Dendrogram (truncated)')
plt.xlabel('sample index or (cluster size)')
plt.ylabel('distance')
dendrogram(
    H_cluster,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=5,  # show only the last p merged clusters
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,  # to get a distribution impression in truncated branches
)
plt.show()

### Hierarchical Clustering based on Activity

In [None]:
# Assigning the clusters and plotting the observations as per hierarchical clustering
# Assign each observation to a flat cluster from the hierarchy and plot the
# observations coloured by that assignment.
k = 5  # passed to fcluster with criterion='maxclust' (upper bound on the cluster count)
cluster_2 = fcluster(H_cluster, k, criterion='maxclust')

# The bare `cluster_2[0:30:,]` expression that used to sit here was dropped: it was
# not the last statement of the cell, so it displayed nothing and had no effect.
# NOTE(review): this plot uses column 25 on the x-axis while the K-Means plot
# uses column 21 - confirm which "activity" feature is intended.
hier_x_col, hier_y_col = 25, 31
plt.figure(figsize=(10, 8))
plt.scatter(
    dataset2_standardized.iloc[:, hier_x_col],
    dataset2_standardized.iloc[:, hier_y_col],
    c=cluster_2,     # plot points with cluster dependent colors
    cmap='prism',
)
plt.xlabel(str(dataset2_standardized.columns[hier_x_col]))
plt.ylabel(str(dataset2_standardized.columns[hier_y_col]))
plt.title('Hierarchical Clustering based on Activity')  # fixed typo: "Clutering"
plt.show()