# SETUP

In [5]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Common imports
import os #It provides operating system operations.

import numpy as np

from numpy.random import rand #It returns random samples from a uniform distribution over [0, 1).
from numpy.random import randn #It returns random samples from the standard normal distribution.

import pandas as pd

# To plot pretty figures
%matplotlib inline 
import matplotlib as mpl 

# To create interactive plot
import matplotlib.pyplot as plt #It provides a MATLAB-like way of plotting.

# To load image
import matplotlib.image as mpimg #It supports image loading, rescaling and display operations.

# MaxNLocator: tick locator that selects no more than N intervals, placed at nice locations
from matplotlib.ticker import MaxNLocator

# To set labelsize
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import seaborn as sns

import html
# WordCloud: word-cloud visualization technique (the larger a word is drawn, the more significant it is)
# STOPWORDS: set of common, uninformative words to remove
from wordcloud import WordCloud, STOPWORDS

# CountVectorizer: transforms each document (message) into a vector based on how often each word of the corpus occurs in it
# TfidfVectorizer: does the same as CountVectorizer, but weights each word by tf-idf instead of raw frequency
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#lda for topic modelling: algo to find hidden topics (output) from preprocessed data (input)
from sklearn.decomposition import LatentDirichletAllocation

#interactive topic model visualization
import pyLDAvis
import pyLDAvis.sklearn

#to load more files and create a unique dataset
import glob

#
import random
#
import gensim
#
import re, nltk, spacy, string

#
from pprint import pprint

#
from gensim import corpora, models
from gensim.models import CoherenceModel

#
import collections
from nltk.util import ngrams
from wordcloud import WordCloud

#
import pyLDAvis.gensim_models
import pyLDAvis

#
from sklearn.cluster import KMeans

#
from collections import Counter

#
import textblob
from textblob import TextBlob

#
import json

#
from geopy.geocoders import Nominatim
from geopy.point import Point

#
import folium
from folium.plugins import MarkerCluster

#
import time

#
import socket
from urllib3.connection import HTTPConnection

#
import warnings

#
import requests
from bs4 import BeautifulSoup

#
from urllib.parse import urlparse

#
import ast

#
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
# Root folder under which the data, the results and the images are stored
PATH = "../"

# One sub-folder per kind of artefact
DATA_PATH = os.path.join(PATH, "data")
RESULTS_PATH = os.path.join(PATH, "results")
IMAGES_PATH = os.path.join(PATH, "images")

# Create every folder that is still missing (an existing folder is not an error)
for _directory in (DATA_PATH, RESULTS_PATH, IMAGES_PATH):
    os.makedirs(_directory, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to a file inside IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (without extension).
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str, default "png"
        File extension; also passed to ``plt.savefig`` as ``format``.
    resolution : int, default 300
        Passed to ``plt.savefig`` as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)

    # Adjust the layout first so that the saved file reflects it
    if tight_layout:
        plt.tight_layout()

    plt.savefig(target, format=fig_extension, dpi=resolution)

See the following documentation for the libraries used above:

1. [`numpy`](https://numpy.org/doc/)
1. [`numpy.random.rand`](https://numpy.org/doc/reference/random/generated/numpy.random.rand.html)
1. [`numpy.random.randn`](https://numpy.org/doc/reference/random/generated/numpy.random.randn.html)
1. [`matplotlib`](https://matplotlib.org/)
1. [`matplotlib.pyplot`](https://matplotlib.org/stable/api/pyplot_summary.html)
1. [`matplotlib.image`](https://matplotlib.org/stable/api/image_api.html)