## Vectorizing data

#### Uploading the .db file

In [None]:
# Upload local files into the Colab runtime via google.colab's upload helper.
# NOTE(review): presumably this is where `telegram_data.db` (read later through
# JDBC) is provided — confirm the uploaded file name matches that path.
from google.colab import files
uploaded = files.upload()

#### Downloading the required JAR files

In [None]:
!wget https://repo1.maven.org/maven2/org/xerial/sqlite-jdbc/3.34.0/sqlite-jdbc-3.34.0.jar
!wget https://repo1.maven.org/maven2/com/johnsnowlabs/nlp/spark-nlp_2.12/5.1.4/spark-nlp_2.12-5.1.4.jar
!wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.900/aws-java-sdk-bundle-1.11.900.jar

#### Installing the necessary libraries

In [None]:
!pip install pyspark
!pip install spark-nlp==5.1.4
!pip install findspark

#### Import of necessary libraries

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import Word2Vec
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import LemmatizerModel, Tokenizer as SparkNLPTokenizer, StopWordsCleaner

#### Creating a spark session

In [None]:
# Create (or reuse) the Spark session with the downloaded JARs on the classpath.
# The glob is restricted to '*.jar': the previous './*' pattern matched every
# entry in the working directory, including non-JAR files such as the uploaded
# SQLite database.
spark = (
    SparkSession.builder
    .config('spark.jars', './*.jar')
    .getOrCreate()
)

#### Loading the posts table into a DataFrame

In [None]:
# Read the `posts` table from the local SQLite file through Spark's JDBC source.
df = (
    spark.read.format('jdbc')
    .options(
        driver='org.sqlite.JDBC',
        dbtable='posts',
        url='jdbc:sqlite:telegram_data.db',
    )
    .load()
)

#### Removing extra characters

In [None]:
# Strip everything that is not a word character, a Cyrillic letter or whitespace.
# 'ё' and 'Ё' are listed explicitly: they lie outside the contiguous а-я / А-Я
# code point ranges, so the previous pattern removed them from Russian words.
df = df.withColumn(
    'post_text',
    F.regexp_replace(df.post_text, '[^\\wа-яА-ЯёЁ\\s]', ''),
)

#### Collecting documents from the text of posts

In [None]:
# First pipeline stage: reads the `post_text` column and writes the `document` column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("post_text")
    .setOutputCol("document")
)

#### Converting documents to tokens

In [None]:
# Tokenizer stage: consumes the `document` column and emits the `token` column.
spark_nlp_tokenizer = (
    SparkNLPTokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

#### Removing stop words

In [None]:
# Pretrained "stopwords_ru" cleaner for language "ru": filters the `token`
# column into `cleanTokens`.
stop_words = (
    StopWordsCleaner.pretrained("stopwords_ru", "ru")
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
)

#### Lemmatizing tokens

In [None]:
# Pretrained "lemma" model for language "ru": maps `cleanTokens` to the `lemma` column.
lemmatizer = (
    LemmatizerModel.pretrained("lemma", "ru")
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

#### Collect final results into an array

In [None]:
# Final stage: converts the `lemma` annotations to plain values, without metadata.
# NOTE(review): the Word2Vec cell reads `finished_lemma`, presumably this
# stage's default output column — confirm.
finisher = (
    Finisher()
    .setInputCols(["lemma"])
    .setIncludeMetadata(False)
)

#### Create a pipeline for data processing and apply it to the data

In [None]:
# Assemble the stages in the order their columns depend on each other, then fit
# and apply the pipeline on the same DataFrame.
pipeline = Pipeline(
    stages=[
        documentAssembler,
        spark_nlp_tokenizer,
        stop_words,
        lemmatizer,
        finisher,
    ]
)
result = pipeline.fit(df).transform(df)

#### Set up the Word2Vec model for vectorization and apply it to data

In [None]:
# Train Word2Vec on the lemma arrays in `finished_lemma` (100-dimensional
# vectors, words seen fewer than 5 times are ignored) and append the `vector` column.
word2Vec = Word2Vec(vectorSize=100, minCount=5, inputCol="finished_lemma", outputCol="vector")
# Fixed: a stray trailing "s" after transform(result) made this cell a SyntaxError.
result = word2Vec.fit(result).transform(result)

#### Displaying the results

In [None]:
# Print a bounded preview of the vectors. collect() on the full DataFrame pulls
# every row to the driver and floods the cell output; take(20) fetches only the
# first 20 rows while keeping the same per-row print format.
for row in result.select('vector').take(20):
    print(row)

## Open Spark UI

#### Downloading Spark to the environment

In [1]:
!wget https://downloads.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar -xvzf spark-3.5.0-bin-hadoop3.tgz > /dev/null

--2023-11-24 17:07:38--  https://downloads.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
Resolving downloads.apache.org (downloads.apache.org)... 88.99.95.219, 135.181.214.104, 2a01:4f9:3a:2c57::2, ...
Connecting to downloads.apache.org (downloads.apache.org)|88.99.95.219|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400395283 (382M) [application/x-gzip]
Saving to: ‘spark-3.5.0-bin-hadoop3.tgz’


2023-11-24 17:07:53 (27.2 MB/s) - ‘spark-3.5.0-bin-hadoop3.tgz’ saved [400395283/400395283]



#### Setting up the environment

In [2]:
import os

# Point the process environment at the JDK and at the unpacked Spark distribution.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.5.0-bin-hadoop3",
    }
)

#### Install and run findspark

In [3]:
!pip install -q findspark
import findspark
findspark.init()

#### Import PySpark

In [4]:
import pyspark
from pyspark.conf import SparkConf
from pyspark import SparkContext

#### Initiate Spark environment

In [5]:
# Serve the Spark UI on port 4050 instead of the default port.
conf = SparkConf().setAppName("DataFrame").set('spark.ui.port', '4050')
# getOrCreate makes the cell safe to re-run: constructing SparkContext(conf=conf)
# a second time raises because only one context may be active per process.
# Caveat: if a context already exists it is reused as-is, so the UI port above
# only takes effect when this cell creates the context.
# Note that `spark` is a SparkContext here, not a SparkSession.
spark = SparkContext.getOrCreate(conf=conf)

#### Grabbing the ngrok application and assigning the auth token

In [None]:
!wget https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz
!tar -xvzf ngrok-v3-stable-linux-amd64.tgz
!./ngrok authtoken 2YMQOAF2om5z0WgkKb2Fm84ZEQd_3uuoD3Bfc5K6DNMcjcg5U

#### Starting an agent and connecting Spark UI to ngrok

In [7]:
# Start the ngrok agent as a background shell job ("&") that exposes local
# port 4050 — the value assigned to spark.ui.port when the context was configured.
get_ipython().system_raw('./ngrok http 4050 &')

#### Showing a direct link to the Spark UI

In [8]:
# Query the tunnel list served on localhost:4040 and print only the https
# public URL: grep -P with \K drops the matched prefix, -o prints just the match.
# NOTE(review): port 4040 is presumably the ngrok agent's local API, not the
# Spark UI (which was moved to 4050) — confirm if the agent config changes.
!curl -s http://localhost:4040/api/tunnels | grep -Po 'public_url":"(?=https)\K[^"]*'

https://ef0d-35-188-101-219.ngrok-free.app
