#### Uploading the .db file

In [None]:
from google.colab import files
uploaded = files.upload()

#### Downloading the driver file

In [None]:
!wget https://repo1.maven.org/maven2/org/xerial/sqlite-jdbc/3.34.0/sqlite-jdbc-3.34.0.jar
!wget https://repo1.maven.org/maven2/com/johnsnowlabs/nlp/spark-nlp_2.12/5.1.4/spark-nlp_2.12-5.1.4.jar
!wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.900/aws-java-sdk-bundle-1.11.900.jar

#### Installing the necessary libraries

In [None]:
!pip install pyspark
!pip install spark-nlp==5.1.4

#### Import of necessary libraries

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import Word2Vec
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import LemmatizerModel, Tokenizer as SparkNLPTokenizer, StopWordsCleaner

#### Creating a spark session

In [5]:
spark = SparkSession.builder \
 .config('spark.jars', './*') \
 .getOrCreate()

#### Loading the post table in dataframe

In [6]:
df = spark.read.format('jdbc') \
 .options(driver='org.sqlite.JDBC', dbtable='posts',
       url='jdbc:sqlite:telegram_data.db') \
 .load()

#### Removing extra characters

In [7]:
df = df.withColumn('post_text', F.regexp_replace(df.post_text, '[^\\wа-яА-Я\\s]', ''))

#### Collecting documents from the text of posts

In [8]:
documentAssembler = DocumentAssembler() \
 .setInputCol("post_text") \
 .setOutputCol("document")

#### Converting documents to tokens

In [9]:
spark_nlp_tokenizer = SparkNLPTokenizer() \
 .setInputCols(["document"]) \
 .setOutputCol("token")

#### Removing stop words

In [10]:
stop_words = StopWordsCleaner.pretrained("stopwords_ru", "ru") \
 .setInputCols(["token"]) \
 .setOutputCol("cleanTokens")

stopwords_ru download started this may take some time.
Approximate size to download 2.9 KB
[OK!]


#### Lemmatizing tokens

In [11]:
lemmatizer = LemmatizerModel.pretrained("lemma", "ru") \
 .setInputCols(["cleanTokens"]) \
 .setOutputCol("lemma")

lemma download started this may take some time.
Approximate size to download 1.3 MB
[OK!]


#### Collect final results into an array

In [12]:
finisher = Finisher() \
 .setInputCols(["lemma"]) \
 .setIncludeMetadata(False)

#### Create a pipeline for data processing and apply it to the data

In [13]:
pipeline = Pipeline(stages=[documentAssembler, spark_nlp_tokenizer, stop_words, lemmatizer, finisher])
result = pipeline.fit(df).transform(df)

#### Set up the Word2Vec model for vectorization and apply it to data

In [14]:
word2Vec = Word2Vec(vectorSize=100, minCount=5, inputCol="finished_lemma", outputCol="vector")
result = word2Vec.fit(result).transform(result)

#### Conclusion results

In [16]:
for row in result.select('vector').collect():
 print(row)

Row(vector=DenseVector([0.0003, -0.0002, 0.0002, -0.0001, 0.0003, -0.0002, 0.0, -0.0001, 0.0001, 0.0001, -0.0001, -0.0001, 0.0001, -0.0002, -0.0002, -0.0003, 0.0, 0.0004, 0.0004, -0.0005, -0.0001, 0.0002, -0.0001, -0.0004, 0.0001, -0.0001, -0.0005, -0.0003, 0.0, 0.0002, -0.0002, 0.0005, -0.0005, 0.0002, 0.0001, 0.0001, -0.0002, 0.0003, -0.0002, 0.0002, -0.0, -0.0001, 0.0, 0.0002, -0.0001, 0.0, 0.0005, 0.0003, -0.0003, -0.0001, 0.0002, -0.0004, 0.0001, 0.0002, 0.0005, -0.0002, 0.0, 0.0003, -0.0002, -0.0003, -0.0002, -0.0004, -0.0003, 0.0001, 0.0005, 0.0002, 0.0003, 0.0004, -0.0001, 0.0002, -0.0002, -0.0, 0.0003, 0.0, -0.0001, -0.0002, 0.0001, 0.0, -0.0002, 0.0003, -0.0005, 0.0005, -0.0003, -0.0, 0.0, 0.0003, -0.0001, -0.0001, 0.0001, -0.0001, -0.0004, -0.0001, -0.0004, 0.0003, -0.0, -0.0003, -0.0003, -0.0002, 0.0, 0.0002]))
Row(vector=DenseVector([0.0003, -0.0001, 0.0, 0.0001, 0.0001, 0.0001, -0.0003, 0.0002, -0.0001, 0.0003, 0.0001, 0.0001, 0.0005, 0.0002, 0.0002, 0.0003, 0.0005, -0.00