#### Uploading the .db file

In [1]:
# Colab-only helper: files.upload() asks the user to pick local files to upload.
# The recorded cell output shows telegram_data.db being saved under that same name,
# which is the file the JDBC URL further down points at.
from google.colab import files
uploaded = files.upload()

Saving telegram_data.db to telegram_data.db


#### Downloading the JDBC driver and the other required JAR files

In [None]:
!wget https://repo1.maven.org/maven2/org/xerial/sqlite-jdbc/3.34.0/sqlite-jdbc-3.34.0.jar
!wget https://repo1.maven.org/maven2/com/johnsnowlabs/nlp/spark-nlp_2.12/5.1.4/spark-nlp_2.12-5.1.4.jar
!wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.900/aws-java-sdk-bundle-1.11.900.jar

#### Installing the necessary libraries

In [None]:
!pip install pyspark
!pip install spark-nlp==5.1.4

#### Importing the necessary libraries

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import LemmatizerModel, Tokenizer as SparkNLPTokenizer, StopWordsCleaner

#### Creating a spark session

In [5]:
# Start (or reuse) a Spark session with the JARs from the working directory on the classpath.
# The glob is restricted to *.jar so that other files in the working directory
# (for example the uploaded telegram_data.db) are not handed to Spark as if they were jars.
spark = (
    SparkSession.builder
    .config('spark.jars', './*.jar')
    .getOrCreate()
)

#### Loading the `posts` table into a DataFrame

In [6]:
# Read the `posts` table from the local SQLite file through Spark's JDBC data source.
df = (
    spark.read
    .format('jdbc')
    .option('driver', 'org.sqlite.JDBC')
    .option('dbtable', 'posts')
    .option('url', 'jdbc:sqlite:telegram_data.db')
    .load()
)

#### Removing extra characters

In [7]:
# Strip every character that is not a word character, a Cyrillic letter or whitespace.
# 'ё' and 'Ё' are listed explicitly: they lie outside the а-я / А-Я code-point ranges
# (and \w only covers ASCII in Java regular expressions by default), so without them
# these letters would be deleted from the middle of Russian words.
df = df.withColumn(
    'post_text',
    F.regexp_replace(F.col('post_text'), '[^\\wа-яА-ЯёЁ\\s]', ''),
)

#### Collecting documents from the text of posts

In [8]:
# First pipeline stage: wrap the cleaned `post_text` column into the "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("post_text")
    .setOutputCol("document")
)

#### Converting documents to tokens

In [9]:
# Tokenizer stage: reads the "document" column and writes its output to "token".
spark_nlp_tokenizer = (
    SparkNLPTokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

#### Removing stop words

In [None]:
# Stop-word stage: load the pretrained "stopwords_ru" cleaner for language "ru";
# it reads "token" and writes the result to "cleanTokens".
stop_words = (
    StopWordsCleaner.pretrained("stopwords_ru", "ru")
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
)

#### Lemmatizing tokens

In [None]:
# Lemmatizer stage: load the pretrained "lemma" model for language "ru";
# it reads "cleanTokens" and writes the result to "lemma".
lemmatizer = (
    LemmatizerModel.pretrained("lemma", "ru")
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

#### Collect final results into an array

In [12]:
# Finisher stage: takes the "lemma" column as input, with metadata output disabled.
# The final cell reads the result from the `finished_lemma` column.
finisher = (
    Finisher()
    .setInputCols(["lemma"])
    .setIncludeMetadata(False)
)

#### Create a pipeline for data processing

In [13]:
# Chain the stages in processing order: document -> tokens -> stop-word removal
# -> lemmas -> finished output.
pipeline = Pipeline(
    stages=[
        documentAssembler,
        spark_nlp_tokenizer,
        stop_words,
        lemmatizer,
        finisher,
    ]
)

#### Apply the pipeline to the data

In [14]:
# Fit the pipeline on the posts DataFrame, then transform that same DataFrame with it.
result = (
    pipeline
    .fit(df)
    .transform(df)
)

#### Displaying the results

In [None]:
# Print a bounded preview instead of collecting the whole result set: collect() on the
# full DataFrame pulls every row into driver memory and floods the cell output.
# Raise PREVIEW_ROWS to inspect more rows.
PREVIEW_ROWS = 20
for row in result.select('finished_lemma').take(PREVIEW_ROWS):
    print(row)