In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
from datetime import datetime, timedelta
from mysql.connector import connect, Error
from omegaconf import OmegaConf
import sqlalchemy
import pandas as pd
import boto3
from dotenv import load_dotenv

sys.path.append('../src')

In [None]:
# Populate the process environment (presumably from a local .env file — confirm its location)
# so that the os.getenv() lookups further down can find the MySQL / S3 settings.
load_dotenv()

In [None]:
# Load the project configuration from the YAML file one directory above the notebook.
conf = OmegaConf.load('../config.yaml')

In [None]:
# Pull connection settings and S3 keys out of the environment.
# A variable that is not set yields None rather than raising.
_environment = os.environ

# MySQL connection parameters
mysql_host = _environment.get("MYSQL_HOST")
mysql_user = _environment.get("MYSQL_USER")
mysql_password = _environment.get("MYSQL_PASSWORD")

# S3 credentials
s3_access_key_id = _environment.get("S3_ACCESS_KEY_ID")
s3_secret_key = _environment.get("S3_SECRET_KEY")

Columns of the `telegram_news` table: `id`, `channel_id`, `channel_name`, `date`, `text`

In [None]:
# CONNECTION TO MYSQL SERVER
# Smoke test: open a server-level connection (no database selected) and print it.
# Credentials come from the environment variables read earlier in the notebook,
# the same source the SELECT cell further down uses, rather than from `conf`.
try:
    with connect(
        host=mysql_host,
        user=mysql_user,
        password=mysql_password,
    ) as connection:
        print(connection)
except Error as e:
    # Connection problems are reported instead of aborting the notebook.
    print(e)

In [None]:
# CREATE DATABASE
# IF NOT EXISTS makes the cell safe to re-run: without it a second execution
# fails with "database exists".
db_query = "CREATE DATABASE IF NOT EXISTS gpb_news_external"
try:
    # Credentials come from the environment variables read earlier in the notebook,
    # the same source the SELECT cell further down uses, rather than from `conf`.
    with connect(
        host=mysql_host,
        user=mysql_user,
        password=mysql_password,
    ) as connection:
        with connection.cursor() as cursor:
            cursor.execute(db_query)
except Error as e:
    print(e)

In [None]:
# SHOW DATABASES
# List every database visible to the configured MySQL user, one result row per line.
query = "SHOW DATABASES"
try:
    # Credentials come from the environment variables read earlier in the notebook,
    # the same source the SELECT cell further down uses, rather than from `conf`.
    with connect(
        host=mysql_host,
        user=mysql_user,
        password=mysql_password,
    ) as connection:
        with connection.cursor() as cursor:
            cursor.execute(query)
            for db in cursor:
                print(db)
except Error as e:
    print(e)

In [None]:
# CONNECTION TO EXISTING DATABASE
# Smoke test: connect with a database selected and print the connection object.
# Host/user/password come from the environment variables read earlier, and the
# database name from `conf.db.database`, matching the SELECT cell further down.
try:
    with connect(
        host=mysql_host,
        user=mysql_user,
        password=mysql_password,
        database=conf.db.database,
    ) as connection:
        print(connection)
except Error as e:
    print(e)

In [None]:
# Create the telegram_news table.
# IF NOT EXISTS makes the cell safe to re-run: without it a second execution
# fails with "table already exists".
query = \
"""
CREATE TABLE IF NOT EXISTS telegram_news(
        id BIGINT,
        channel_id BIGINT,
        channel_name VARCHAR(100),
        date VARCHAR(100),
        text TEXT
)
"""
try:
    # Host/user/password come from the environment variables read earlier, and the
    # database name from `conf.db.database`, matching the SELECT cell further down.
    with connect(
        host=mysql_host,
        user=mysql_user,
        password=mysql_password,
        database=conf.db.database,
    ) as connection:
        with connection.cursor() as cursor:
            cursor.execute(query)
            connection.commit()  # after any changes to table
except Error as e:
    print(e)

In [None]:
# Print the column definitions of telegram_news, one row per column.
show_table_query = "DESCRIBE telegram_news"
try:
    # Host/user/password come from the environment variables read earlier, and the
    # database name from `conf.db.database`, matching the SELECT cell further down.
    with connect(
        host=mysql_host,
        user=mysql_user,
        password=mysql_password,
        database=conf.db.database,
    ) as connection:
        with connection.cursor() as cursor:
            cursor.execute(show_table_query)
            for row in cursor.fetchall():
                print(row)
except Error as e:
    print(e)

In [None]:
# Read the exported channel dump into a DataFrame and preview the first rows.
bbcrussian_json_path = '../output/bbcrussian.json'
df = pd.read_json(bbcrussian_json_path)
df.head()

In [None]:
# Append the rows of `df` to the telegram_news table through SQLAlchemy.
# NOTE: with if_exists='append', re-running this cell inserts the same rows again.

# User name and password are set on the URL object rather than interpolated into
# the string, so special characters (e.g. '@', '/', ':') are escaped correctly.
# Host/user/password come from the environment variables read earlier, and the
# database name from `conf.db.database`, matching the SELECT cell further down.
db_url = sqlalchemy.engine.make_url(
    f'mysql+mysqlconnector://{mysql_host}/{conf.db.database}'
).set(username=mysql_user, password=mysql_password)
engine = sqlalchemy.create_engine(db_url)
try:
    df.to_sql(con=engine, name='telegram_news', if_exists='append', index=False)
except sqlalchemy.exc.SQLAlchemyError as e:
    # SQLAlchemy raises its own exception hierarchy (wrapping driver errors), so
    # catching mysql.connector's Error here would never match.
    print(e)
finally:
    # Release the pooled connections held by the engine.
    engine.dispose()

In [81]:
# Read the whole telegram_news table into `query_df`.
query = "SELECT * FROM telegram_news"

# User name and password are set on the URL object rather than interpolated into
# the string, so special characters (e.g. '@', '/', ':') are escaped correctly.
db_url = sqlalchemy.engine.make_url(
    f'mysql+mysqlconnector://{mysql_host}/{conf.db.database}'
).set(username=mysql_user, password=mysql_password)
engine = sqlalchemy.create_engine(db_url)
try:
    query_df = pd.read_sql(query, engine)
except sqlalchemy.exc.SQLAlchemyError as e:
    # SQLAlchemy raises its own exception hierarchy (wrapping driver errors), so
    # catching mysql.connector's Error here would never match.
    print(e)
    # Re-raise: later cells depend on `query_df`, and continuing would leave it
    # undefined (or stale from a previous run).
    raise
finally:
    # Release the pooled connections held by the engine.
    engine.dispose()

In [82]:
# Number of stored messages per channel, most frequent first.
query_df["channel_name"].value_counts()

rian_ru          87
meduzalive       49
bbcrussian       43
currenttime      26
varlamov_news    12
nytimes           2
Name: channel_name, dtype: int64

In [89]:
# Display the query result; the notebook renders a truncated view (first and last rows).
query_df

Unnamed: 0,rownum,message_id,channel_id,channel_name,channel_url,date,text,views,forwards,found_urls,report_dttm
0,1,40591,1003921752,bbcrussian,https://t.me/bbcrussian,2023-01-19 21:30:34,Украинская Рада рассмотрит поправки к закону о...,99933,27,,2023-01-20 14:52:16
1,2,40590,1003921752,bbcrussian,https://t.me/bbcrussian,2023-01-19 21:15:16,"Фото ЗРПК ""Панцирь"" на минобороны и еще одном ...",98120,892,https://twitter.com/michaelh992/status/1616111...,2023-01-20 14:52:16
2,3,40589,1003921752,bbcrussian,https://t.me/bbcrussian,2023-01-19 20:37:03,"""Режим молчания свою работу выполнил"": Пригожи...",96633,164,,2023-01-20 14:52:16
3,4,40588,1003921752,bbcrussian,https://t.me/bbcrussian,2023-01-19 20:10:17,Молдовский блогер Некоглай пообещал закупить д...,95827,383,https://www.bbc.com/russian/news-63572024,2023-01-20 14:52:16
4,5,40587,1003921752,bbcrussian,https://t.me/bbcrussian,2023-01-19 19:50:14,Владимир Зеленский и Шарль Мишель: Украине нуж...,92098,13,,2023-01-20 14:52:16
...,...,...,...,...,...,...,...,...,...,...,...
214,215,20574,1073531022,currenttime,https://t.me/currenttime,2023-01-19 10:20:06,ФСБ России сообщила о возбуждении дела о шпион...,38943,19,http://www.fsb.ru/fsb/press/message/single.htm...,2023-01-20 14:52:19
215,216,20573,1073531022,currenttime,https://t.me/currenttime,2023-01-19 09:57:27,Сегодня 330-й день полномасштабного российског...,39462,14,https://youtu.be/8tCvepIHvME,2023-01-20 14:52:19
216,217,20572,1073531022,currenttime,https://t.me/currenttime,2023-01-19 09:12:56,Не менее 15 военнослужащих погибли в результат...,45194,19,https://rferl.link/g5D4,2023-01-20 14:52:19
217,218,20571,1073531022,currenttime,https://t.me/currenttime,2023-01-19 08:40:04,В ожидании кувалды Вагнера\n\nДоброе утро!\n\n...,42076,50,https://telegra.ph/V-ozhidanii-kuvaldy-Vagnera...,2023-01-20 14:52:19
