<h2>Получаем данные со стены группы VK</h2>

In [25]:
import csv # для сохранения информации в CSV
import httpx # для запроса
import pandas as pd # для визуализации результатов
import pydantic # для работы с результатами парсинга 
import random # для рандомизации временного интервала
import time # для задержки между запросами

In [None]:
from config import access_token, owner_id # настройки для запроса
from datetime import datetime as dt # для перевода даты из timestamp
from glom import glom # для безопасного получения значений словаря

In [26]:
def getjson(url, data=None):
    response = httpx.get(url, params=data)
    return response.json()

In [27]:
def get_all_posts(access_token, owner_id, count=100, offset=0):
    """takes access_token, owner_id (group_id), count(default=100), offset(default=0)
    and returns all posts from vk group in a list of dictionaries
    and the number of posts in second variable"""
    
    all_posts = []
    while True:
        time.sleep(random.random())
        wall = getjson("https://api.vk.com/method/wall.get", {
            "owner_id" : owner_id,
            "count": count,
            "access_token": access_token,
            "offset": offset,
            "v": '5.131'
        })
        count_posts = wall['response']['count']
        posts = wall['response']['items']

        all_posts.extend(posts)

        if len(all_posts) >= count_posts:
            break
        else:
            offset += 100

    return all_posts, count_posts

In [28]:
def make_posts(all_posts):
    """Takes in a list of dictionaries with posts, converts the data
    in a new structure, returns a new list of dictionaries with the posts"""
    filtered_data = []
    for post in all_posts:
        try:
            link = 'https://vk.com/wall-{owner_id}_{id}'.format(owner_id=owner_id[1:], id=id)
        except:
            link = ''
        try:
            date = dt.fromtimestamp(int(post['date'])).strftime('%d-%m-%Y %H:%M:%S')
        except:
            date = ''
        
        id_ = glom(post,'id',default=None)
        timestamp = glom(post,'date',default=None)
        likes = glom(post, 'likes.count', default=None)
        reposts = glom(post, 'reposts.count', default=None)
        comments = glom(post, 'comments.count', default=None)
        views = glom(post, 'views.count', default=None)
        text = glom(post, 'text', default=None)
        

        all_attachments = []
        try:
            attachments = post['attachments']
            if attachments:
                for att in attachments:
                    if att['type'] == 'video':
                        video_title = att['video']['title']
                        all_attachments.append(video_title)
                    if att['type'] == 'photo':
                        photo = att['photo']['text']
                        all_attachments.append(photo)
        except:
            attachments = ''

        filtered_post = {
            'id': id_,
            'date': date,
            'timestamp': timestamp,
            'likes': likes,
            'reposts': reposts,
            'comments': comments,
            'views': views,
            'text': text,
            'attachments': all_attachments,
            'link': link,
        }
        filtered_data.append(filtered_post)
    
    return filtered_data

In [29]:
def write_csv(data, filename, encoding='utf-8'):
    """Recives data as a list of dictionaries, the file name as a
    string ('*.csv'), encoding(default='utf-8'), returns csv file"""
    with open(filename, 'w', newline='', encoding=encoding) as csvfile:
        fieldnames = ['id', 'date', 'timestamp', 'likes', 'reposts',
                      'views', 'comments', 'text', 'attachments', 'link']

        writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=fieldnames)

        writer.writeheader()

        writer.writerows(data)

        print('Data written to csv', filename)
    csvfile.close()

In [30]:
all_posts, count_posts = get_all_posts(access_token, owner_id)
pposts = make_posts(all_posts)
write_csv(pposts,'{owner_id}-{datetime}.csv'.format(owner_id=owner_id[1:], datetime=str(dt.now())[:10]))

Data written to csv 113913953-2021-10-01.csv


In [33]:
nice_posts = pd.read_csv(open('113913953-2021-10-01.csv', 'r', encoding='utf-8'),sep=';')
nice_posts

Unnamed: 0,id,date,timestamp,likes,reposts,views,comments,text,attachments,link
0,233,11-08-2017 16:21:05,1502457665,34,4,3009.0,3,#Python #Программирование #ProgrammingHub \nПр...,[],https://vk.com/wall-113913953_<built-in functi...
1,622,29-09-2021 21:21:31,1632939691,0,0,,0,test,[],https://vk.com/wall-113913953_<built-in functi...
2,621,29-09-2021 21:14:20,1632939260,0,0,,0,test,[],https://vk.com/wall-113913953_<built-in functi...
3,620,29-09-2021 21:05:49,1632938749,0,0,,0,еуые,[],https://vk.com/wall-113913953_<built-in functi...
4,619,29-09-2021 20:59:50,1632938390,0,0,,0,еуые,[],https://vk.com/wall-113913953_<built-in functi...
...,...,...,...,...,...,...,...,...,...,...
251,9,06-02-2016 13:17:46,1454753866,8,0,,0,,"['Книга ""Программирование на Python 3"" написан...",https://vk.com/wall-113913953_<built-in functi...
252,7,06-02-2016 12:38:46,1454751526,10,0,,0,,['Программирование на Python. Том 2. 4-е издан...,https://vk.com/wall-113913953_<built-in functi...
253,6,06-02-2016 12:37:49,1454751469,12,0,,0,,['Программирование на Python. Том 1. 4-е издан...,https://vk.com/wall-113913953_<built-in functi...
254,3,06-02-2016 12:07:49,1454749669,15,4,,2,Из серии книг по программированию на языке Pyt...,[''],https://vk.com/wall-113913953_<built-in functi...
