Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scrape images, video, and post forwarding information for Telegram #413

Open
wants to merge 30 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
72b26f2
Scrape images, video, and post forwarding information for Telegram ch…
loganwilliams Oct 15, 2020
de4ebed
Fix KeyError caused by retweets without URLs in TwitterProfileScraper
loganwilliams Feb 24, 2022
b8efce2
Clean up unnecessary imports
loganwilliams Mar 8, 2022
ed82916
added capability to extract the number of channel members when the th…
trislee Mar 29, 2022
fb8d73a
handled case where channel has no profile image
trislee Mar 29, 2022
d32c9ad
added capability to scrape multiple videos from a single post
trislee Mar 30, 2022
a7eb54d
implemented Media dataclasses for Telegram, and added variable for ex…
trislee Mar 31, 2022
4e59638
added a forwardedUrl attribute to TelegramPost and made forwarded att…
trislee Mar 31, 2022
2ce014a
fixed edge case for videos that have data-link-attr but no href attri…
trislee Apr 3, 2022
f978954
Merge branch 'JustAnotherArchivist:master' into master
trislee Apr 3, 2022
babcddd
made Telegram scraper not return full channel info for forwarded_from…
trislee Apr 17, 2022
1e4e0c2
fixed issue where Telegram scraper terminated early because some page…
trislee Apr 17, 2022
b276c3c
fixed issue where some videos and photos weren't being scraped (becau…
trislee Apr 17, 2022
97d38e5
added additional termination criteria to Telegram scraper
trislee Apr 21, 2022
9b3faec
added additional attributes for hashtags and user mentions, removed r…
trislee Apr 21, 2022
21f7b62
moved forward finding out of tgme_widget_message_text clause, since i…
trislee Apr 21, 2022
5648e95
improved consistency of code formatting and added _STYLE_MEDIA_URL_PA…
trislee Apr 27, 2022
c18ca0f
Merge branch 'master' into telegram-media
trislee May 9, 2022
0a4bd39
Merge pull request #2 from bellingcat/telegram-media
trislee May 9, 2022
f385135
fixed merge conflicts
trislee May 9, 2022
b13e62e
Merge branch 'JustAnotherArchivist-master'
trislee May 9, 2022
e2d9223
forgot to save modified twitter.py module
trislee May 9, 2022
0822a9c
Merge pull request #4 from JustAnotherArchivist/master
trislee May 25, 2022
07a5f6f
merged master into more-tg-info to update upstream PR
trislee May 25, 2022
65723f1
fixed merge
trislee May 25, 2022
56e4232
fixed typo
trislee Jun 23, 2022
056cd62
incorporated requested changes from maintainer, removed modifications…
trislee Jun 23, 2022
73f10a4
fixed edge case where channel with no members fails _get_entity
trislee Jul 5, 2022
cbdfeed
fixed edge case where members information wasn't included
trislee Nov 30, 2022
cacd783
merged upstream changes
trislee Apr 4, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
239 changes: 188 additions & 51 deletions snscrape/modules/telegram.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,9 @@
import typing
import urllib.parse


_logger = logging.getLogger(__name__)
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')

_STYLE_MEDIA_URL_PATTERN = re.compile(r'url\(\'(.*?)\'\)')

@dataclasses.dataclass
class LinkPreview:
Expand All @@ -24,26 +23,12 @@ class LinkPreview:
image: typing.Optional[str] = None


@dataclasses.dataclass
class TelegramPost(snscrape.base.Item):
url: str
date: datetime.datetime
content: str
outlinks: list
linkPreview: typing.Optional[LinkPreview] = None

outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')

def __str__(self):
return self.url


@dataclasses.dataclass
class Channel(snscrape.base.Entity):
username: str
title: str
verified: bool
photo: str
title: typing.Optional[str] = None
verified: typing.Optional[bool] = None
photo: typing.Optional[str] = None
description: typing.Optional[str] = None
members: typing.Optional[int] = None
photos: typing.Optional[snscrape.base.IntWithGranularity] = None
Expand All @@ -60,6 +45,55 @@ def __str__(self):
return f'https://t.me/s/{self.username}'


@dataclasses.dataclass
class TelegramPost(snscrape.base.Item):
    '''A single post scraped from a channel's public post list.

    The list-valued fields are None rather than an empty list when the scraper found nothing for them.
    '''

    url: str
    date: datetime.datetime
    # None when the post has no message text element.
    content: typing.Optional[str]
    outlinks: typing.Optional[typing.List[str]] = None
    # Link texts starting with '@' and '#', with those characters stripped.
    mentions: typing.Optional[typing.List[str]] = None
    hashtags: typing.Optional[typing.List[str]] = None
    # Channel the post was forwarded from (only its username is filled in) and the URL of the forwarded-from link.
    forwarded: typing.Optional['Channel'] = None
    forwardedUrl: typing.Optional[str] = None
    media: typing.Optional[typing.List['Medium']] = None
    # NOTE(review): the scraper appears to assign the raw (value, granularity) tuple from _parse_num here — confirm whether it should be wrapped in IntWithGranularity.
    views: typing.Optional[snscrape.base.IntWithGranularity] = None
    linkPreview: typing.Optional[LinkPreview] = None

    outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')

    def __str__(self):
        return self.url


class Medium:
    '''Common base class for the media items that can be attached to a post.'''


@dataclasses.dataclass
class Photo(Medium):
    '''A photo attached to a post.'''

    # Image URL taken from the url('...') value in the link's style attribute.
    url: str


@dataclasses.dataclass
class Video(Medium):
    '''A video attached to a post (a video player element that has a duration).'''

    # None when the video player element has no <i> tag to take the thumbnail from.
    thumbnailUrl: typing.Optional[str]
    # Length in seconds, parsed from the player's time text.
    duration: float
    # None when the player has no <video> tag.
    url: typing.Optional[str] = None


@dataclasses.dataclass
class VoiceMessage(Medium):
    '''A voice message attached to a post.'''

    # Source URL of the player's <audio> element.
    url: str
    # Length in whole seconds; the scraper converts the player's time text before constructing this object.
    duration: int
    # Bar values parsed from the style attributes of the player's waveform elements.
    bars: typing.List[float]


@dataclasses.dataclass
class Gif(Medium):
    '''A GIF-like video attached to a post (a video player element without a duration).'''

    # None when the video player element has no <i> tag to take the thumbnail from.
    thumbnailUrl: typing.Optional[str]
    # None when the player has no <video> tag.
    url: typing.Optional[str] = None


class TelegramChannelScraper(snscrape.base.Scraper):
name = 'telegram-channel'

Expand Down Expand Up @@ -90,25 +124,85 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
_logger.warning(f'Possibly incorrect URL: {rawUrl!r}')
url = rawUrl.replace('//t.me/', '//t.me/s/')
date = datetime.datetime.strptime(dateDiv.find('time', datetime = True)['datetime'].replace('-', '', 2).replace(':', ''), '%Y%m%dT%H%M%S%z')
media = []
outlinks = []
mentions = []
hashtags = []
forwarded = None
forwardedUrl = None

if (forwardTag := post.find('a', class_ = 'tgme_widget_message_forwarded_from_name')):
forwardedUrl = forwardTag['href']
forwardedName = forwardedUrl.split('t.me/')[1].split('/')[0]
forwarded = Channel(username = forwardedName)

if (message := post.find('div', class_ = 'tgme_widget_message_text')):
content = message.text
outlinks = []
for link in post.find_all('a'):
if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
# Author links at the top (avatar and name)
continue
if link['href'] == rawUrl or link['href'] == url:
# Generic filter of links to the post itself, catches videos, photos, and the date link
continue
if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']):
# Individual photo or video link
continue
href = urllib.parse.urljoin(pageUrl, link['href'])
if href not in outlinks:
outlinks.append(href)
content = message.get_text(separator="\n")
else:
content = None
outlinks = []

for link in post.find_all('a'):
if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
# Author links at the top (avatar and name)
continue
if link['href'] == rawUrl or link['href'] == url:
style = link.attrs.get('style', '')
# Generic filter of links to the post itself, catches videos, photos, and the date link
if style != '':
imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
if len(imageUrls) == 1:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there any examples with more than one match (here or a few lines below)?

media.append(Photo(url = imageUrls[0]))
continue
if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']):
style = link.attrs.get('style', '')
imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
if len(imageUrls) == 1:
media.append(Photo(url = imageUrls[0]))
# resp = self._get(image[0])
# encoded_string = base64.b64encode(resp.content)
# Individual photo or video link
continue
if link.text.startswith('@'):
mentions.append(link.text.strip('@'))
continue
if link.text.startswith('#'):
hashtags.append(link.text.strip('#'))
continue
href = urllib.parse.urljoin(pageUrl, link['href'])
if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl):
outlinks.append(href)

for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
audioUrl = voicePlayer.find('audio')['src']
durationStr = voicePlayer.find('time').text
duration = _durationStrToSeconds(durationStr)
barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]

media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights))

for videoPlayer in post.find_all('a', {'class': 'tgme_widget_message_video_player'}):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because the extraction of images and videos is done separately, the order is not preserved. For example, https://t.me/s/nexta_live/43102 has video 1 (without URL), image, video 2 (with URL), but the image gets listed first. I think that can be fixed by simply merging this loop (and also the one for the voice player extraction) into the general link loop above, since they're all a tags in the post div.

iTag = videoPlayer.find('i')
if iTag is None:
videoUrl = None
videoThumbnailUrl = None
else:
style = iTag['style']
videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0]
videoTag = videoPlayer.find('video')
videoUrl = None if videoTag is None else videoTag['src']
mKwargs = {
'thumbnailUrl': videoThumbnailUrl,
'url': videoUrl,
}
timeTag = videoPlayer.find('time')
if timeTag is None:
cls = Gif

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have some examples? I don't remember seeing fake-GIFs on Telegram before. (Also for the future test suite.)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Telegram doesn't have a policy on whether or not they're allowed, right? I don't think a real-GIF would ever inaccurately go down this path, so isn't it just making the logic more robust against change?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer erroring out on things the code doesn't actually understand and implement. It might be 'more robust' in some sense, but it can easily result in misparsing the data as well.

But if 'videos' without a time tag already exist similar to how it is on Twitter, this is totally fine. Hence why I'm asking for examples. :-)

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not an active Telegram user, so I don't think I'll be able to quickly come up with an example myself. @loganwilliams , do you remember running into a problem which required adding this line back when you implemented this?

On the other hand, what data misparsing are you imagining from this, @JustAnotherArchivist ? Especially if Twitter already has examples which require this behavior, what's the error mode that we'd want to call out by throwing here?

I'm hoping that merging this will get everyone off the fork, but am concerned that if we introduce new exceptions, it'll require more significant updates to existing workflows.

Edit: As a compromise, I'm adding a warning log to this in my PR. It won't stop execution, but will let the user know in case there's something actually wrong.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @john-osullivan. Thanks for your work pushing this forward. You can see an example of a GIF here: https://t.me/thisisatestchannel19451923/3

It sits in the same .tgme_widget_message_video_player element and lacks a duration.

else:
cls = Video
durationStr = videoPlayer.find('time').text
mKwargs['duration'] = _durationStrToSeconds(durationStr)
media.append(cls(**mKwargs))

linkPreview = None
if (linkPreviewA := post.find('a', class_ = 'tgme_widget_message_link_preview')):
kwargs = {}
Expand All @@ -125,20 +219,45 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
else:
_logger.warning(f'Could not process link preview image on {url}')
linkPreview = LinkPreview(**kwargs)
yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, linkPreview = linkPreview)
if kwargs['href'] in outlinks:
outlinks.remove(kwargs['href'])
Comment on lines +222 to +223

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer leaving the link preview href in outlinks as well, similar to how the Twitter scraper will have outlinks from link cards in outlinks.


viewsSpan = post.find('span', class_ = 'tgme_widget_message_views')
views = None if viewsSpan is None else _parse_num(viewsSpan.text)

outlinks = outlinks if outlinks else None
media = media if media else None
mentions = mentions if mentions else None
hashtags = hashtags if hashtags else None

yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, mentions = mentions, hashtags = hashtags, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)

def get_items(self):
r, soup = self._initial_page()
if '/s/' not in r.url:
_logger.warning('No public post list for this user')
return
nextPageUrl = ''
while True:
yield from self._soup_to_items(soup, r.url)
try:
if soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)['href'].split('/')[-1] == '1':
# if message 1 is the first message in the page, terminate scraping
break
except:
pass
Comment on lines +243 to +248

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bare except is awful and hides various exceptions that shouldn't be caught, such as ^C interrupts. This test should really be done without a try-except.

pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
if not pageLink:
break
# some pages are missing a "tme_messages_more" tag, causing early termination
if '=' not in nextPageUrl:
nextPageUrl = soup.find('link', attrs = {'rel': 'canonical'}, href = True)['href']
nextPostIndex = int(nextPageUrl.split('=')[-1]) - 20
if nextPostIndex > 20:
pageLink = {'href': nextPageUrl.split('=')[0] + f'={nextPostIndex}'}
else:
break
Comment on lines +251 to +258

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't this approach lead to duplicates in some cases? When a post includes multiple media, those get their own ID, so there'd be a gap in post IDs.
I feel like it'd be better to get the ID of the first post on the page and then use that as the before parameter.

nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
r = self._get(nextPageUrl, headers = self._headers)
r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = _telegramResponseOkCallback)
if r.status_code != 200:
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
soup = bs4.BeautifulSoup(r.text, 'lxml')
Expand All @@ -151,9 +270,18 @@ def _get_entity(self):
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
soup = bs4.BeautifulSoup(r.text, 'lxml')
membersDiv = soup.find('div', class_ = 'tgme_page_extra')
if membersDiv.text.endswith(' members'):
kwargs['members'] = int(membersDiv.text[:-8].replace(' ', ''))
kwargs['photo'] = soup.find('img', class_ = 'tgme_page_photo_image').attrs['src']
if membersDiv is not None:
if membersDiv.text.split(',')[0].endswith((' members', ' subscribers')):
membersStr = ''.join(membersDiv.text.split(',')[0].split(' ')[:-1])
if membersStr == 'no':
kwargs['members'] = 0
else:
kwargs['members'] = int(membersStr)
photoImg = soup.find('img', class_ = 'tgme_page_photo_image')
if photoImg is not None:
kwargs['photo'] = photoImg.attrs['src']
else:
kwargs['photo'] = None

r, soup = self._initial_page()
if '/s/' not in r.url: # Redirect on channels without public posts
Expand All @@ -174,17 +302,8 @@ def _get_entity(self):
if (descriptionDiv := channelInfoDiv.find('div', class_ = 'tgme_channel_info_description')):
kwargs['description'] = descriptionDiv.text

def parse_num(s):
s = s.replace(' ', '')
if s.endswith('M'):
return int(float(s[:-1]) * 1e6), 10 ** (6 if '.' not in s else 6 - len(s[:-1].split('.')[1]))
elif s.endswith('K'):
return int(float(s[:-1]) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].split('.')[1]))
else:
return int(s), 1

for div in channelInfoDiv.find_all('div', class_ = 'tgme_channel_info_counter'):
value, granularity = parse_num(div.find('span', class_ = 'counter_value').text)
value, granularity = _parse_num(div.find('span', class_ = 'counter_value').text)
type_ = div.find('span', class_ = 'counter_type').text
if type_ == 'members':
# Already extracted more accurately from /channel, skip
Expand All @@ -201,3 +320,21 @@ def _cli_setup_parser(cls, subparser):
@classmethod
def _cli_from_args(cls, args):
return cls._cli_construct(args, args.channel)

def _parse_num(s):
    '''Parse an abbreviated counter string such as '870', '1.5K' or '8.2M'.

    Spaces are removed before parsing. Returns a (value, granularity) tuple, where granularity is the
    magnitude of the last displayed digit (e.g. 100 for '1.5K', 1 for a plain integer).
    Raises ValueError if the numeric part cannot be parsed.
    '''
    s = s.replace(' ', '')
    for suffix, exponent in (('M', 6), ('K', 3)):
        if not s.endswith(suffix):
            continue
        whole, _, fraction = s[:-1].partition('.')
        if not (whole or fraction):
            raise ValueError(f'No digits in abbreviated number: {s!r}')
        # Shift the decimal point textually instead of multiplying a float:
        # int(float('8.2') * 1e6) evaluates to 8199999 because of binary rounding.
        shifted = (fraction + '0' * exponent)[:exponent]
        return int(whole + shifted), 10 ** (exponent - len(fraction))
    return int(s), 1

def _durationStrToSeconds(durationStr):
    '''Convert a colon-separated duration ('S', 'M:S' or 'H:M:S') into a number of seconds.

    Raises ValueError if a component is not an integer or if there are more than three components.
    '''
    components = durationStr.split(':')
    if len(components) > 3:
        # zip() below stops at the shortest input, so extra leading components would be dropped silently.
        raise ValueError(f'Unsupported duration format: {durationStr!r}')
    # Pair the components with their factors starting from the seconds.
    return sum(int(component) * factor for factor, component in zip((1, 60, 3600), reversed(components)))

def _telegramResponseOkCallback(r):
    '''Accept a response only if its status code is 200.

    Returns a (success, message) tuple; the message is None on success and names the status code otherwise.
    '''
    succeeded = r.status_code == 200
    message = None if succeeded else f'{r.status_code=}'
    return (succeeded, message)

2 changes: 1 addition & 1 deletion snscrape/modules/vkontakte.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,4 +385,4 @@ def _cli_setup_parser(cls, subparser):

@classmethod
def _cli_from_args(cls, args):
return cls._cli_construct(args, args.username)
return cls._cli_construct(args, args.username)