# Importing Required Libraries

In [None]:
pip install ntscraper  #installation if required

In [1]:
from ntscraper import Nitter    #Importing the Nitter module to scrape tweets from Nitter
from pprint import pprint  #Used to pretty-print the output for better readability
import pandas as pd    #Used for data manipulation

# Demo: Scraping Tweets and Saving to CSV

In [None]:
# This section is a demonstration to show how the Nitter scraper works. It includes the process of initializing 
# the scraper, retrieving tweets by a specific hashtag, extracting relevant data, and saving it to a CSV file.

In [2]:
# Build the scraper; with skip_instance_check=False the available instances are tested first
scraper = Nitter(log_level=1, skip_instance_check=False)

Testing instances:  92%|██████████████████████████████████████████████████████████     | 71/77 [01:46<00:10,  1.76s/it]

20-Aug-24 12:41:41 - Certificate did not match expected hostname: nt.ggtyler.dev. Certificate: {'subject': ((('commonName', '4g.ggtyler.dev'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', "Let's Encrypt"),), (('commonName', 'E6'),)), 'version': 3, 'serialNumber': '043C83E6DFFFA194D2CDA2DE14B572820A1C', 'notBefore': 'Jul 13 13:08:14 2024 GMT', 'notAfter': 'Oct 11 13:08:13 2024 GMT', 'subjectAltName': (('DNS', '4g.ggtyler.dev'),), 'OCSP': ('http://e6.o.lencr.org',), 'caIssuers': ('http://e6.i.lencr.org/',)}


Testing instances:  95%|███████████████████████████████████████████████████████████▋   | 73/77 [01:48<00:05,  1.30s/it]

20-Aug-24 12:41:43 - Certificate did not match expected hostname: nitter.uni-sonia.com. Certificate: {'subject': ((('commonName', '*.xserver.jp'),),), 'issuer': ((('countryName', 'JP'),), (('organizationName', 'CloudSecure Corporation'),), (('commonName', 'CloudSecure RSA Domain Validation Secure Server CA 2'),)), 'version': 3, 'serialNumber': 'ACA67AD2030638EE2DCE8E845B8299A6', 'notBefore': 'Mar 11 00:00:00 2024 GMT', 'notAfter': 'Apr 11 23:59:59 2025 GMT', 'subjectAltName': (('DNS', '*.xserver.jp'), ('DNS', 'xserver.jp')), 'OCSP': ('http://ocsp.sectigo.com',), 'caIssuers': ('http://crt.sectigo.com/CloudSecureRSADomainValidationSecureServerCA2.crt',)}


Testing instances:  99%|██████████████████████████████████████████████████████████████▏| 76/77 [02:04<00:04,  4.84s/it]

20-Aug-24 12:41:59 - Certificate did not match expected hostname: nitter.tinfoil-hat.net. Certificate: {'subject': ((('commonName', 'jelly.tinfoil-hat.de'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', "Let's Encrypt"),), (('commonName', 'E6'),)), 'version': 3, 'serialNumber': '03557B828B954DCD5ADD0EEA6DDF9F1E0085', 'notBefore': 'Jul 16 04:52:19 2024 GMT', 'notAfter': 'Oct 14 04:52:18 2024 GMT', 'subjectAltName': (('DNS', 'jelly.tinfoil-hat.de'),), 'OCSP': ('http://e6.o.lencr.org',), 'caIssuers': ('http://e6.i.lencr.org/',)}


Testing instances: 100%|███████████████████████████████████████████████████████████████| 77/77 [02:05<00:00,  1.63s/it]


In [4]:
# `mode` selects what is scraped: "hashtag" searches by hashtag, while "user" retrieves
# tweets from a specific user's timeline. The limit here is 1000 tweets; change `number` as needed.
tweets = scraper.get_tweets("elonmusk", mode="user", number=1000)

20-Aug-24 12:42:21 - No instance specified, using random instance https://nt.vern.cc
20-Aug-24 12:42:27 - Current stats for elonmusk: 21 tweets, 0 threads...
20-Aug-24 12:42:32 - Current stats for elonmusk: 41 tweets, 0 threads...
20-Aug-24 12:42:37 - Current stats for elonmusk: 60 tweets, 0 threads...
20-Aug-24 12:42:41 - Current stats for elonmusk: 80 tweets, 0 threads...
20-Aug-24 12:42:46 - Current stats for elonmusk: 100 tweets, 0 threads...
20-Aug-24 12:42:51 - Current stats for elonmusk: 119 tweets, 0 threads...
20-Aug-24 12:42:55 - Current stats for elonmusk: 139 tweets, 0 threads...
20-Aug-24 12:43:00 - Current stats for elonmusk: 159 tweets, 0 threads...
20-Aug-24 12:43:05 - Current stats for elonmusk: 179 tweets, 0 threads...
20-Aug-24 12:43:10 - Current stats for elonmusk: 199 tweets, 0 threads...
20-Aug-24 12:43:14 - Current stats for elonmusk: 219 tweets, 0 threads...
20-Aug-24 12:43:19 - Current stats for elonmusk: 239 tweets, 0 threads...
20-Aug-24 12:43:23 - Current st

In [23]:
# Pretty-print the raw scraper result to inspect its nested structure
pprint(tweets)

{'threads': [],
 'tweets': [{'date': 'Aug 16, 2024 · 10:19 PM UTC',
             'external-link': '',
             'gifs': [],
             'is-pinned': True,
             'is-retweet': False,
             'link': 'https://twitter.com/elonmusk/status/1824571630611251681#m',
             'pictures': [],
             'quoted-post': {'date': 'Aug 16, 2024 · 9:00 PM UTC',
                             'gifs': [],
                             'link': 'https://twitter.com/premium/status/1824551877394174112#m',
                             'pictures': [],
                             'text': 'in case you missed it..  Grok 2 is here '
                                     '– our most advanced AI assistant, built '
                                     'right into X.  sign up to try it out: '
                                     'https://x.com/i/premium_sign_up?referring_page=grok  '
                                     '4 examples of what Grok can do for you:',
                             'user'

In [6]:
# List the top-level keys of the scraper result
tweets.keys()

dict_keys(['tweets', 'threads'])

In [7]:
# Look at the first scraped tweet to see which fields each tweet record carries
tweets['tweets'][0]

{'link': 'https://twitter.com/elonmusk/status/1824571630611251681#m',
 'text': 'Try Grok 2!',
 'user': {'name': 'Elon Musk',
  'username': '@elonmusk',
  'profile_id': '1815749056821346304',
  'avatar': 'https://pbs.twimg.com/profile_images/1815749056821346304/jS8I28PL_bigger.jpg'},
 'date': 'Aug 16, 2024 · 10:19 PM UTC',
 'is-retweet': False,
 'is-pinned': True,
 'external-link': '',
 'replying-to': [],
 'quoted-post': {'link': 'https://twitter.com/premium/status/1824551877394174112#m',
  'text': 'in case you missed it..  Grok 2 is here – our most advanced AI assistant, built right into X.  sign up to try it out: https://x.com/i/premium_sign_up?referring_page=grok  4 examples of what Grok can do for you:',
  'user': {'name': 'Premium',
   'username': '@premium',
   'profile_id': '1683366300054069248',
   'avatar': 'https://pbs.twimg.com/profile_images/1683366300054069248/67v23AEj_mini.jpg'},
  'date': 'Aug 16, 2024 · 9:00 PM UTC',
  'pictures': [],
  'videos': [],
  'gifs': []},
 'sta

In [8]:
# Fetch the profile information (bio, id, stats, ...) for the same user
elon_info = scraper.get_profile_info(username="elonmusk")

20-Aug-24 12:48:28 - No instance specified, using random instance https://nitter.privacydev.net


In [9]:
# Pretty-print the profile dictionary fetched in the previous cell
pprint(elon_info)

{'bio': '',
 'id': '44196397',
 'image': 'https://pbs.twimg.com/profile_images/1815749056821346304/jS8I28PL_400x400.jpg',
 'joined': '8:12 PM - 2 Jun 2009',
 'location': '',
 'name': 'Elon Musk',
 'stats': {'followers': 195121513,
           'following': 715,
           'likes': 68694,
           'media': 0,
           'tweets': 49812},
 'username': '@elonmusk',
 'website': ''}


In [10]:
# Column-oriented container: one (initially empty) list per field we keep from each tweet
data = {
    column: []
    for column in ('link', 'text', 'user', 'likes', 'quotes', 'retweets', 'comments', 'date')
}

# Walk over the scraped tweets, pick out the fields of interest, and push each value
# onto the list of its column
for tweet in tweets['tweets']:
    engagement = tweet['stats']
    row = {
        'link': tweet['link'],
        'text': tweet['text'],
        'user': tweet['user']['name'],
        'likes': engagement['likes'],
        'quotes': engagement['quotes'],
        'retweets': engagement['retweets'],
        'comments': engagement['comments'],
        'date': tweet['date'],
    }
    for column, value in row.items():
        data[column].append(value)

In [20]:
# Display the assembled column lists to check what was collected
data

{'link': ['https://twitter.com/elonmusk/status/1824571630611251681#m',
  'https://twitter.com/elonmusk/status/1825786899014844702#m',
  'https://twitter.com/elonmusk/status/1825778823205380347#m',
  'https://twitter.com/elonmusk/status/1825768840090836999#m',
  'https://twitter.com/elonmusk/status/1825743803765895540#m',
  'https://twitter.com/elonmusk/status/1825743161076875773#m',
  'https://twitter.com/elonmusk/status/1825741761039241475#m',
  'https://twitter.com/elonmusk/status/1825738817275383950#m',
  'https://twitter.com/elonmusk/status/1825723913051000851#m',
  'https://twitter.com/elonmusk/status/1825715527446155288#m',
  'https://twitter.com/elonmusk/status/1825713908067479735#m',
  'https://twitter.com/elonmusk/status/1825713065985188224#m',
  'https://twitter.com/elonmusk/status/1825559831132172668#m',
  'https://twitter.com/elonmusk/status/1825558840336920868#m',
  'https://twitter.com/elonmusk/status/1825558181839618531#m',
  'https://twitter.com/elonmusk/status/18255576

In [22]:
# Convert the column lists into a DataFrame and preview its first five rows
df = pd.DataFrame(data=data)
df.head(n=5)

Unnamed: 0,link,text,user,likes,quotes,retweets,comments,date
0,https://twitter.com/elonmusk/status/1824571630...,Try Grok 2!,Elon Musk,50814,548,7337,11590,"Aug 16, 2024 · 10:19 PM UTC"
1,https://twitter.com/elonmusk/status/1825786899...,Amazing,Elon Musk,14076,128,2672,1827,"Aug 20, 2024 · 6:48 AM UTC"
2,https://twitter.com/elonmusk/status/1825778823...,Why not fix it right now? 🤷‍♂️,Elon Musk,39216,208,4825,3253,"Aug 20, 2024 · 6:16 AM UTC"
3,https://twitter.com/elonmusk/status/1825768840...,"From the standpoint of the faaaaaar left, this...",Elon Musk,28917,262,4112,3263,"Aug 20, 2024 · 5:36 AM UTC"
4,https://twitter.com/elonmusk/status/1825743803...,🕺🕺🔫🔫🔫🔫🔫🔫🔫🔫🔫🔫🕺🕺,Elon Musk,58148,650,4333,6632,"Aug 20, 2024 · 3:56 AM UTC"


In [13]:
# Save the demo dataset next to the notebook.
# NOTE(review): the row index is written too and shows up as an "Unnamed: 0" column when
# the file is read back — pass index=False here if that column is not wanted.
df.to_csv("elonmusk.csv")

# Main Function: Creating a Tweets Dataset for a Specified User

In [None]:
# This function is the main utility for the user. It scrapes a specified number of tweets from a user's timeline 
# and saves them to a CSV file. The user can specify the username and the number of tweets they want to scrape by simply calling the function.

In [14]:
# pandas is already imported in the first import cell; repeated here so this cell can run on its own
import pandas as pd
# NOTE(review): this builds a second scraper and replaces the one configured in the demo section;
# the recorded output shows the instance test running again (about two minutes) — consider reusing
# the existing `scraper` instead.
scraper = Nitter()
def create_tweets_dataset(username, no_of_tweets):
    """Scrape tweets from a user's timeline and save them to ``<username>_tweets_data.csv``.

    The tweets are fetched through the notebook-level ``scraper`` object, so that
    object must exist before this function is called.

    Parameters
    ----------
    username : str
        Handle of the account whose timeline is scraped; also used as the prefix
        of the CSV file name.
    no_of_tweets : int
        Number of tweets to request from the scraper. Fewer may come back.

    Returns
    -------
    pandas.DataFrame
        One row per scraped tweet with the columns ``link``, ``text``, ``user``,
        ``likes``, ``quotes``, ``retweets``, ``comments`` and ``date`` — the same
        data that is written to the CSV file.
    """
    # Same fields (and order) as the demo section above, including the tweet date.
    columns = ['link', 'text', 'user', 'likes', 'quotes', 'retweets', 'comments', 'date']

    tweets = scraper.get_tweets(username, mode="user", number=no_of_tweets)

    # Collect one record per tweet and build the frame once at the end.
    records = []
    for tweet in tweets['tweets']:
        stats = tweet['stats']
        records.append({
            'link': tweet['link'],
            'text': tweet['text'],
            'user': tweet['user']['name'],
            'likes': stats['likes'],
            'quotes': stats['quotes'],
            'retweets': stats['retweets'],
            'comments': stats['comments'],
            'date': tweet['date'],
        })

    # Passing `columns` explicitly keeps the header row even when no tweets came back.
    df = pd.DataFrame(records, columns=columns)
    df.to_csv(username + "_tweets_data.csv")
    return df

Testing instances:  92%|██████████████████████████████████████████████████████████     | 71/77 [01:42<00:09,  1.56s/it]

20-Aug-24 13:01:40 - Certificate did not match expected hostname: nt.ggtyler.dev. Certificate: {'subject': ((('commonName', '4g.ggtyler.dev'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', "Let's Encrypt"),), (('commonName', 'E6'),)), 'version': 3, 'serialNumber': '043C83E6DFFFA194D2CDA2DE14B572820A1C', 'notBefore': 'Jul 13 13:08:14 2024 GMT', 'notAfter': 'Oct 11 13:08:13 2024 GMT', 'subjectAltName': (('DNS', '4g.ggtyler.dev'),), 'OCSP': ('http://e6.o.lencr.org',), 'caIssuers': ('http://e6.i.lencr.org/',)}


Testing instances:  95%|███████████████████████████████████████████████████████████▋   | 73/77 [01:43<00:04,  1.07s/it]

20-Aug-24 13:01:42 - Certificate did not match expected hostname: nitter.uni-sonia.com. Certificate: {'subject': ((('commonName', '*.xserver.jp'),),), 'issuer': ((('countryName', 'JP'),), (('organizationName', 'CloudSecure Corporation'),), (('commonName', 'CloudSecure RSA Domain Validation Secure Server CA 2'),)), 'version': 3, 'serialNumber': 'ACA67AD2030638EE2DCE8E845B8299A6', 'notBefore': 'Mar 11 00:00:00 2024 GMT', 'notAfter': 'Apr 11 23:59:59 2025 GMT', 'subjectAltName': (('DNS', '*.xserver.jp'), ('DNS', 'xserver.jp')), 'OCSP': ('http://ocsp.sectigo.com',), 'caIssuers': ('http://crt.sectigo.com/CloudSecureRSADomainValidationSecureServerCA2.crt',)}


Testing instances:  99%|██████████████████████████████████████████████████████████████▏| 76/77 [02:01<00:05,  5.22s/it]

20-Aug-24 13:02:01 - Certificate did not match expected hostname: nitter.tinfoil-hat.net. Certificate: {'subject': ((('commonName', 'jelly.tinfoil-hat.de'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', "Let's Encrypt"),), (('commonName', 'E6'),)), 'version': 3, 'serialNumber': '03557B828B954DCD5ADD0EEA6DDF9F1E0085', 'notBefore': 'Jul 16 04:52:19 2024 GMT', 'notAfter': 'Oct 14 04:52:18 2024 GMT', 'subjectAltName': (('DNS', 'jelly.tinfoil-hat.de'),), 'OCSP': ('http://e6.o.lencr.org',), 'caIssuers': ('http://e6.i.lencr.org/',)}


Testing instances: 100%|███████████████████████████████████████████████████████████████| 77/77 [02:04<00:00,  1.61s/it]


In [15]:
create_tweets_dataset("AXISgsm",100)
# Use the function to create a dataset for the profile you want to scrape: simply change the
# username ("AXISgsm" to any other handle) and the tweet limit (100 to any number n) as needed.

20-Aug-24 13:03:14 - No instance specified, using random instance https://nitter.privacydev.net
20-Aug-24 13:03:20 - Current stats for AXISgsm: 21 tweets, 0 threads...
20-Aug-24 13:03:26 - Current stats for AXISgsm: 40 tweets, 0 threads...
20-Aug-24 13:03:31 - Current stats for AXISgsm: 60 tweets, 0 threads...
20-Aug-24 13:03:33 - Empty page on https://nitter.privacydev.net
