## Coding Exercise #0505

In [1]:
# Install Tweepy once.
# !pip install tweepy

In [2]:
import tweepy
import re
import os
import pickle
import nltk
from nltk.corpus import stopwords
from tweepy import OAuthHandler

### 1. Fetching tweets from Twitter:
More information can be found [here](http://docs.tweepy.org/en/v3.5.0/index.html).

In [14]:
# Xác thực
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

In [15]:
def collect_tweets(keyword, location=None, num_tweets=100):
    tweets = []
    try:
        # Xây dựng query
        query = f"{keyword} -filter:retweets lang:en"
        
        # Thêm location nếu có
        geocode = None
        if location:
            geocode = f"{location},500km"
        
        # Thu thập tweet
        for tweet in tweepy.Cursor(api.search_tweets,
                                 q=query,
                                 tweet_mode='extended',
                                 geocode=geocode).items(num_tweets):
            tweets.append(tweet.full_text)
            
        return tweets
    
    except tweepy.TweepyException as e:
        print(f"Lỗi khi thu thập tweet: {e}")
        return []

In [16]:
def preprocess_tweet(tweet):
    # Chuyển về chữ thường
    tweet = tweet.lower()
    
    # Xóa URL
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet)
    
    # Thay thế từ viết tắt
    contractions = {
        r"i'm": "i am",
        r"you're": "you are",
        r"it's": "it is",
        r"that's": "that is",
        r"what's": "what is",
        r"where's": "where is",
        r"\'ll": " will",
        r"\'ve": " have",
        r"\'re": " are",
        r"\'d": " would",
        r"won't": "will not",
        r"can't": "cannot",
        r"n't": " not"
    }
    
    for pattern, repl in contractions.items():
        tweet = re.sub(pattern, repl, tweet)
    
    # Xóa ký tự đặc biệt và số
    tweet = re.sub(r"[^a-zA-Z]", " ", tweet)
    
    # Xóa khoảng trắng thừa
    tweet = re.sub(r"\s+", " ", tweet).strip()
    
    # Loại bỏ stopwords
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(tweet)
    words = [word for word in words if word not in stop_words]
    
    return ' '.join(words)

In [17]:
# Thông số tìm kiếm
keyword = "Trump"
location = "40.71,-74.0"  # New York
num_tweets = 30 # Số lượng tweet muốn lấy

# Thu thập tweet
raw_tweets = collect_tweets(keyword, location, num_tweets)
print(f"Đã thu thập được {len(raw_tweets)} tweet")

# Tiền xử lý
processed_tweets = [preprocess_tweet(tweet) for tweet in raw_tweets]

# In kết quả
print("\n10 tweet đầu tiên sau khi xử lý:")
for i, tweet in enumerate(processed_tweets[:10], 1):
    print(f"{i}. {tweet}")

Lỗi khi thu thập tweet: 403 Forbidden
453 - You currently have access to a subset of X API V2 endpoints and limited v1.1 endpoints (e.g. media post, oauth) only. If you need access to this endpoint, you may need a different access level. You can learn more here: https://developer.x.com/en/portal/product
Đã thu thập được 0 tweet

10 tweet đầu tiên sau khi xử lý:


In [23]:
import os
from dotenv import load_dotenv

load_dotenv()

# In ra các key để kiểm tra (DEBUG ONLY - nhớ xóa sau khi kiểm tra)
print("Bearer Token:", bool(os.getenv("TWITTER_BEARER_TOKEN")))
print("Consumer Key:", bool(os.getenv("TWITTER_CONSUMER_KEY")))

Bearer Token: True
Consumer Key: True


In [25]:
try:
    # Search recent tweets (API v2)
    response = client.search_recent_tweets(
        query="China -is:retweet lang:en",
        max_results=100,
        tweet_fields=["created_at", "text"],
        user_fields=["location"],
        expansions=["author_id"],
        place_fields=["geo"]
    )
    
    if response.data:
        tweets = [tweet.text for tweet in response.data]
        print(f"Đã thu thập được {len(tweets)} tweet")
        for i, tweet in enumerate(tweets[:10], 1):
            print(f"{i}. {tweet}")
    else:
        print("Không tìm thấy tweet nào phù hợp")

except tweepy.TweepyException as e:
    print(f"Lỗi API: {e}")
except Exception as e:
    print(f"Lỗi hệ thống: {e}")

Lỗi API: 401 Unauthorized
Unauthorized


In [29]:
import tweepy
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Initialize Client WITHOUT bearer token
client = tweepy.Client(
    consumer_key=os.getenv("TWITTER_CONSUMER_KEY"),
    consumer_secret=os.getenv("TWITTER_CONSUMER_SECRET"),
    access_token=os.getenv("TWITTER_ACCESS_TOKEN"),
    access_token_secret=os.getenv("TWITTER_ACCESS_SECRET"),
    wait_on_rate_limit=True
)

In [36]:
from tweepy import OAuth2UserHandler

auth = OAuth2UserHandler(
    client_id="CLIENT_ID",
    client_secret="CLIENT_SECRET",
    redirect_uri="https://localhost",
    scope=["tweet.read", "offline.access"]
)

# Lấy URL xác thực
print("Mở URL này trong trình duyệt:", auth.get_authorization_url())

Mở URL này trong trình duyệt: https://twitter.com/i/oauth2/authorize?response_type=code&client_id=CLIENT_ID&redirect_uri=https%3A%2F%2Flocalhost&scope=tweet.read+offline.access&state=GK4CsUIl6vfWr4mRiViTHYUT5WSpkp&code_challenge=1uFX82BGSF9jRa760FeSFdLwvdTC3uFK1zXgQKEBE78&code_challenge_method=S256


In [None]:
import os
import tweepy
from tweepy import OAuth2UserHandler
from dotenv import load_dotenv
from urllib.parse import parse_qs, urlparse
import webbrowser

# 1. Load biến môi trường
load_dotenv()  # Tự động tìm file .env

# 2. Lấy credentials từ .env
CLIENT_ID = os.getenv("CLIENT_ID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
REDIRECT_URI = "https://b648-2405-4803-fee6-acb0-f835-3b3-70ad-3dcc.ngrok-free.app/callback"

# 3. Kiểm tra biến môi trường
if not all([CLIENT_ID, CLIENT_SECRET, REDIRECT_URI]):
    raise ValueError("Missing Twitter credentials in .env file")

# 4. Xác thực OAuth 2.0
auth = OAuth2UserHandler(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    redirect_uri=REDIRECT_URI,
    scope=["tweet.read", "users.read"],
)

# 5. Quy trình xác thực
try:
    # Lấy URL xác thực
    auth_url = auth.get_authorization_url()
    print(f"Mở URL để xác thực:\n{auth_url}")
    webbrowser.open(auth_url)

    # Nhập callback URL
    callback_url = input("Dán URL callback chứa code: ")
    code = parse_qs(urlparse(callback_url).query['code'][0]

    # Lấy access token
    token = auth.fetch_token(code=code)
    print("Xác thực thành công!")

    # 6. Sử dụng API
    client = tweepy.Client(token['access_token'])
    tweets = client.search_recent_tweets(
        query="Python lang:en -is:retweet",
        max_results=5
    )
    
    for tweet in tweets.data:
        print(tweet.text)

except Exception as e:
    print(f"Lỗi: {e}")

SyntaxError: '(' was never closed (927504295.py, line 37)

SyntaxError: '(' was never closed (2496415833.py, line 37)

In [56]:
from pyngrok import ngrok

# 1. Mở tunnel
tunnel = ngrok.connect(8000)  # Trả về đối tượng NgrokTunnel

# 2. Lấy URL công khai (phải truy cập qua thuộc tính public_url)
callback_url = f"{tunnel.public_url}/callback"
print("Callback URL an toàn:", callback_url)

# 3. Cấu hình Twitter API
auth = tweepy.OAuth2UserHandler(
    client_id="CLIENT_ID",
    client_secret="CLIENT_SECRET",
    redirect_uri=callback_url,  # Sử dụng URL ngrok
    scope=["tweet.read"]
)

Callback URL an toàn: https://e6a3-2405-4803-fee6-acb0-f835-3b3-70ad-3dcc.ngrok-free.app/callback


In [60]:
import tweepy
from dotenv import load_dotenv

# 1. Load biến môi trường
load_dotenv()

# 2. Lấy thông tin xác thực
CLIENT_ID = os.getenv("CLIENT_ID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
auth = tweepy.OAuth2UserHandler(
    client_id="CLIENT_ID",
    client_secret="CLIENT_SECRET",
    redirect_uri=callback_url,
    scope=["tweet.read", "users.read"]
)

auth_url = auth.get_authorization_url()
print("🔗 Truy cập URL để xác thực:", auth_url)
# 5. Nhập code từ callback (sau khi xác thực)
code = input("Nhập code từ URL callback: ")
token = auth.fetch_token(code=code)

# 6. Sử dụng API
client = tweepy.Client(token['access_token'])
tweets = client.search_recent_tweets(query="Python", max_results=5)
for tweet in tweets.data:
    print(tweet.text)

🔗 Truy cập URL để xác thực: https://twitter.com/i/oauth2/authorize?response_type=code&client_id=CLIENT_ID&redirect_uri=https%3A%2F%2Fe6a3-2405-4803-fee6-acb0-f835-3b3-70ad-3dcc.ngrok-free.app%2Fcallback&scope=tweet.read+users.read&state=Cp2cTiz0ZNeu2BdWLgVTMHSJCoeLiN&code_challenge=4ytZGf65JQFArLyO3M4ciMUL1lZ0n6iYzzDSv0Qo06Y&code_challenge_method=S256


TypeError: OAuth2UserHandler.fetch_token() got an unexpected keyword argument 'code'

In [5]:
# my_keyword = "Trump"                           # Search keyword.
# my_location = "40.71,-74.0,500km"               # Location = latitude, longitude, radius.
# n_tweets = 100                                 # Maximum number of tweets.

In [6]:
# my_tweets = []
# api = tweepy.API(auth, timeout = 10)           # Timeout after 10 seconds.
# for status in tweepy.Cursor(api.search, q = my_keyword + " -filter:retweets", lang="en", result_type="recent", geocode=my_location).items(n_tweets):
#     my_tweets.append(status.text)

In [7]:
# print('\n'.join(my_tweets))

### 2. Pre-processing the tweets:

In [8]:
# my_tweets_new = []
# for tweet in my_tweets:
#     tweet = tweet.lower()
#     tweet = re.sub(r"^https://t.co/[a-zA-Z0-9]*\s", " ", tweet)
#     tweet = re.sub(r"\s+https://t.co/[a-zA-Z0-9]*\s", " ", tweet)
#     tweet = re.sub(r"\s+https://t.co/[a-zA-Z0-9]*$", " ", tweet)
#     tweet = re.sub(r"that's","that is",tweet)
#     tweet = re.sub(r"there's","there is",tweet)
#     tweet = re.sub(r"what's","what is",tweet)
#     tweet = re.sub(r"where's","where is",tweet)
#     tweet = re.sub(r"it's","it is",tweet)
#     tweet = re.sub(r"who's","who is",tweet)
#     tweet = re.sub(r"i'm","i am",tweet)
#     tweet = re.sub(r"she's","she is",tweet)
#     tweet = re.sub(r"he's","he is",tweet)
#     tweet = re.sub(r"they're","they are",tweet)
#     tweet = re.sub(r"who're","who are",tweet)
#     tweet = re.sub(r"ain't","am not",tweet)
#     tweet = re.sub(r"wouldn't","would not",tweet)
#     tweet = re.sub(r"shouldn't","should not",tweet)
#     tweet = re.sub(r"can't","can not",tweet)
#     tweet = re.sub(r"couldn't","could not",tweet)
#     tweet = re.sub(r"won't","will not",tweet)
#     tweet = re.sub(r"\W"," ",tweet)
#     tweet = re.sub(r"\d"," ",tweet)
#     tweet = re.sub(r"\s+[a-z]\s+"," ",tweet)
#     tweet = re.sub(r"\s+[a-z]$"," ",tweet)
#     tweet = re.sub(r"^[a-z]\s+"," ",tweet)
#     tweet = re.sub(r"\s+"," ",tweet)
#     words = nltk.word_tokenize(tweet)
#     words = [x for x in words if x not in stopwords.words('english')]
#     tweet = ' '.join(words) 
#     my_tweets_new.append(tweet)

In [9]:
# # Print out the tweets.
# i = 0
# for tw in my_tweets_new:
#     i += 1
#     print(str(i) + " : " + tw)