<a href="https://colab.research.google.com/github/JloukYahya/NER-vaccins-using-tweets/blob/main/twitter_scrap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Twitter scraping
# Jlouk yahya

In [None]:
# imports
import sys
import os
import re
import tweepy
from tweepy import OAuthHandler
from textblob import TextBlob

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from IPython.display import clear_output
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline

from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Install Flair straight from its GitHub repository (no version or commit is pinned).
!pip install --upgrade git+https://github.com/flairNLP/flair.git

# Clear the cell output once the command above has finished.
clear_output()

In [None]:

# Flair pieces used by the NER step: SequenceTagger provides the model,
# Sentence is the wrapper each tweet is put into before prediction.
from flair.models import SequenceTagger
from flair.data import Sentence

NER_MODEL_NAME = 'ner'
tagger = SequenceTagger.load(NER_MODEL_NAME)

# Clear whatever the model loading printed.
clear_output()

In [None]:

# Flair text classifier used later to attach a sentiment label and score to each tweet.
from flair.models import TextClassifier

SENTIMENT_MODEL_NAME = 'en-sentiment'
classifier = TextClassifier.load(SENTIMENT_MODEL_NAME)

# Clear whatever the model loading printed.
clear_output()

### Twitter API
## Pour obtenir les clés, il faut créer un compte Twitter et passer en mode développeur ; ensuite, il faut demander à Twitter de valider le compte pour obtenir un accès API « Elevated » (ils répondent en trois jours).

In [None]:
# Enter the Twitter API key and API secret key here (Colab form fields).
# NOTE(review): values typed into these fields are stored in the notebook
# source, so blank them before sharing or committing the notebook.
TWITTER_KEY = '' #@param {type:"string"}
TWITTER_SECRET_KEY = '' #@param {type:"string"}

In [None]:
# Connect to the Twitter API with application-only credentials.
auth = tweepy.AppAuthHandler(TWITTER_KEY, TWITTER_SECRET_KEY)

# Ask tweepy to wait out rate-limit windows instead of failing.
# `wait_on_rate_limit_notify` only exists in tweepy 3.x: tweepy 4 removed the
# argument and rejects it with a TypeError, so retry without it in that case.
try:
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
except TypeError:
    api = tweepy.API(auth, wait_on_rate_limit=True)

# NOTE(review): a constructed API object is presumably always truthy, so this
# guard likely never fires; kept as-is — TODO confirm and replace with a real
# credential check if needed.
if (not api):
    print ("Can't Authenticate")
    sys.exit(-1)


# The scraping itself starts in the next cell.


In [None]:
#@title chercher les APIs
#@markdown ### le mot qu'on veut scrapper par exemple "sinopharm":
searchQuery = '' #@param {type:"string"}
#@markdown ### nombre de tweets qu'on veut scrapper:
#@markdown #### la limite et de 45000 par 15minutes
maxTweets = 90000 #@param {type:"slider", min:0, max:90000, step:100}
Filter_Retweets = True #@param {type:"boolean"}

tweetsPerQry = 100  # this is the max the API permits
tweet_lst = []

if Filter_Retweets:
    searchQuery = searchQuery + ' -filter:retweets'  # keep retweets out of the results

# Topic label stored with every tweet: the part of the query before the first '-'.
# split() returns the whole query when it contains no '-'; the previous
# find()-based slice got -1 in that case and silently dropped the last character.
# It is computed once here because the query does not change inside the loop.
# NOTE(review): a hyphenated search term (e.g. "anti-vax") is still cut at the hyphen.
topic = searchQuery.split('-', 1)[0].capitalize().strip()

# sinceId is only meant for scraping past the 45000-tweet window while staying
# in the same dataset without id conflicts: scrape the first batch, wait
# 15 minutes, then set sinceId here to resume from that point.
# NOTE(review): the value is forwarded as the search call's `since_id`
# argument — presumably a tweet id rather than a tweet count; confirm before use.
sinceId = None

# Non-positive sentinel meaning "no page fetched yet".
max_id = -10000000000

# tweepy 4 renamed API.search to API.search_tweets and TweepError to
# TweepyException; use the new names when present and fall back to the 3.x ones.
search_fn = getattr(api, 'search_tweets', None) or api.search
tweepy_error = getattr(tweepy, 'TweepyException', None) or tweepy.TweepError

tweetCount = 0
print("Downloading max {0} tweets".format(maxTweets))
while tweetCount < maxTweets:
    try:
        # Build the request once instead of spelling out every
        # max_id / since_id combination as a separate call.
        search_kwargs = {'q': searchQuery, 'count': tweetsPerQry, 'lang': "en"}
        if max_id > 0:
            # Continue strictly below the last tweet id already collected.
            search_kwargs['max_id'] = str(max_id - 1)
        if sinceId:
            search_kwargs['since_id'] = sinceId
        new_tweets = search_fn(**search_kwargs)

        if not new_tweets:
            print("No more tweets found")
            break

        for tweet in new_tweets:
            # These attributes are not always present on the returned objects.
            reply_count = getattr(tweet, 'reply_count', 0)
            retweeted = getattr(tweet, 'retweeted', "NA")

            # Keep only the calendar date of the tweet.
            tweetDate = tweet.created_at.date()

            tweet_lst.append([tweetDate, topic,
                              tweet.id, tweet.user.screen_name, tweet.user.name,
                              tweet.text, tweet.favorite_count,
                              reply_count, tweet.retweet_count, retweeted])

        tweetCount += len(new_tweets)
        print("Downloaded {0} tweets".format(tweetCount))
        # The id of the last tweet of this page bounds the next request.
        max_id = new_tweets[-1].id
    except tweepy_error as e:
        # Stop at the first API error and keep what was collected so far.
        print("some error : " + str(e))
        break

clear_output()
print("Downloaded {0} tweets".format(tweetCount))

## Data Sciencing

Maintenant, on va stocker les tweets dans un fichier CSV.

In [None]:
# Show full cell contents when a DataFrame is displayed. None means "no limit";
# the former -1 sentinel has been deprecated since pandas 1.0 and is no longer
# accepted by recent pandas releases.
pd.set_option('display.max_colwidth', None)

# Turn the scraped rows into a DataFrame; the column order matches the order
# in which each row was appended to tweet_lst.
tweet_df = pd.DataFrame(tweet_lst, columns=['tweet_dt', 'topic', 'id', 'username', 'name', 'tweet', 'like_count', 'reply_count', 'retweet_count', 'retweeted'])
# NOTE(review): the output file name is fixed, whatever the search query was.
tweet_df.to_csv('moderna.csv')
tweet_df.head()

Malheureusement, Twitter ne nous donne pas la possibilité de filtrer par date ;
pour cela, j'ai utilisé le filtre suivant :

In [None]:
#@title specifier la date pour obtenire que les tweets qui date de 2021
today = datetime.now().date()
yesterday = today - timedelta(1)

debut_dt = '2021-01-01' #@param {type:"date"}
fin_dt = '2021-09-30' #@param {type:"date"}

# The form fields are named debut_dt / fin_dt, while the filter below works on
# start_dt / end_dt. The original code read start_dt / end_dt before they were
# ever assigned (NameError on a fresh kernel); derive them from the form fields.
# An empty field falls back to yesterday (start) or today (end).
if debut_dt:
    start_dt = datetime.strptime(debut_dt, '%Y-%m-%d').date()
else:
    start_dt = yesterday

if fin_dt:
    end_dt = datetime.strptime(fin_dt, '%Y-%m-%d').date()
else:
    end_dt = today

# Keep only the tweets whose date lies within [start_dt, end_dt], both inclusive.
tweet_df = tweet_df[(tweet_df['tweet_dt'] >= start_dt)
                    & (tweet_df['tweet_dt'] <= end_dt)]
tweet_df.shape

## NER et l'analyse des sentiments

Pour le NER, je vais utiliser la bibliothèque Flair : https://github.com/zalandoresearch/flair

### NER

Précédemment, j'ai collecté les tweets ; j'ajoute maintenant les tags sur des lignes séparées, ce qui va nous aider par la suite. Je crée aussi un tag nommé « hashtags » pour que les hashtags soient reconnus comme des tags au même titre que ceux de Flair.

### Analyse des sentiments

Pour le KPI de polarité, on utilise le classifieur de Flair, puis on ajoute les résultats à nos datasets.

In [None]:
# Prediction rows: one entry per (tweet, entity) pair.
nerlst = []

for index, row in tqdm(tweet_df.iterrows(), total=tweet_df.shape[0]):
    # Strip the '#' signs before handing the text to Flair; the hashtags
    # themselves are recovered from the raw tweet further down.
    cleanedTweet = row['tweet'].replace("#", "")
    sentence = Sentence(cleanedTweet, use_tokenizer=True)

    # Predict the NER tags.
    tagger.predict(sentence)
    ners = sentence.to_dict(tag_type='ner')['entities']

    # Predict the sentiment.
    classifier.predict(sentence)
    if not sentence.labels:
        # Nothing was labelled (the original code raised IndexError here and
        # aborted the whole loop); skip this tweet instead.
        continue
    # NOTE(review): assumes the first label on the sentence is the sentiment
    # label even though the NER tagger ran on the same sentence first —
    # TODO confirm against the installed Flair version.
    label = sentence.labels[0]
    response = {'result': label.value, 'polarity': label.score}

    # Signed polarity: negative for NEGATIVE results. It only depends on the
    # tweet, so it is computed once here rather than once per entity.
    if response['result'] == 'NEGATIVE':
        adj_polarity = response['polarity'] * -1
    else:
        adj_polarity = response['polarity']

    # Add every hashtag of the raw tweet as an extra entity of type 'Hashtag'.
    ners.extend({'type': 'Hashtag', 'text': hashtag}
                for hashtag in re.findall(r'#\w+', row['tweet']))

    for ner in ners:
        # Entities without a 'type' key get an empty tag type.
        tag_type = ner.get('type', '')
        nerlst.append([row['tweet_dt'], row['topic'], row['id'], row['username'],
                       row['name'], row['tweet'], tag_type, ner['text'], response['result'],
                       response['polarity'], adj_polarity, row['like_count'], row['reply_count'],
                       row['retweet_count']])

clear_output()

In [None]:
# Column names, in the same order as the values appended to each nerlst row.
ner_columns = [
    'tweet_dt', 'topic', 'id', 'username', 'name', 'tweet',
    'tag_type', 'tag', 'sentiment', 'polarity', 'adj_polarity',
    'like_count', 'reply_count', 'retweet_count',
]

# One row per (tweet, entity) pair.
df_ner = pd.DataFrame(data=nerlst, columns=ner_columns)
df_ner.head()

Pour mieux cerner notre dataset avant d'utiliser OpenRefine, on va bannir les mots qu'on ne veut pas.

In [None]:
# Filter: tag texts to leave out of the analysis (matched exactly against the 'tag' column).
banned_words = [
    'anti', 'anti-vaxer', 'pharm', '#pfzer', '#johnson', 'j&j', 'vaxer', 'astral',
    'asia', 'Asia', 'latin', 'latinos',
]

# Flag the rows whose tag is banned, then keep all the others.
is_banned = df_ner['tag'].isin(banned_words)
df_ner = df_ner[~is_banned]

Calcul de la fréquence des tags

In [None]:
# What to compute for each (tag, tag_type) group.
tag_aggregations = {
    'tag': "count",
    'adj_polarity': "mean",
    'like_count': 'sum',
    'reply_count': 'sum',
    'retweet_count': 'sum',
}

# Display names for the aggregated columns.
summary_names = {
    "tag": "Frequency",
    "adj_polarity": "Avg_Polarity",
    "like_count": "Total_Likes",
    "reply_count": "Total_Replies",
    "retweet_count": "Total_Retweets",
}

# Aggregate, rename, order by descending frequency, then turn the group keys
# back into ordinary columns.
ner_groups = (
    df_ner
    .groupby(['tag', 'tag_type'])
    .agg(tag_aggregations)
    .rename(columns=summary_names)
    .sort_values(['Frequency'], ascending=False)
    .reset_index()
)
ner_groups.head()

Créer un sentiment général, soit positif, soit négatif ;
je mentionne la difficulté d'obtenir un sentiment neutre.

In [None]:
# Overall sentiment per tag: an average polarity of zero or more counts as
# POSITIVE, anything else as NEGATIVE (there is no neutral class).
polarity_is_non_negative = ner_groups['Avg_Polarity'] >= 0
ner_groups['Sentiment'] = np.where(polarity_is_non_negative, 'POSITIVE', 'NEGATIVE')
ner_groups.head()

## Visualization

On peut tracer des graphiques basés sur les tags :


*   les plus populaires
*   le plus de « j'aime »
*   le plus de commentaires
*   le plus de retweets



In [None]:
Filter_TAG = False

# Tag type to keep when Filter_TAG is switched on.
TAG = ''
Top_N = 10

# Any value other than 'Hashtag' is reduced to its first three letters, upper-cased.
# NOTE(review): a tag type longer than three letters (e.g. a four-letter code)
# would never match after this truncation — confirm against the values
# actually present in ner_groups['tag_type'].
if TAG != 'Hashtag':
    TAG = TAG[:3].upper()

if Filter_TAG:
    filtered_group = ner_groups[(ner_groups['tag_type'] == TAG)]
else:
    filtered_group = ner_groups

# One panel per metric: (subplot position, column to rank by, panel title).
# The four panels used to be four copy-pasted stanzas; the third title also
# carried a typo ('plis' instead of 'plus').
panels = [
    (321, "Frequency", 'les plus populaire'),
    (322, "Total_Likes", 'les plus liker'),
    (323, "Total_Replies", 'les plus commenter'),
    (324, "Total_Retweets", 'les plus retweeter'),
]

fig = plt.figure(figsize=(20, 16))
fig.subplots_adjust(hspace=0.2, wspace=0.5)

axes = []
for position, metric, title in panels:
    ax = fig.add_subplot(position)
    # ner_groups arrives already ordered by Frequency, so only the other
    # metrics need a re-sort before taking the top rows.
    if metric != "Frequency":
        filtered_group = filtered_group.sort_values([metric], ascending=False)
    sns.barplot(x=metric, y="tag", data=filtered_group[:Top_N], hue="Sentiment", ax=ax)
    ax.title.set_text(title)
    # The panel title already says what is plotted; drop the axis labels.
    ax.set_ylabel('')
    ax.set_xlabel('')
    axes.append(ax)

# Keep the individual axis names available to later cells.
ax1, ax2, ax3, ax4 = axes

### La polarité

In [None]:
# Density of the average polarity across the tags selected in the previous cell.
fig = plt.figure(figsize=(12, 6))
# sns.distplot is deprecated; kdeplot(..., fill=True) is the documented
# replacement for distplot(..., hist=False, kde_kws={"shade": True}).
sns.kdeplot(filtered_group['Avg_Polarity'], fill=True);


Le reste de l'analyse sera fait avec Power BI/SPSS.

Merci.
