In [1]:
################### DOCUMENTS
# http://naelshiab.com/quand-twitter-rencontre-python/
# https://marcobonzanini.com/2015/03/02/mining-twitter-data-with-python-part-1/
# https://marcobonzanini.com/2015/03/09/mining-twitter-data-with-python-part-2/
# https://marcobonzanini.com/2015/03/17/mining-twitter-data-with-python-part-3-term-frequencies/
# https://marcobonzanini.com/2015/03/23/mining-twitter-data-with-python-part-4-rugby-and-term-co-occurrences/
# https://marcobonzanini.com/2015/04/01/mining-twitter-data-with-python-part-5-data-visualisation-basics/
# https://marcobonzanini.com/2015/05/17/mining-twitter-data-with-python-part-6-sentiment-analysis-basics/
# https://marcobonzanini.com/2015/06/16/mining-twitter-data-with-python-and-js-part-7-geolocation-and-interactive-maps/

In [2]:
########################################################################################
######################### QUEL SCREEN TWITTER EST A ANALYSER ? #########################
########################################################################################
MonScreenName = 'equipedefrance'
Langue = 'french'

In [3]:
# IMPORT PANDAS
import pandas as pd
# IMPORT TWEEPY
import tweepy
# IMPORT JSON RE & STRING ...
import json
import re
import string
import operator 
from collections import Counter
# IMPORT EMOJI & EMOTICON'S
import emoji
# IMPORT NLTK
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Gestion configparser
import configparser

In [4]:
# Gestion de la configuration
config = configparser.ConfigParser()
config.read('../Configuration.conf')

['../Configuration.conf']

In [5]:
# Obtention de mes identifiants Twitter
ma_consumer_key = config.get('Twitter', 'consumer_key')
ma_consumer_secret = config.get('Twitter', 'consumer_secret')
ma_Oauth_token = config.get('Twitter', 'Oauth_token')
ma_Oauth_token_secret = config.get('Twitter', 'Oauth_token_secret')

In [6]:
# IDENTIFICATION DE MON API TWITTER
auth = tweepy.OAuthHandler(ma_consumer_key, ma_consumer_secret)
auth.set_access_token(ma_Oauth_token, ma_Oauth_token_secret)
api = tweepy.API(auth)

In [7]:
# FONCTION OBTENTION TWEETS DANS DATAFRAME
def MaRechercheTweets(ScreenName):
    #Twitter only allows access to a users most recent 3240 tweets with this method
    #initialize a list to hold all the tweepy Tweets
    alltweets = []
    #make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name = ScreenName, count = 200)
    #save most recent tweets
    alltweets.extend(new_tweets)
    #save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1
    #keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        #all subsiquent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name = ScreenName, count = 200, max_id = oldest)
        #save most recent tweets
        alltweets.extend(new_tweets)
        #update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1
        print ('... %s Tweets chargés' % (len(alltweets)))
    #transform the tweepy tweets into a 2D array
    #outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8")] for tweet in alltweets]
    outtweets = [[tweet.id_str, tweet.created_at, tweet.text] for tweet in alltweets]
    return outtweets

In [8]:
# FONCTION DE TRAITEMENT DU TEXTE DES TWEETS
def MonTraitementTweets(MonTweet, lowercase=False):
    Tokens = tokenize(MonTweet) # FONCTION(S) Tokenize
    TokensEmojis = [Token for Token in ''.join(c for c in MonTweet if c in emoji.UNICODE_EMOJI)] # Emoji's
    TokensEmojisSens = [emoji.demojize(Token) for Token in TokensEmojis] # Emoji's (Sens)
    TokensURLs = [Token for Token in Tokens if Token.startswith('http')] # URL's
    TokensHastags = [Token for Token in Tokens if Token.startswith('#')] # Hastag's
    TokensMentions = [Token for Token in Tokens if Token.startswith('@')] # Mention's
    TokensRestants = [Token for Token in Tokens if Token not in TokensEmojis
                                                and Token not in TokensURLs
                                                and Token not in TokensHastags
                                                and Token not in TokensMentions]
    TokensRestants = [Token.lower() for Token in TokensRestants if Token not in Punctuation] # Punctuation 
    TokensRestants = [Token.lower() for Token in TokensRestants if Token not in StopWords] # Stopwords                
    return pd.Series([TokensRestants, TokensEmojis, TokensEmojisSens, TokensURLs, TokensHastags, TokensMentions])

In [9]:
# FONCTION TOKENIZE
def tokenize(s):
    return tokens_re.findall(s)

In [10]:
# GESTION DES EMOTICON's POUR TOKENIZE
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)

In [11]:
# GESTION REGEX POUR TOKENIZE
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)

In [12]:
# GESTION DE LA PONCTUATION
Punctuation = list(string.punctuation) + ['“', '…', '”']

In [13]:
# GESTION DES STOPWORDS
if Langue == 'english':
    StopWords = stopwords.words(Langue) + ['rt', 'via']             # Pour l'anglais
if Langue == 'french':
    StopWords = stopwords.words(Langue) + ['rt', 'via', 'les']      # Pour le français

In [14]:
TweetsCherches = MaRechercheTweets(MonScreenName)
TweetsCherches_df = pd.DataFrame(TweetsCherches, columns = ['Tweet_Id', 'Tweet_Date', 'Tweet_Texte'])

... 400 Tweets chargés
... 600 Tweets chargés
... 800 Tweets chargés
... 1000 Tweets chargés
... 1200 Tweets chargés
... 1400 Tweets chargés
... 1600 Tweets chargés
... 1800 Tweets chargés
... 2000 Tweets chargés
... 2200 Tweets chargés
... 2400 Tweets chargés
... 2600 Tweets chargés
... 2800 Tweets chargés
... 3000 Tweets chargés
... 3200 Tweets chargés
... 3210 Tweets chargés
... 3210 Tweets chargés


In [15]:
# TRAITEMENT DU TEXTE DES TWEETS
TweetsCherches_df[['Tokens', 'Emojis', 'SensEmojis', 'URLs', 'Hastags', 'Mentions']] = TweetsCherches_df.apply(lambda x: MonTraitementTweets(x['Tweet_Texte']), axis = 1)
TweetsCherches_df

Unnamed: 0,Tweet_Id,Tweet_Date,Tweet_Texte,Tokens,Emojis,SensEmojis,URLs,Hastags,Mentions
0,1119643136215650310,2019-04-20 16:45:00,"Pour fêter les 100 ans de la @FFF, @AntoGriezm...","[fêter, 100, ans, oppos, és, thomas, lemar, q]",[],[],[https://t.co/hNEdQDhHag],[],"[@FFF, @AntoGriezmann, @nglkante, @samumtiti]"
1,1119177316549050369,2019-04-19 09:54:00,RDV demain pour le 2ème épisode du Quiz du cen...,"[rdv, demain, 2, ème, épisode, quiz, centenair...",[😄],[:grinning_face_with_smiling_eyes:],[https://t.co/L7GGCZXebS],[],"[@FFF, @AntoGriezmann, @nglkante]"
2,1117784797353578496,2019-04-15 13:40:37,Corinne Diacre communiquera le jeudi 2 mai sur...,"[corinne, diacre, communiquera, jeudi, 2, mai,...",[],[],[https://t.co/mFtZGmegAa],[],[]
3,1117491463988838401,2019-04-14 18:15:01,L’@efootdefrance est Championne du Monde !! 🏆\...,"[’, championne, monde, 🇫, 🇷]","[🏆, 💪, 👊]","[:trophy:, :flexed_biceps:, :oncoming_fist:]",[https://t.co/HdDB2OIGTR],[#FIFAeNationsCup],[@efootdefrance]
4,1117488813297762306,2019-04-14 18:04:29,L’@efootdefrance est Championne du Monde 🔥🏆\nL...,"[’, championne, monde, bleus, remportent, 🇫, 🇷]","[🔥, 🏆, 💪]","[:fire:, :trophy:, :flexed_biceps:]",[https://t.co/EtcFMpmG2S],[#FIFAeNationsCup],[@efootdefrance]
5,1117480136973090817,2019-04-14 17:30:01,RT @efootdefrance: Pour @MATUIDIBlaise le matc...,"[match, contre, l'argentine, 🇦, 🇷, a, moment, ...","[⭐, ⭐, 🎮]","[:white_medium_star:, :white_medium_star:, :vi...",[],[],"[@efootdefrance, @MATUIDIBlaise]"
6,1117479440982970370,2019-04-14 17:27:15,Suivez la Finale de l’@efootdefrance face à l’...,"[suivez, finale, ’, face, ’, argentine, direct...",[],[],[https://t.co/H2CpDZR7WD],[#FIFAeNationsCup],[@efootdefrance]
7,1117452903172714499,2019-04-14 15:41:48,Finale de Coupe du Monde face à l’Argentine po...,"[finale, coupe, monde, face, ’, argentine, ’, ...",[👊],[:oncoming_fist:],[https://t.co/RNStqIZHZN],[#FIFAeNationsCup],[@efootdefrance]
8,1117450204821766145,2019-04-14 15:31:04,Finale de Coupe du Monde pour l’@efootdefrance...,"[finale, coupe, monde, ’]",[👊],[:oncoming_fist:],[https://t.co/xPW7SJAkpa],[#FIFAeNationsCup],[@efootdefrance]
9,1116431211373780992,2019-04-11 20:01:57,Nos Bleus du futsal étaient chauds hier (victo...,"[bleus, futsal, chauds, hier, victoire, 6, 0, ...","[🔥, 🔥, 🔥]","[:fire:, :fire:, :fire:]",[https://t.co/1ZyKXpKlZw],[],[]


In [16]:
# ANALYSE DE L'ENSEMBLE DES TWEETS (MOTS UTILES)
Comptage = Counter()
TsLesTokens = []
for i in range (0, len(TweetsCherches_df)):
    TsLesTokens.extend(TweetsCherches_df['Tokens'][i])
Comptage.update(TsLesTokens)
print('********** ANALYSE DES TWEETS (MOTS UTILES)')
print(Comptage.most_common(20))

********** ANALYSE DES TWEETS (MOTS UTILES)
[('voici', 1778), ('russe', 1770), ('nom', 1769), ('🇷', 331), ('🇫', 301), ('france', 283), ('️', 282), ('’', 253), ('1', 199), ('é', 191), ('bleus', 168), ('monde', 146), ('match', 135), ('2', 126), ('0', 121), ('a', 117), ('ée', 110), ('înement', 98), ('face', 90), ('h00', 89)]


In [17]:
# ANALYSE DE L'ENSEMBLE DES MENTIONS (PERSONNES RE-TWEETEES)
Comptage = Counter()
TsLesTokens = []
for i in range (0, len(TweetsCherches_df)):
    TsLesTokens.extend(TweetsCherches_df['Mentions'][i])
Comptage.update(TsLesTokens)
print('********** ANALYSE DES MENTIONS (PERSONNES RE-TWEETEES)')
print(Comptage.most_common(20))

********** ANALYSE DES MENTIONS (PERSONNES RE-TWEETEES)
[('@AntoGriezmann', 59), ('@KMbappe', 50), ('@W9', 30), ('@samumtiti', 26), ('@equipedefrance', 26), ('@paulpogba', 24), ('@MATUIDIBlaise', 22), ('@BenPavard28', 22), ('@_OlivierGiroud_', 22), ('@kimpembe_3', 19), ('@nglkante', 18), ('@FFF', 17), ('@Dembouz', 17), ('@AreolaOfficiel', 16), ('@benmendy23', 16), ('@raphaelvarane', 14), ('@TF1', 14), ('@CorentinTolisso', 12), ('@EURO2020', 11), ('@M6', 10)]


In [18]:
# ANALYSE DE L'ENSEMBLE DES HASHTAGS (REFERENCES)
Comptage = Counter()
TsLesTokens = []
for i in range (0, len(TweetsCherches_df)):
    TsLesTokens.extend(TweetsCherches_df['Hastags'][i])
Comptage.update(TsLesTokens)
print('********** ANALYSE DES HASHTAGS (REFERENCES)')
print(Comptage.most_common(20))

********** ANALYSE DES HASHTAGS (REFERENCES)
[('#FiersdetreBleus', 508), ('#FRAURU', 67), ('#FRA', 54), ('#FRAISL', 51), ('#FRAALL', 51), ('#FRAARG', 45), ('#FiersdetreBleues', 43), ('#FRADAN', 37), ('#ALLFRA', 37), ('#FRACRO', 36), ('#FRABEL', 34), ('#PBSFRA', 32), ('#FRAPBS', 29), ('#MOLFRA', 23), ('#FRAJAP', 16), ('#Bleus', 15), ('#FRABRE', 13), ('#FRAMEX', 12), ('#FRAUSA', 10), ('#fiersdetrebleus', 9)]


In [19]:
# ANALYSE DE L'ENSEMBLE DES EMOJIS (LEUR SENS)
Comptage = Counter()
TsLesTokens = []
for i in range (0, len(TweetsCherches_df)):
    TsLesTokens.extend(TweetsCherches_df['SensEmojis'][i])
Comptage.update(TsLesTokens)
print('********** ANALYSE DES EMOJIS (LEUR SENS)')
print(Comptage.most_common(20))

********** ANALYSE DES EMOJIS (LEUR SENS)
[(':white_medium_star:', 130), (':fire:', 103), (':oncoming_fist:', 90), (':soccer_ball:', 76), (':right_arrow:', 74), (':flexed_biceps:', 55), (':raising_hands:', 51), (':VS_button:', 31), (':grinning_face_with_smiling_eyes:', 27), (':white_circle:', 27), (':blue_circle:', 25), (':television:', 24), (':party_popper:', 24), (':star-struck:', 24), (':red_circle:', 23), (':raised_fist:', 22), (':face_with_tears_of_joy:', 22), (':alarm_clock:', 20), (':clapping_hands:', 20), (':smiling_face_with_heart-eyes:', 17)]
