# Récupération des données twitter

À partir des comptes Twitter issus de Decodex, on récupère les données Twitter correspondant à ces comptes : 
    - les abonnés ; 
    - les tweets ; 
    - les retweets et likes ; 

#### Import packages 

In [1]:
import pandas as pd 
import tweepy as tw
import re

import pickle

##### Récupération clés twitter

In [2]:
# Switch to the project's data directory so the relative file names used below resolve.
%cd ../../data

C:\Users\ManonRICHARD\Documents\PFE\twitter-fakenews\data


In [3]:
# Load the Twitter API credentials from a local text file.
# The `with` block guarantees the file handle is closed once it has been read.
# NOTE(review): eval() executes whatever the file contains. That is tolerable only
# because this file is local and trusted; if the file holds a plain dict literal,
# ast.literal_eval or a JSON file would be a safer way to load it.
with open("api_twitter_keys.txt") as keys_file:
    twitter_keys = eval(keys_file.read())

In [4]:
# Show which credential names were loaded; only the keys are displayed, not the secret values.
twitter_keys.keys()

dict_keys(['consumer_key', 'consumer_secret', 'access_token', 'access_token_secret'])

In [5]:
# Build the OAuth handler from the application (consumer) credentials...
auth = tw.OAuthHandler(
    twitter_keys['consumer_key'],
    twitter_keys['consumer_secret'],
)
# ...then attach the user-level access token to it.
auth.set_access_token(
    twitter_keys['access_token'],
    twitter_keys['access_token_secret'],
)
# API client used by every request below. wait_on_rate_limit is switched on so
# that tweepy handles Twitter's rate limits itself (see the tweepy documentation).
api = tw.API(auth, wait_on_rate_limit=True)

## Import données Decodex

In [None]:
# Load the Decodex list of Twitter accounts; the file is semicolon-separated.
decodex_data = pd.read_csv("twitter_accounts_decodex.csv", sep=";")
# Preview the first two rows.
decodex_data.head(2)

In [7]:
# Reduce each Twitter URL in `path_website` to the account handle:
#   1. remove the "twitter.com/" or "twitter.fr/" part,
#   2. remove a "?lang=fr" query string.
# The patterns are raw strings and the dot is escaped so that it only matches a
# literal "." (unescaped, it would match any character).
# The vectorised .str.replace leaves missing values as NaN, whereas applying
# re.sub to each value raises a TypeError on the first non-string entry.
decodex_data['path_website'] = (
    decodex_data['path_website']
    .str.replace(r'twitter\.(com|fr)/', '', regex=True)
    .str.replace(r'\?lang=fr', '', regex=True)
)

In [None]:
# Preview the first two rows to check the cleaned `path_website` values.
decodex_data.head(2)

## Récupération des tweets 

In [9]:
def extract_all_tweets_from_user(userID, client=None):
    """Download as many tweets as possible from one account's timeline.

    A single timeline request returns at most 200 tweets, so the timeline is
    walked backwards page by page: each request asks only for tweets strictly
    older than the oldest one already received, until an empty page comes back.

    Parameters
    ----------
    userID : str
        Screen name of the account, passed as ``screen_name`` to the API.
    client : optional
        Object exposing ``user_timeline(**kwargs)``. Defaults to the
        notebook-level ``api`` client when omitted.

    Returns
    -------
    list
        The collected tweet objects, newest first. Empty when the account has
        no tweet to return (retweets are excluded from the request).

    Raises
    ------
    Whatever ``user_timeline`` raises (unknown or protected account, network
    error, ...) is propagated unchanged to the caller.
    """
    if client is None:
        client = api

    request = {
        "screen_name": userID,
        # 200 is the maximum page size allowed by the endpoint.
        "count": 200,
        "include_rts": False,
        # Extended mode is required to get `full_text`; otherwise the text is
        # truncated (to 140 characters, not words).
        "tweet_mode": "extended",
    }

    all_tweets = []
    while True:
        page = client.user_timeline(**request)
        # An empty page means the timeline is exhausted. Checking it before
        # reading page[-1] also covers accounts whose very first page is empty.
        if not page:
            break
        all_tweets.extend(page)
        # Next request: only tweets strictly older than the oldest one seen.
        request["max_id"] = page[-1].id - 1

    return all_tweets

In [None]:
## On récupère les tweets de tous les sites
%time
tweets_all_accounts = []
accounts_not_found = []
count_action = 0
for tweet_account in decodex_data['path_website'] : 
    
    # get tweets from tweet account : 
    try:
        all_tweets_account = extract_all_tweets_from_user(tweet_account)
        
    except:
        print(f"No tweet collected from {tweet_account}")
        accounts_not_found.append(tweet_account)
        pass
    
    if len(all_tweets_account) > 0 : 
        tweets_all_accounts.append(all_tweets_account)
        
    # get all followers of tweet account : 
    
        
    count_action += 1
    
    if (count_action % 10) == 0: 
        print(count_action)
        print(tweet_account)

In [12]:
# Number of accounts for which the tweet collection raised an error.
len(accounts_not_found)

47

In [13]:
# Number of per-account tweet lists that were collected.
len(tweets_all_accounts)

425

In [14]:
def create_df_tweet(all_tweets):
    """Flatten per-account lists of tweets into a single DataFrame.

    Parameters
    ----------
    all_tweets : iterable of iterables
        One inner iterable per account. Each tweet object must expose
        ``id_str``, ``created_at``, ``favorite_count``, ``retweet_count``,
        ``full_text`` and ``user.screen_name``.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns ``id``, ``created_at``,
        ``favorite_count``, ``retweet_count``, ``text`` and ``user``, in that
        order. The same columns are present when there is no tweet at all.
        The index is a plain 0..N-1 range (unique across accounts).
    """
    columns = ["id", "created_at", "favorite_count", "retweet_count", "text", "user"]

    # Gather every row first and build the frame once: concatenating inside the
    # loop copies the accumulated data on every iteration, and concatenating
    # with an empty seed frame can degrade the column dtypes to object.
    rows = []
    for account_tweets in all_tweets:
        for tweet in account_tweets:
            rows.append((
                tweet.id_str,
                tweet.created_at,
                tweet.favorite_count,
                tweet.retweet_count,
                tweet.full_text,
                tweet.user.screen_name,
            ))

    return pd.DataFrame(rows, columns=columns)

# https://www.geeksforgeeks.org/python-status-object-in-tweepy/

In [15]:
# Flatten the per-account tweet lists into a single DataFrame (one row per tweet).
df_tweets_format = create_df_tweet(tweets_all_accounts)

In [None]:
# Preview the first rows of the tweet table.
df_tweets_format.head()

In [18]:
len(df_tweets_format) # total number of tweet rows (about 1 million)

1020089

In [17]:
# Per-account count of non-null values in each column, i.e. tweets collected per `user`.
df_tweets_format.groupby('user').count()

Unnamed: 0_level_0,id,created_at,favorite_count,retweet_count,text
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1RiposteLaique,3200,3200,3200,3200,3200
20Minutes,2875,2875,2875,2875,2875
9GAG,3230,3230,3230,3230,3230
ABC,3077,3077,3077,3077,3077
AE911Truth,3174,3174,3174,3174,3174
...,...,...,...,...,...
washingtonpost,3125,3125,3125,3125,3125
wikiHow,3177,3177,3177,3177,3177
worldtvdesinfo,2986,2986,2986,2986,2986
wucnews,6392,6392,6392,6392,6392


### Export des données

In [19]:
# Serialise the tweet DataFrame with pickle. Despite its ".txt" extension the
# output file is binary, hence the "wb" mode.
with open("tweets_all_accounts.txt", "wb") as pickle_out:
    pickle.dump(df_tweets_format, pickle_out)

### Import des données

In [21]:
# Reload the pickled tweet DataFrame written by the export cell.
# NOTE: this rebinds `tweets_all_accounts`, which earlier in the notebook held
# the raw list of per-account tweet lists, to the DataFrame read from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced locally and are trusted.
with open("tweets_all_accounts.txt", "rb") as pickle_in:
    tweets_all_accounts = pickle.load(pickle_in)

In [None]:
# Preview the reloaded object (the DataFrame read back from the pickle file).
tweets_all_accounts.head()

In [23]:
# Sanity check: the row count should match the DataFrame that was exported.
len(tweets_all_accounts)

1020089