In [2]:
from twython import Twython

import pandas as pd
pd.set_option('display.max_colwidth', 100)

from bs4 import BeautifulSoup
import re

import json
import os
import glob
import time
from datetime import date,datetime

import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
import nltk  
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import textblob
from textblob import TextBlob 

import seaborn as sns
import matplotlib

In [125]:
# Getting tweets from twitter search (works with '@' and '#' prefix)
# ------------------------------------------------------------------

def connect_to_twitter(twitter_auth_path):
    """Build a Twython client from credentials stored in a JSON file.

    The file at ``twitter_auth_path`` must contain a JSON object with the keys
    ``APP_KEY``, ``APP_SECRET``, ``OAUTH_TOKEN`` and ``OAUTH_TOKEN_SECRET``.
    Returns the ``Twython`` instance created from those four values.
    """
    with open(twitter_auth_path, 'r') as auth_file:
        credentials = json.load(auth_file)
    # Order matters: it is the positional order passed to Twython
    key_names = ('APP_KEY', 'APP_SECRET', 'OAUTH_TOKEN', 'OAUTH_TOKEN_SECRET')
    return Twython(*[credentials[name] for name in key_names])

def get_tweet_data(tweet, tweet_data):
    """Copy the tweet-level fields of ``tweet`` into ``tweet_data`` and return it.

    ``tweet`` is indexed like a status dict (``full_text``, ``id``, ``entities`` ...);
    ``tweet_data`` is mutated in place. Hashtags are flattened with
    ``get_hashtags_string``.
    """
    tweet_data['text'] = tweet['full_text']
    tweet_data['hashtags'] = get_hashtags_string(tweet)
    tweet_data['tweet_id'] = tweet['id']
    # Remaining fields keep their original key names
    same_name_fields = (
        'created_at',
        'retweet_count',
        'favorite_count',
        'in_reply_to_status_id',
        'in_reply_to_screen_name',
        'lang',
    )
    for field in same_name_fields:
        tweet_data[field] = tweet[field]
    return tweet_data

def get_tweet_author_data(tweet, tweet_data):
    """Copy the author fields found under ``tweet['user']`` into ``tweet_data``.

    ``tweet_data`` is mutated in place and returned. Keys are written in a
    fixed order: ``author``, ``account_created_at``, then the ``author_*`` fields.
    """
    user = tweet['user']
    # (key written to tweet_data, key read from the user dict)
    field_map = (
        ('author', 'screen_name'),
        ('account_created_at', 'created_at'),
        ('author_description', 'description'),
        ('author_id', 'id'),
        ('author_location', 'location'),
        ('author_statuses_count', 'statuses_count'),
        ('author_followers_count', 'followers_count'),
        ('author_friends_count', 'friends_count'),
        ('author_favourites_count', 'favourites_count'),
        ('author_listed_count', 'listed_count'),
    )
    for target_key, user_key in field_map:
        tweet_data[target_key] = user[user_key]
    return tweet_data

def scraper_metadata(tweet_data, scrap_time, search_phrase):
    """Attach scraping metadata to ``tweet_data`` (mutated in place) and return it.

    ``file_tag`` is the first 19 characters of ``scrap_time``.
    """
    tweet_data['scrap_time'] = scrap_time
    file_tag = scrap_time[:19]
    tweet_data['file_tag'] = file_tag
    tweet_data['scrap_phrase'] = search_phrase
    return tweet_data

def get_hashtags_string(tweet):
    """Flatten ``tweet['entities']['hashtags']`` into one string.

    Every hashtag text is followed by a single space (so the result has a
    trailing space); when there are no hashtags the marker ``'NO_HASHTAGS'``
    is returned instead.
    """
    pieces = [entry['text'] + ' ' for entry in tweet['entities']['hashtags']]
    return ''.join(pieces) or 'NO_HASHTAGS'

def get_tweets_from_search(twitter, max_attempts, max_tweets_to_get, search_phrase):
    """Page through the search results for ``search_phrase`` and collect the raw tweets.

    Parameters
    ----------
    twitter : object exposing a Twython-style ``search(**params)`` method that
        returns a dict with ``'statuses'`` and ``'search_metadata'``.
    max_attempts : int
        Maximum number of search requests (pages) to issue.
    max_tweets_to_get : int
        Stop requesting new pages once at least this many tweets are collected.
        The last page is kept whole, so the result may exceed this number.
    search_phrase : str
        Query passed as ``q``.

    Returns
    -------
    list
        The items of every page's ``'statuses'`` list, in request order.
    """
    tweets_data = []
    next_max_id = None
    for attempt_nr in range(max_attempts):
        # Stop as soon as the target is reached (not only once it is exceeded)
        if len(tweets_data) >= max_tweets_to_get:
            break

        query = dict(q=search_phrase, result_type='mixed', include_entities='true',
                     count='100', lang='en', tweet_mode='extended')
        if next_max_id is not None:
            # Continue from where the previous page ended
            query['max_id'] = next_max_id
        search_results = twitter.search(**query)

        tweets_data.extend(search_results['statuses'])
        print('Loop: {} finished. Tweets gathered sum: {}.'.format(attempt_nr+1, len(tweets_data)))

        # The last page carries no 'next_results' entry: that is the normal end of paging
        next_results = search_results.get('search_metadata', {}).get('next_results')
        if not next_results or 'max_id=' not in next_results:
            break
        next_max_id = next_results.split('max_id=')[1].split('&')[0]
    return tweets_data

def return_as_df(all_tweets_list):
    """Turn the list of extracted tweet records into a DataFrame.

    When the frame has at least one row, the first five ``text`` values and
    the frame shape are shown with ``display``. The DataFrame is returned
    in every case.
    """
    frame = pd.DataFrame(all_tweets_list)
    row_count = frame.shape[0]
    if row_count > 0:
        preview = frame.text.head(5)
        display(preview, frame.shape)
    return frame

def save_tweets_as_CSV(save_path, tweet_df):
    """Write ``tweet_df`` to ``save_path`` as tab-separated UTF-8 text (no index column).

    Prints how many rows were saved and where. Returns ``None``.
    """
    row_total = len(tweet_df)
    csv_options = dict(sep='\t', encoding='utf-8', index=False)
    tweet_df.to_csv(save_path, **csv_options)
    message = '{} tweets saved to {}'.format(row_total, save_path)
    print(message)

def get_tweets_by_search_phrase(s, max_tweets_to_get, max_attempts, twitter_auth_path, SAVE_DIR=''):
    """Scrape tweets for one search phrase, save them to CSV and return them.

    Parameters
    ----------
    s : str
        The search phrase (e.g. ``'#AI human'``).
    max_tweets_to_get : int
        Tweet target passed to ``get_tweets_from_search``.
    max_attempts : int
        Maximum number of search requests.
    twitter_auth_path : str
        Path of the JSON credentials file read by ``connect_to_twitter``.
    SAVE_DIR : str
        Prefix prepended verbatim to the CSV file name (include the trailing
        separator, e.g. ``'tweets_data/'``).

    Returns
    -------
    pandas.DataFrame
        One row per tweet, with tweet, author and scraper-metadata columns.
    """
    # Bind the argument explicitly: the body used to read a *global* named
    # `search_phrase`, which only worked when the calling loop happened to
    # define a variable of that name.
    search_phrase = s

    scrap_time = datetime.now().isoformat()
    twitter = connect_to_twitter(twitter_auth_path)
    tweets_data = get_tweets_from_search(twitter, max_attempts, max_tweets_to_get, search_phrase)

    extracted_tweets_data = []
    for tweet in tweets_data:
        tweet_data = {}
        tweet_data = get_tweet_data(tweet, tweet_data)
        tweet_data = get_tweet_author_data(tweet, tweet_data)
        tweet_data = scraper_metadata(tweet_data, scrap_time, search_phrase)
        extracted_tweets_data.append(tweet_data)

    tweet_df = return_as_df(extracted_tweets_data)
    # File name: phrase without spaces + digits of the timestamp (YYYYMMDDhhmmss)
    time_tag = ''.join([l for l in scrap_time[:19] if l.isdigit()])
    save_path = SAVE_DIR + ''.join(search_phrase.split(' ')) +'_' + time_tag + '_tweets.csv'
    save_tweets_as_CSV(save_path, tweet_df)

    return tweet_df

# Run the scraper for every combination of
# ----------------------------------------
# AI phrases : ['Automation', 'ArtificialIntelligence', 'AI'], with and without a hashtag prefix
# plus
# a second phrase (e.g. 'psychology', 'sociology', 'human', 'marketing'), with and without a hashtag prefix

def get_phrases(phrase1, phrase2):
    """Return the four ``'<phrase1> <phrase2>'`` queries with/without a ``#`` prefix.

    Order: ``#p1 #p2``, ``#p1 p2``, ``p1 #p2``, ``p1 p2``.
    """
    prefixes = ('#', '')
    first_variants = [prefix + phrase1 for prefix in prefixes]
    second_variants = [prefix + phrase2 for prefix in prefixes]
    return [first + ' ' + second
            for first in first_variants
            for second in second_variants]

def get_all_phrases(phrases, phrase2):
    """Build the search queries for every phrase in ``phrases`` combined with ``phrase2``.

    The result is grouped by hash-prefix variant rather than by phrase: first
    the ``#p #phrase2`` form of every phrase, then ``#p phrase2``, and so on
    (the order produced by ``get_phrases``).
    """
    per_phrase = [get_phrases(first, phrase2) for first in phrases]
    # zip(*...) regroups the per-phrase lists by variant position
    return [query for same_variant in zip(*per_phrase) for query in same_variant]

In [11]:
# Example setting
# ---------------
# Search on '<phrase> <phrase2>' (e.g. 'AI psychology'), both with and without the hash prefix

# Search phrases
phrases = ['Automation', 'ArtificialIntelligence', 'AI']
phrase2 = 'psychology' # alternatives used in other runs: human, sociology, marketing

# Data was gathered in a number of such runs, with 'sociology', 'human' and 'marketing' in place of 'psychology'.


# Save data to folder
SAVE_DIR = 'tweets_data/'

# Scraper auth & limits
max_tweets_to_get = 50000
max_attempts = 50
twitter_auth_path = 'twitter_auth.json'

# -----------------------------------------------------

# Generate phrases
all_search_phrases = get_all_phrases(phrases, phrase2)

# Show example search phrases:
show = 10
# NOTE(review): the *4 presumably accounts for the four phrase2 runs (psychology, sociology,
# human, marketing); all_search_phrases itself only holds the phrases of the current run,
# so the printed total and the "+ N more" line do not describe this list - confirm intent.
num_files = len(all_search_phrases)*4
print('\nThere are {} phrases. Examples:\n-------------------------------'.format(num_files))
for phrase in all_search_phrases[:show]:
    print("- ", phrase)
print('\n+ {} more\n\n'.format ((num_files - show)))


There are 48 phrases. Examples:
-------------------------------
-  #Automation #psychology
-  #ArtificialIntelligence #psychology
-  #AI #psychology
-  #Automation psychology
-  #ArtificialIntelligence psychology
-  #AI psychology
-  Automation #psychology
-  ArtificialIntelligence #psychology
-  AI #psychology
-  Automation psychology

+ 38 more




In [36]:
# Run scraper: one call per search phrase; every call returns the DataFrame of that phrase
all_searches_data = []
for i, search_phrase in enumerate(all_search_phrases):
    print('\n', i, search_phrase)
    search_phrase_tweets_df = get_tweets_by_search_phrase(search_phrase, max_tweets_to_get, max_attempts, 
                                                          twitter_auth_path, SAVE_DIR)
    all_searches_data.append(search_phrase_tweets_df)

# Summary: number of collected frames and total number of rows across them
num_of_frames = len(all_searches_data)
num_of_rows = sum([df.shape[0] for df in all_searches_data])
print('{} dataframes collected with total {} of samples.'.format(num_of_frames, num_of_rows))

Search phrases: ['#Automation #human', '#ArtificialIntelligence #human', '#AI #human', '#Automation human', '#ArtificialIntelligence human', '#AI human', 'Automation #human', 'ArtificialIntelligence #human', 'AI #human', 'Automation human', 'ArtificialIntelligence human', 'AI human']

 0 #Automation #human
Loop: 1 finished. Tweets gathered sum: 100.
Loop: 2 finished. Tweets gathered sum: 200.
Loop: 3 finished. Tweets gathered sum: 300.
Loop: 4 finished. Tweets gathered sum: 400.
Loop: 5 finished. Tweets gathered sum: 500.
Loop: 6 finished. Tweets gathered sum: 505.


0    RT @BobSudothis: 75% want more #human vs #mach...
1    RT @BobSudothis: 75% want more #human vs #mach...
2    RT @BobSudothis: 75% want more #human vs #mach...
3    RT @BobSudothis: 75% want more #human vs #mach...
4    75% want more #human vs #machine interactions ...
Name: text, dtype: object

(505, 22)

505 tweets saved to tweets_data/#Automation #human_20190119202843_tweets.csv

 1 #ArtificialIntelligence #human
Loop: 1 finished. Tweets gathered sum: 99.


0    Cognitive Computing: \n\nMore #Human Than #Art...
1    RT @SpirosMargaris: Cognitive Computing: \n\nM...
2    RT @SpirosMargaris: Cognitive Computing: \n\nM...
3    RT @MikeQuindazzi: 1000's of simulated neurons...
4    RT @MikeQuindazzi: 1000's of simulated neurons...
Name: text, dtype: object

(99, 22)

99 tweets saved to tweets_data/#ArtificialIntelligence #human_20190119202847_tweets.csv

 2 #AI #human
Loop: 1 finished. Tweets gathered sum: 100.
Loop: 2 finished. Tweets gathered sum: 200.
Loop: 3 finished. Tweets gathered sum: 300.
Loop: 4 finished. Tweets gathered sum: 400.
Loop: 5 finished. Tweets gathered sum: 500.
Loop: 6 finished. Tweets gathered sum: 600.
Loop: 7 finished. Tweets gathered sum: 700.
Loop: 8 finished. Tweets gathered sum: 800.
Loop: 9 finished. Tweets gathered sum: 900.
Loop: 10 finished. Tweets gathered sum: 987.


0    Cognitive Computing: \n\nMore #Human Than #Art...
1    RT @BobSudothis: 75% want more #human vs #mach...
2    #Researchgate #Stats #UNINA #Naples #Universit...
3    RT @BobSudothis: 75% want more #human vs #mach...
4    RT @vanguardsw: RT @chboursin "#AI will achiev...
Name: text, dtype: object

(987, 22)

987 tweets saved to tweets_data/#AI #human_20190119202848_tweets.csv

 3 #Automation human
Loop: 1 finished. Tweets gathered sum: 100.
Loop: 2 finished. Tweets gathered sum: 200.
Loop: 3 finished. Tweets gathered sum: 300.
Loop: 4 finished. Tweets gathered sum: 400.
Loop: 5 finished. Tweets gathered sum: 500.
Loop: 6 finished. Tweets gathered sum: 600.
Loop: 7 finished. Tweets gathered sum: 700.
Loop: 8 finished. Tweets gathered sum: 800.
Loop: 9 finished. Tweets gathered sum: 900.
Loop: 10 finished. Tweets gathered sum: 1000.
Loop: 11 finished. Tweets gathered sum: 1100.
Loop: 12 finished. Tweets gathered sum: 1122.


0    How #AI Will Augment the Human Workforce: http...
1    RT @BobSudothis: 75% want more #human vs #mach...
2    RT @KirkDBorne: How #AI Will Augment the Human...
3    RT @ValinCorp: Check out Omron's TM Series Col...
4    RT @ipfconline1: How Artificial Intelligence I...
Name: text, dtype: object

(1122, 22)

1122 tweets saved to tweets_data/#Automation human_20190119202855_tweets.csv

 4 #ArtificialIntelligence human
Loop: 1 finished. Tweets gathered sum: 100.
Loop: 2 finished. Tweets gathered sum: 200.
Loop: 3 finished. Tweets gathered sum: 300.
Loop: 4 finished. Tweets gathered sum: 400.
Loop: 5 finished. Tweets gathered sum: 500.
Loop: 6 finished. Tweets gathered sum: 600.
Loop: 7 finished. Tweets gathered sum: 700.
Loop: 8 finished. Tweets gathered sum: 748.


0    Cognitive Computing: \n\nMore #Human Than #Art...
1    #ArtificialIntelligence has the great potentia...
2    Is #AI about Taking Human Jobs or Creating The...
3    RT @wil_bielert: RT @mvollmer1: When will the ...
4    RT @FrRonconi: #Cognitive Computing: More Huma...
Name: text, dtype: object

(748, 22)

748 tweets saved to tweets_data/#ArtificialIntelligence human_20190119202903_tweets.csv

 5 #AI human
Loop: 1 finished. Tweets gathered sum: 100.
Loop: 2 finished. Tweets gathered sum: 200.
Loop: 3 finished. Tweets gathered sum: 300.
Loop: 4 finished. Tweets gathered sum: 400.
Loop: 5 finished. Tweets gathered sum: 500.
Loop: 6 finished. Tweets gathered sum: 600.
Loop: 7 finished. Tweets gathered sum: 700.
Loop: 8 finished. Tweets gathered sum: 800.
Loop: 9 finished. Tweets gathered sum: 900.
Loop: 10 finished. Tweets gathered sum: 1000.
Loop: 11 finished. Tweets gathered sum: 1100.
Loop: 12 finished. Tweets gathered sum: 1200.
Loop: 13 finished. Tweets gathered sum: 1300.
Loop: 14 finished. Tweets gathered sum: 1400.
Loop: 15 finished. Tweets gathered sum: 1493.


0    .@Nature article by @EricTopol on high-perform...
1    How #AI Will Augment the Human Workforce: http...
2    Cognitive Computing: \n\nMore #Human Than #Art...
3    RT @wil_bielert: RT @mvollmer1: When will the ...
4    RT @BobSudothis: 75% want more #human vs #mach...
Name: text, dtype: object

(1493, 22)

1493 tweets saved to tweets_data/#AI human_20190119202908_tweets.csv

 6 Automation #human
Loop: 1 finished. Tweets gathered sum: 100.
Loop: 2 finished. Tweets gathered sum: 200.
Loop: 3 finished. Tweets gathered sum: 300.
Loop: 4 finished. Tweets gathered sum: 400.
Loop: 5 finished. Tweets gathered sum: 500.
Loop: 6 finished. Tweets gathered sum: 513.


0    RT @BobSudothis: 75% want more #human vs #mach...
1    RT @BobSudothis: 75% want more #human vs #mach...
2    RT @BobSudothis: 75% want more #human vs #mach...
3    RT @BobSudothis: 75% want more #human vs #mach...
4    75% want more #human vs #machine interactions ...
Name: text, dtype: object

(513, 22)

513 tweets saved to tweets_data/Automation #human_20190119202917_tweets.csv

 7 ArtificialIntelligence #human
Loop: 1 finished. Tweets gathered sum: 99.


0    Cognitive Computing: \n\nMore #Human Than #Art...
1    RT @SpirosMargaris: Cognitive Computing: \n\nM...
2    RT @SpirosMargaris: Cognitive Computing: \n\nM...
3    RT @MikeQuindazzi: 1000's of simulated neurons...
4    RT @MikeQuindazzi: 1000's of simulated neurons...
Name: text, dtype: object

(99, 22)

99 tweets saved to tweets_data/ArtificialIntelligence #human_20190119202921_tweets.csv

 8 AI #human
Loop: 1 finished. Tweets gathered sum: 100.
Loop: 2 finished. Tweets gathered sum: 200.
Loop: 3 finished. Tweets gathered sum: 300.
Loop: 4 finished. Tweets gathered sum: 400.
Loop: 5 finished. Tweets gathered sum: 500.
Loop: 6 finished. Tweets gathered sum: 600.
Loop: 7 finished. Tweets gathered sum: 700.
Loop: 8 finished. Tweets gathered sum: 800.
Loop: 9 finished. Tweets gathered sum: 900.
Loop: 10 finished. Tweets gathered sum: 1000.
Loop: 11 finished. Tweets gathered sum: 1021.


0    Cognitive Computing: \n\nMore #Human Than #Art...
1    RT @BobSudothis: 75% want more #human vs #mach...
2    #Researchgate #Stats #UNINA #Naples #Universit...
3    RT @BobSudothis: 75% want more #human vs #mach...
4    RT @vanguardsw: RT @chboursin "#AI will achiev...
Name: text, dtype: object

(1021, 22)

1021 tweets saved to tweets_data/AI #human_20190119202922_tweets.csv

 9 Automation human
Loop: 1 finished. Tweets gathered sum: 100.
Loop: 2 finished. Tweets gathered sum: 200.
Loop: 3 finished. Tweets gathered sum: 200.


0    How #AI Will Augment the Human Workforce: http...
1    From #AI to drones and #3Dprinting, emerging t...
2    RT @BobSudothis: 75% want more #human vs #mach...
3    RT @KirkDBorne: How #AI Will Augment the Human...
4    RT @AiConstellation: In a world of increasing ...
Name: text, dtype: object

(200, 22)

200 tweets saved to tweets_data/Automation human_20190119202932_tweets.csv

 10 ArtificialIntelligence human
Loop: 1 finished. Tweets gathered sum: 100.
Loop: 2 finished. Tweets gathered sum: 200.
Loop: 3 finished. Tweets gathered sum: 300.
Loop: 4 finished. Tweets gathered sum: 400.
Loop: 5 finished. Tweets gathered sum: 500.
Loop: 6 finished. Tweets gathered sum: 600.
Loop: 7 finished. Tweets gathered sum: 700.
Loop: 8 finished. Tweets gathered sum: 749.


0    Cognitive Computing: \n\nMore #Human Than #Art...
1    Is #AI about Taking Human Jobs or Creating The...
2    #ArtificialIntelligence has the great potentia...
3    RT @mvollmer1: When will the #AI wave happen? ...
4    RT @wil_bielert: RT @mvollmer1: When will the ...
Name: text, dtype: object

(749, 22)

749 tweets saved to tweets_data/ArtificialIntelligence human_20190119202934_tweets.csv

 11 AI human
Loop: 1 finished. Tweets gathered sum: 100.
Loop: 2 finished. Tweets gathered sum: 200.
Loop: 3 finished. Tweets gathered sum: 300.
Loop: 4 finished. Tweets gathered sum: 400.
Loop: 5 finished. Tweets gathered sum: 500.
Loop: 6 finished. Tweets gathered sum: 600.
Loop: 7 finished. Tweets gathered sum: 700.
Loop: 8 finished. Tweets gathered sum: 800.
Loop: 9 finished. Tweets gathered sum: 900.
Loop: 10 finished. Tweets gathered sum: 1000.
Loop: 11 finished. Tweets gathered sum: 1100.
Loop: 12 finished. Tweets gathered sum: 1200.
Loop: 13 finished. Tweets gathered sum: 1300.
Loop: 14 finished. Tweets gathered sum: 1400.
Loop: 15 finished. Tweets gathered sum: 1500.
Loop: 16 finished. Tweets gathered sum: 1600.
Loop: 17 finished. Tweets gathered sum: 1700.
Loop: 18 finished. Tweets gathered sum: 1800.
Loop: 19 finished. Tweets gathered sum: 1900.
Loop: 20 finished. Tweets gathered sum: 200

0    There’s no way that a malevolent AI could ever...
1    How #AI Will Augment the Human Workforce: http...
2    Pair of ancient skeletons thought to represent...
3    RT @MiaD: Super excited to kick off our first ...
4    RT @mvollmer1: When will the #AI wave happen? ...
Name: text, dtype: object

(4099, 22)

4099 tweets saved to tweets_data/AI human_20190119202939_tweets.csv
12 dataframes collected with total 11635 of samples.


In [7]:
# Prepare dataset
# ---------------

# 1. Make one 24.4k samples DataFrame out of ~50 CSV's

def get_dsets_together(folder_path,sep='\t',encoding='utf-8'):
    """Load every ``*.csv`` file in ``folder_path`` and stack them into one DataFrame.

    Parameters
    ----------
    folder_path : str
        Directory that is searched (non-recursively) for ``*.csv`` files.
    sep, encoding : str
        Passed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Concatenation (fresh 0..n-1 index) of all non-empty files. An empty
        DataFrame is returned when no file could be loaded.

    Files that cannot be read are reported on stdout and skipped (best effort).
    """
    # Sorted so the row order - and any order-dependent step downstream - is reproducible
    all_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))

    dfs = []
    for file in all_files:
        try:
            df = pd.read_csv(file, sep=sep, encoding=encoding)
        except Exception as error:
            # Keep going, but say why the file was skipped
            print('--not-loaded-->', file, '({}: {})'.format(type(error).__name__, error))
            continue
        if df.shape[0] > 0:
            dfs.append(df)

    # pd.concat raises ValueError on an empty list
    if not dfs:
        return pd.DataFrame()
    return pd.concat(dfs, ignore_index=True)

# NOTE(review): absolute, machine-specific path; the raw string also keeps the doubled
# backslashes literally - consider a path relative to the notebook.
folder_path = r'C:\\Users\\p\\Projects\\100analysis\\tweets_data' 
df =get_dsets_together(folder_path)
# reset_index without drop=True adds the old index as an 'index' column (it is saved to the CSV below)
df.reset_index(inplace=True)

print('All CSVs size:', df.shape)
print('\nAll CSVs cols:', df.columns)

# 1. Check tweets language, remove non english.
# NOTE(review): non-'en' values seen in `lang` are presumably rows whose columns were shifted
# while reading the CSVs; this filter drops them as well - confirm.
print('\nCheck tweet language distribution:\n{}'.format(df.lang.value_counts().head(10)))
df = df[(df.lang == 'en')]

# 2. Check, remove duplicates and reset index.
print('\n\nCheck for duplicates\n- Nr of entries {}\n- Uniq  entries {}'.format(df.shape[0], 
                                                                                df.text.unique().shape[0]))
# Rows with identical text are collapsed; the first occurrence is kept
df = df.drop_duplicates(subset='text', keep='first')
df.reset_index(inplace=True, drop=True)

# 3. Check distribution of tweets over categories
print('\n\nDistribution of unique, english tweets over category tag [phrase, hash]:\n{}'.format(df.scrap_phrase.value_counts()))

# 4. Save & reload, backup
save_tweets_as_CSV('AI_Psycho_tweets.csv', df)
df = pd.read_csv('AI_Psycho_tweets.csv', sep='\t', encoding='utf-8')
backup = df.copy(deep=True)

# 5. Check shape and look
display('Combined_dataset of shape {}:'.format(df.shape), df.loc[:,['text','hashtags']].head(4))

All CSVs size: (24416, 23)

All CSVs cols: Index(['index', 'account_created_at', 'author', 'author_description',
       'author_favourites_count', 'author_followers_count',
       'author_friends_count', 'author_id', 'author_listed_count',
       'author_location', 'author_statuses_count', 'created_at',
       'favorite_count', 'file_tag', 'hashtags', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'lang', 'retweet_count', 'scrap_phrase',
       'scrap_time', 'text', 'tweet_id'],
      dtype='object')

Check tweet language distribution:
en                24394
AI human              2
#AI human             2
#AI psychology        1
AI psychology         1
Name: lang, dtype: int64


Check for duplicates
- Nr of entries 24394
- Uniq  entries 6817


Distribution of unique, english tweets over category tag [phrase, hash]:
AI human                                 1516
#AI marketing                            1138
#AI #marketing                            749
#AI human             

'Combined_dataset of shape (6817, 23):'

Unnamed: 0,text,hashtags
0,Cognitive Computing: \r\r\n\r\r\nMore #Human Than #ArtificialIntelligence \r\r\n\r\r\nhttps://t....,Human ArtificialIntelligence fintech insurtech AI MachineLearning DeepLearning robotics
1,RT @BobSudothis: 75% want more #human vs #machine interactions to improve #CX in the #futureofwo...,human machine CX futureofwork PwC AI
2,#Researchgate #Stats #UNINA #Naples #University #Researchers #VanGELOAssoluto #Federico2 #book o...,Researchgate Stats UNINA Naples University Researchers VanGELOAssoluto Federico2 book GOD Human ...
3,"RT @vanguardsw: RT @chboursin ""#AI will achieve #human-like #skills &gt; 2026: write high-school...",AI human skills


In [20]:
# Explore data quality: types, nans, strange values
# -------------------------------------------------

# Check df for Nans and suspicious dtypes
def get_suspicious_columns_data(df):
    """List the columns of ``df`` that hold NaNs and/or values of more than one Python type.

    Returns a list of ``[column, types, nan_count]`` rows, one per flagged
    column. ``types`` is ``str()`` of the set of value types when the column
    mixes types and ``''`` otherwise; ``nan_count`` is the number of missing
    values (``0`` when there are none).
    """
    flagged = []
    for name in df.columns:
        series = df[name]
        missing = series.isna().sum()
        kinds = {type(value) for value in series.tolist()}

        mixed_types = len(kinds) > 1
        has_missing = missing > 0
        if not (mixed_types or has_missing):
            continue

        type_label = str(kinds) if mixed_types else ''
        nan_count = missing if has_missing else 0
        flagged.append([name, type_label, nan_count])
    return flagged
    

# Explore: overall shape and column names
print(df.shape)
print(df.columns)

# Columns to inspect more closely, with their pandas dtypes
see = ['text', 'retweet_count', 'favorite_count']
data_types = pd.DataFrame([[col, df[col].dtype] for col in see],columns=['Column name','dtype'])

display(data_types)
display(df.loc[:, see].head(5))
# Columns that contain NaNs and/or mixed Python types
display(pd.DataFrame(get_suspicious_columns_data(df), columns=['column','dtypes','Nan']))
# These look OK.

# Check values at retweet_count, favorite_count, scrap_phrase etc.
#df.scrap_phrase.value_counts().tolist()

(6817, 24)
Index(['index', 'account_created_at', 'author', 'author_description',
       'author_favourites_count', 'author_followers_count',
       'author_friends_count', 'author_id', 'author_listed_count',
       'author_location', 'author_statuses_count', 'created_at',
       'favorite_count', 'file_tag', 'hashtags', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'lang', 'retweet_count', 'scrap_phrase',
       'scrap_time', 'text', 'tweet_id', 'text_raw'],
      dtype='object')


Unnamed: 0,Column name,dtype
0,text,object
1,retweet_count,float64
2,favorite_count,float64


Unnamed: 0,text,retweet_count,favorite_count
0,Cognitive Computing: \r\r\n\r\r\nMore #Human Than #ArtificialIntelligence \r\r\n\r\r\nhttps://t....,26.0,32.0
1,RT @BobSudothis: 75% want more #human vs #machine interactions to improve #CX in the #futureofwo...,4.0,0.0
2,#Researchgate #Stats #UNINA #Naples #University #Researchers #VanGELOAssoluto #Federico2 #book o...,0.0,0.0
3,"RT @vanguardsw: RT @chboursin ""#AI will achieve #human-like #skills &gt; 2026: write high-school...",7.0,0.0
4,75% want more #human vs #machine interactions to improve #CX in the #futureofwork &gt;&gt;&gt; #...,4.0,2.0


Unnamed: 0,column,dtypes,Nan
0,author_description,"{<class 'float'>, <class 'str'>}",422
1,author_location,"{<class 'float'>, <class 'str'>}",1148
2,in_reply_to_screen_name,"{<class 'str'>, <class 'float'>}",6483
3,in_reply_to_status_id,,6517
4,tweet_id,,3


In [149]:
# RETRIEVE:
# ---------

df = pd.read_csv('AI_Psycho_tweets.csv', sep='\t', encoding='utf-8')
del df['index'] # drop the 'index' column that was stored in the CSV
backup = df.copy(deep=True)
df.text.head(10)

0    Cognitive Computing: \r\r\n\r\r\nMore #Human Than #ArtificialIntelligence \r\r\n\r\r\nhttps://t....
1    RT @BobSudothis: 75% want more #human vs #machine interactions to improve #CX in the #futureofwo...
2    #Researchgate #Stats #UNINA #Naples #University #Researchers #VanGELOAssoluto #Federico2 #book o...
3    RT @vanguardsw: RT @chboursin "#AI will achieve #human-like #skills &gt; 2026: write high-school...
4    75% want more #human vs #machine interactions to improve #CX in the #futureofwork &gt;&gt;&gt; #...
5    RT @SpirosMargaris: Cognitive Computing: \r\r\n\r\r\nMore #Human Than #ArtificialIntelligence \r...
6    RT @Windy07041: This #interview really is #groundbreaking. I can't wrap my #head all the way aro...
7    RT @TPerplexa: More about the #Qanon operation\r\r\nLearn about #AI, #SOCI, basic #Human psychol...
8    75% want more #human vs #machine interactions to improve #CX in the #futureofwork \r\r\n\r\r\n#A...
9    RT @AngelHealthTech: Computing power + #AI #Algori

In [151]:
# Prepare text for exploration
# ----------------------------

# 1. URL and text without
def get_url(text):
    """Return the first http(s) URL found in ``text``, or ``''`` when there is none.

    Non-string input (for example a NaN coming from a missing cell) also
    yields ``''``.
    """
    if not isinstance(text, str):
        return ''
    # Raw string: "\S" in a normal string literal is an invalid escape sequence
    match = re.search(r"(?P<url>https?://\S+)", text)
    return match.group("url") if match else ''
def remove_url(text):
    """Return ``text`` with every http(s) URL removed.

    A URL runs up to the next whitespace character, the same definition that
    ``get_url`` uses, so query strings, dashes, underscores etc. are removed
    together with the rest of the link instead of being left behind.
    """
    return re.sub(r'https?://\S+', '', text)
def get_and_remove_url_from_text(text):
    """Return the pair ``(get_url(text), remove_url(text))``."""
    found_url = get_url(text)
    text_without_urls = remove_url(text)
    return found_url, text_without_urls

# 2. RT sign and text without it
def get_and_remove_rt_from_text(text):
    """Detect and strip a leading retweet marker.

    Returns ``(1, text_without_marker)`` when ``text`` starts with the token
    ``RT`` (``'RT '`` followed by more text, or exactly ``'RT'``), otherwise
    ``(0, text)``. Words that merely begin with the letters "RT" (e.g.
    ``'RTX ...'``) are not treated as retweets.
    """
    # text[:3] is 'RT ' for "RT <rest>" and 'RT' only when the whole text is "RT"
    if text[:3] in ('RT ', 'RT'):
        return 1, text[3:]
    return 0, text

# 3. Hashtags into words, mentions into 'PERSON'
def get_mentions(text):
    """Return the ``@mentions`` of ``text`` (without the ``@``) joined by single spaces.

    Tokens are split on any whitespace, so a mention that follows a newline
    or tab is found as well; a lone ``@`` is ignored.
    """
    return ' '.join(w[1:] for w in text.split() if w.startswith('@') and len(w) > 1)
def get_hashtags(text):
    """Return the ``#hashtags`` of ``text`` (without the ``#``) joined by single spaces.

    Tokens are split on any whitespace, so a hashtag that follows a newline
    or tab is found as well; a lone ``#`` is ignored.
    """
    return ' '.join(w[1:] for w in text.split() if w.startswith('#') and len(w) > 1)

# 3b. Hash, mention strategies
def hashtags_and_mentions_to_text(text):
    """Drop the leading ``@``/``#`` of every token and re-join the tokens.

    Each token is preceded by one space, so a non-empty result starts with a
    space; empty input gives ``''``.
    """
    pieces = []
    for token in text.split():
        bare = token[1:] if token.startswith(('@', '#')) else token
        pieces.append(' ' + bare)
    return ''.join(pieces)
def get_raw_holders(text):
    """Replace every token by its kind: ``PERSON`` (@...), ``HASH`` (#...) or ``WORD``.

    The labels are joined by single spaces; empty input gives ``''``.
    """
    kind_by_prefix = {'@': 'PERSON', '#': 'HASH'}
    # Tokens produced by split() are never empty, so token[0] is safe
    return ' '.join(kind_by_prefix.get(token[0], 'WORD') for token in text.split())
def get_placeholders(text):
    """Replace mentions by ``PERSON`` and hashtags by ``HASH``; keep other tokens as they are.

    Tokens are joined by single spaces; empty input gives ``''``.
    """
    placeholder_by_prefix = {'@': 'PERSON', '#': 'HASH'}
    # Tokens produced by split() are never empty, so token[0] is safe
    return ' '.join(placeholder_by_prefix.get(token[0], token) for token in text.split())
def get_hash_indexes(text):
    """Return the token positions of hashtags, mentions and other words.

    Positions count whitespace-separated tokens from 0. The result is a tuple
    of three space-joined strings: ``(hashtag_positions, mention_positions,
    word_positions)``.
    """
    positions = {'#': [], '@': []}
    word_positions = []
    for position, token in enumerate(text.split()):
        # Anything not starting with '#' or '@' lands in the word bucket
        positions.get(token[0], word_positions).append(str(position))
    return ' '.join(positions['#']), ' '.join(positions['@']), ' '.join(word_positions)

def get_hash_indexes_DEHASHED(text):
    """Like ``get_hash_indexes`` but for text that already uses placeholders.

    A token equal to ``'HASH'`` counts as a hashtag and one equal to
    ``'PERSON'`` as a mention. Returns ``(hashtag_positions,
    mention_positions, word_positions)`` as space-joined strings.
    """
    positions = {'HASH': [], 'PERSON': []}
    word_positions = []
    for position, token in enumerate(text.split()):
        positions.get(token, word_positions).append(str(position))
    return (' '.join(positions['HASH']),
            ' '.join(positions['PERSON']),
            ' '.join(word_positions))

# TODO: split CamelCase hashtags/mentions into separate words

# 4. Date as a feature
def get_date_day(date):
    """Return the third space-separated field of ``date`` as an int.

    For a timestamp such as ``'Wed Aug 06 05:20:16 +0000 2014'`` this is the
    day of the month (``6``).
    """
    fields = date.split(' ')
    day_field = fields[2]
    return int(day_field)

# 5. Clean the text
def strip_html(text, praser='lxml'): # lxml', 'html.parser'
    """Parse ``text`` with BeautifulSoup (parser name given by ``praser``) and return its text content."""
    soup = BeautifulSoup(text, praser)
    return soup.get_text()
def BOM_replace(text):
    """Strip a leading byte-order mark and replace U+FFFD characters with ``'?'``.

    * ``bytes``/``bytearray`` are decoded as UTF-8 (``utf-8-sig`` removes the
      BOM); input that cannot be decoded is returned unchanged.
    * ``str`` gets the same clean-up directly. ``str`` has no ``decode`` in
      Python 3, so the decode-only version silently returned it untouched.
    * Any other value (e.g. ``None`` or NaN) is returned unchanged.
    """
    if isinstance(text, (bytes, bytearray)):
        try:
            return text.decode("utf-8-sig").replace(u"\ufffd", "?")
        except UnicodeDecodeError:
            return text
    if isinstance(text, str):
        if text.startswith(u"\ufeff"):
            text = text[1:]
        return text.replace(u"\ufffd", "?")
    return text
def strip_inner_spaces(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    tokens = text.split()
    return ' '.join(tokens)
def lowercase_text(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

# 6. Prepare text for analysis: lemmatize, remove stop words & special characters
def lemmatize_words(text):
    """Lemmatize every whitespace-separated token as a verb (``pos='v'``) with WordNet.

    The lemmas are returned joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in text.split():
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return ' '.join(lemmas)
def remove_special_characters(text):
    """Replace every character that is not an ASCII letter with a space."""
    non_letter = re.compile("[^a-zA-Z]")
    return non_letter.sub(" ", text)
def remove_stop_words(text):
    """Return ``text`` without the tokens that are NLTK English stop words.

    Tokens are split on whitespace and compared as they are (case-sensitive);
    the remaining tokens are joined by single spaces.
    """
    # Build the set once per call; building it inside the comprehension
    # re-read the stop-word list for every single token.
    english_stop_words = set(stopwords.words('english'))
    return ' '.join([w for w in text.split() if w not in english_stop_words])

# 7. Get all Tag words in texts at one (4-6 times faster)
def get_text_tokens(text):
    """Run the spaCy pipeline ``nlp`` once on ``text`` and collect per-token annotations.

    Whitespace tokens are skipped. Returns a list of six space-joined strings,
    in this order:

    0. ``token.pos_`` values
    1. ``token.tag_`` values
    2. ``token.dep_`` values
    3. ``token.head.text`` values
    4. ``token.idx`` values
    5. running word index ``0 .. n-1`` with one entry per kept token

    All six strings are ``''`` when ``text`` yields no non-space token.
    """
    # Token-type containers for text
    poses, tags, deps, heads, idxes = [], [], [], [], []
    for token in nlp(text):
        if token.is_space:
            continue
        deps.append(token.dep_)
        heads.append(token.head.text)
        tags.append(token.tag_)
        poses.append(token.pos_)
        idxes.append(token.idx)

    # Each list as one string (str() because token.idx values are integers)
    results = [' '.join(str(value) for value in values)
               for values in (poses, tags, deps, heads, idxes)]
    # Word index (word order): count the kept tokens. The enumerate counter used
    # before was one short, included whitespace tokens and was undefined for
    # empty text.
    results.append(' '.join(str(nr) for nr in range(len(poses))))
    return results



In [152]:
# Data cleaning and preparation pipe
# ----------------------------------

# Retrieve backup version
df = backup.copy(deep=True)

# MINI DF FOR TESTING
#df = df.loc[:10,:].copy(deep=True)

# Measure time.
# `datetime` is the class imported with `from datetime import date,datetime`,
# so the call is datetime.now(); datetime.datetime.now() raises AttributeError.
t1 = datetime.now()

# Backup for raw text
df['text_raw'] = df.text

# Initial features
df['link'], df['text'] = zip(*df.text.apply(get_and_remove_url_from_text))
df['RT'], df['text'] = zip(*df.text.apply(get_and_remove_rt_from_text))
df['day'] = df.created_at.apply(get_date_day)


# Hashtag, mentions initial features (overwrites the scraped 'hashtags' column)
df['hashtags'] = df.text.apply(get_hashtags)
df['n_hash'] = df.hashtags.apply(lambda x: len(x.split()))
df['mentions'] = df.text.apply(get_mentions)
df['n_mentions'] = df.mentions.apply(lambda x: len(x.split()))

t2 = datetime.now()
print('Initial features, hashtag, mentions done:', t2-t1, t2-t1)

# Text cleaning pipe
df['text'] = df.text.apply(strip_html)
df['text'] = df.text.apply(BOM_replace)
df['text'] = df.text.apply(strip_inner_spaces)
df['text'] = df.text.apply(lowercase_text)

t3 = datetime.now()
print('Text cleaning pipe done:', t3-t1, t3-t2)

# Hash, mention strategies for text:
df['dehashed'] = df.text.apply(hashtags_and_mentions_to_text)
df['raw_holders'] = df.text.apply(get_raw_holders)
df['placeholders'] = df.text.apply(get_placeholders)

# Raw indexing for easy access
df['raw_h_idx'], df['raw_at_idx'], df['raw_w_idx'] = zip(*df.text.apply(get_hash_indexes))

# Strategy pick (currently the cleaned text itself; swap in 'dehashed' or 'placeholders' here)
df['text'] = df.text

# Sanity check before the further preparation: words and characters per row.
# .str.len() on the split result counts the words of each row;
# len(df.text.str.split()) would store the number of rows in every cell.
df['SANITY 1 w'] = df.text.str.split().str.len()
df['SANITY 1 c'] = df.text.str.len()

# Further text preparation pipe, reindexing
df['text'] = df.text.apply(remove_stop_words)
df['text'] = df.text.apply(remove_special_characters)
df['text'] = df.text.apply(strip_inner_spaces)
df['text'] = df.text.apply(lemmatize_words)
#df['h_idx'], df['at_idx'], df['w_idx'] = zip(*df.text.apply(get_hash_indexes))

# Sanity check after the further preparation
df['SANITY 2 w'] = df.text.str.split().str.len()
df['SANITY 2 c'] = df.text.str.len()

t4 = datetime.now()
print('Hashes strategy, prep, sanity, reindex:', t4-t1, t4-t3)

# Get all tags from spacy nlp at once (4-6 times faster)
df['POSES'], df['TAGS'], df['DEPS'], df['HEADS'], df['IDXS'], df['W_IDXS'] = zip(*df.text.apply(get_text_tokens))

t5 = datetime.now()
print('Spacy tags:', t5-t1, t5-t4)

df.loc[:2,:].T

Initial features, hashtag, mentions done: 0:00:01.026058 0:00:01.026058
Text cleaning pipe done: 0:00:06.871393 0:00:05.845335
Hashes strategy, prep, sanity, reindex: 0:02:12.896474 0:02:06.025081
Spacy tags: 0:12:49.859906 0:10:36.963432


Unnamed: 0,0,1,2
account_created_at,Wed Aug 06 05:20:16 +0000 2014,Thu Mar 15 04:51:22 +0000 2018,Sun Jun 10 18:04:31 +0000 2012
author,SpirosMargaris,virginiakelly78,SalViVicidomini
author_description,@wefoxHQ @SparkLabsGlobal @GetHufsy @LodexAus @MediaStalker1 @ArbidexToken @F10_accelerator | No...,"Researcher, Compiler, Professional Troublemaker. :)","♈Sono nato, e prima o poi morrò! Il resto è rumore entropico! #FB https://t.co/FmPvrmCh0w ●●My #..."
author_favourites_count,118656,1,62
author_followers_count,68699,36,181
author_friends_count,10429,42,707
author_id,2.71121e+09,9.74146e+17,6.04715e+08
author_listed_count,5260,2,141
author_location,All Over the World,"Kansas City, MO","EARTH.pk, 2.O !!!"
author_statuses_count,135407,455,17289


In [154]:
# Persist the prepared tweet DataFrame to disk
# --------------------------------------------
# Tab-separated, UTF-8 encoded, without the index column.

df.to_csv(
    'AI_Psycho_tweets_prepared.csv',
    sep='\t',
    encoding='utf-8',
    index=False,
)

In [165]:
# Load the prepared tweets back from file
# ---------------------------------------
# Reads the tab-separated, UTF-8 encoded file and shows the frame's dimensions.

df = pd.read_csv(
    'AI_Psycho_tweets_prepared.csv',
    sep='\t',
    encoding='utf-8',
)
#backup = df.copy(deep=True) # To retrieve -> df = backup.copy(deep=True)

print(df.shape)

(6817, 45)


In [162]:
# Simple Twitter data + simple sentiment score CSV for further use
# ----------------------------------------------------------------

import textblob
from textblob import TextBlob 
    
def get_sentiment_val(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    float
        ``TextBlob(text).sentiment.polarity`` for string input. Any non-string
        input (``None``, or the NaN pandas yields for an empty CSV field) is
        treated as neutral and scored ``0.0`` instead of raising.
    """
    # Missing text carries no sentiment; do not hand a float/None to TextBlob.
    if not isinstance(text, str):
        return 0.0
    return TextBlob(text).sentiment.polarity

# Columns carried over into the lightweight export
intresting_columns = ['text', 'hashtags', 'retweet_count', 'favorite_count']

# Slim frame: the selected columns plus one sentiment score per tweet text
example_twitter_data_6k = df.loc[:, intresting_columns].assign(
    sentiment=df.text.map(get_sentiment_val)
)

# Written tab-separated, UTF-8 encoded, without the index column
example_twitter_data_6k.to_csv(
    'example_twitter_data_6k.csv',
    sep='\t',
    encoding='utf-8',
    index=False,
)

In [114]:
# Word counts
# -----------
# Prints the total and unique number of whitespace-separated words over all
# tweets, then displays the 50 most frequent words.

# Flatten every tweet into one list of words.
# dropna(): pandas reads an empty CSV field back as NaN (a float), on which
# .split() would raise; such rows simply contribute no words.
all_words = [word for tweet in df.text.dropna() for word in tweet.split()]

print('words:',len(all_words))
print('unique:',len(set(all_words)))
pd.DataFrame(all_words, columns=['w']).w.value_counts().head(50)

words: 107591
unique: 12015


ai                        5155
market                    2999
human                     2810
artificialintelligence    1322
automation                1099
bigdata                    934
iot                        848
machinelearning            765
s                          757
via                        668
digitalmarketing           641
intelligence               605
socialmedia                557
business                   536
infographic                510
use                        510
artificial                 507
rt                         506
seo                        489
tech                       483
data                       458
make                       456
analytics                  437
deeplearning               422
machine                    411
technology                 407
smm                        406
ml                         398
mikequindazzi              396
new                        382
contentmarketing           379
future                     374
startup 

In [None]:
import pandas as pd

# Example DF
df = pd.read_csv('AI_Psycho_tweets_prepared.csv', sep='\t', encoding='utf-8')

# Working frame with only the text and its POS string. Explicit copy so the
# column assignments below never write into a slice of `df`.
tf = df.loc[:, ['text','POSES']].copy()

# Number of spaces in each string, used to compare the two token counts:
# `tl` for the text, `Tl` for the POS string.
tf["tl"], tf['Tl'] = tf.text.str.count(' '), tf.POSES.str.count(' ')
# Rows where the counts disagree (a NaN on either side also compares unequal)
x = tf[tf['tl'] != tf['Tl']]

# Delete rows with non equal numbers of words to POSes (44 of 6.8k)
tf = tf.drop(x.index)
             
# NOTE(review): unfinished cell -- `get_adj` stops mid-statement (`for t`), so
# this code does not parse yet. Presumably it is meant to pick adjectives out of
# a tag string -- TODO confirm the intent and finish the loop.
def get_adj(TAGS):
    # Split the space-separated tag string into individual tags
    tags = TAGS.split()
    for t

# NOTE(review): `tf` is built from the 'text' and 'POSES' columns only, so
# `tf.TAGS` looks like it will fail -- confirm whether 'TAGS' should be selected too.
tf['ADJ'] = tf.TAGS.apply(get_adj)

tf.shape

In [None]:
# Average sentiment for all tweets and for the non-neutral / negative / positive subsets
# NOTE(review): in the visible cells `sentiment` is only added to
# `example_twitter_data_6k`, not to `df` -- confirm `df` has this column here.

def _print_avg_sentiment(label, tweets):
    """Print ``label``, the row count of ``tweets`` and its average sentiment.

    The average is ``tweets.sentiment.sum()`` divided by the number of rows;
    an empty subset is reported as NaN instead of dividing by zero.
    """
    n_tweets = tweets.shape[0]
    avg_sentiment = tweets.sentiment.sum() / n_tweets if n_tweets else float('nan')
    print(label, n_tweets, '-', avg_sentiment)

# Subsets by sign of the sentiment score (0 counts as neutral)
noneu_tweets = df[df.sentiment != 0]
neg_tweets = df[df.sentiment < 0]
pos_tweets = df[df.sentiment > 0]

# One uniformly formatted line per subset: label, row count, average
_print_avg_sentiment('all:', df)
_print_avg_sentiment('non-neutral:', noneu_tweets)
_print_avg_sentiment('negative:', neg_tweets)
_print_avg_sentiment('positive:', pos_tweets)

In [None]:
# To be continued