# Analysing IRA tweets

In [122]:
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import re
import matplotlib.pyplot as plt
import glob

import findspark
findspark.init()

from pyspark import SparkContext

from pyspark.sql import *
from pyspark.sql.functions import to_timestamp
from pyspark.mllib.stat import Statistics
from pyspark.sql.functions import explode

%matplotlib inline

spark = SparkSession.builder.getOrCreate()

In [123]:
# Set up data directory
DATA_DIR = 'data/'

## 1. Getting to know our main dataset

#### Reading the data

In [124]:
tweets_df = pd.concat([pd.read_csv(f) for f in glob.glob(DATA_DIR+'*.csv')], ignore_index = True)

#### First glance at the data

In [125]:
print(tweets_df.shape)
tweets_df.head()

(2973371, 15)


Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
0,1674084000.0,GAB1ALDANA,People are too toxic. I think I have people po...,United States,English,7/30/2016 20:15,7/30/2016 20:15,3395,2014,2150,RETWEET,Hashtager,0,1,HashtagGamer
1,1674084000.0,GAB1ALDANA,#NowPlaying Don't Shoot (I'm a Man) by @DEVO -...,United States,English,7/30/2016 20:15,7/30/2016 20:15,3395,2014,2146,RETWEET,Hashtager,0,1,HashtagGamer
2,1674084000.0,GAB1ALDANA,the 'I'm the most boring person in the world' ...,United States,English,7/30/2016 20:16,7/30/2016 20:16,3395,2013,2159,RETWEET,Hashtager,0,1,HashtagGamer
3,1674084000.0,GAB1ALDANA,#MyAchillesHeel slippery floors https://t.co/R...,United States,Norwegian,7/30/2016 20:16,7/30/2016 20:16,3395,2013,2160,RETWEET,Hashtager,0,1,HashtagGamer
4,1674084000.0,GAB1ALDANA,#MyAchillesHeel Boring narcissists.....nothing...,United States,English,7/30/2016 20:16,7/30/2016 20:16,3395,2013,2158,RETWEET,Hashtager,0,1,HashtagGamer


#### Typecasting and cleaning of NaN values
We first clean the data a bit. We replace the string representing dates by datetime objects ; we reduce the dimension of the *external_author_id* column. We then fill the NaN values to make them safer and easier to handle.

In [126]:
# Datetime casts
tweets_df['publish_date'] = pd.to_datetime(tweets_df['publish_date'], format='%m/%d/%Y %H:%M')
tweets_df['harvested_date'] = pd.to_datetime(tweets_df['harvested_date'], format='%m/%d/%Y %H:%M')

In [127]:
# Set external_author_id as a Categorical feature to avoid dealing with those very high numbers
tweets_df.external_author_id = pd.Categorical(tweets_df.external_author_id).codes

In [128]:
# Filling NaN values for region ("Unknown") and post_type ("Regular", indicating a regular tweet)

print(tweets_df.isnull().any())

tweets_df['content']=tweets_df['content'].fillna("EMPTY")
tweets_df['region']=tweets_df['region'].fillna("Unknown")
tweets_df['post_type']=tweets_df['post_type'].fillna("REGULAR")
tweets_df['account_type']=tweets_df['account_type'].fillna("?")

print(tweets_df.isnull().any())
print("NaN values still present in the dataframe: " + str(tweets_df.isnull().values.any()) + "\n")


external_author_id    False
author                False
content                True
region                 True
language              False
publish_date          False
harvested_date        False
following             False
followers             False
updates               False
post_type              True
account_type           True
new_june_2018         False
retweet               False
account_category      False
dtype: bool
external_author_id    False
author                False
content               False
region                False
language              False
publish_date          False
harvested_date        False
following             False
followers             False
updates               False
post_type             False
account_type          False
new_june_2018         False
retweet               False
account_category      False
dtype: bool
NaN values still present in the dataframe: False



In [129]:
print("Data types of the columns: \n" + str(tweets_df.dtypes) + "\n")

for column in tweets_df:
    print("Column " + str(column) + " is of type " + str(type(column)) + " and value range from " + str(min(tweets_df[column])) + " to " + str(max(tweets_df[column])))

Data types of the columns: 
external_author_id             int16
author                        object
content                       object
region                        object
language                      object
publish_date          datetime64[ns]
harvested_date        datetime64[ns]
following                      int64
followers                      int64
updates                        int64
post_type                     object
account_type                  object
new_june_2018                  int64
retweet                        int64
account_category              object
dtype: object

Column external_author_id is of type <class 'str'> and value range from -1 to 2488
Column author is of type <class 'str'> and value range from 10_GOP to _YOUR_LIFESTYLE
Column content is of type <class 'str'> and value range from !   https://t.co/5kbY3yFl2M  @TGpavlova to 🧀🧀🧀🧀 https://t.co/UViqoe2pUY
Column region is of type <class 'str'> and value range from Afghanistan to Unknown
Column language i

#### Reducing dataframe by removing redundant columns
We can now check for redundant columns, that is columns that present values that can be obtained from other columns.

In [130]:
# Check if column col2 is redundant with respect to column col1 in dataframe df
def isRedundant(df, col1, col2):
    for val in df[col1].unique():
        if df[df[col1]==val][col2].unique().size>1:
            return False
    return True

In [131]:
# We only look at interesting columns (for example, 'content' cannot really help deduce any other column)
for col1 in ['author', 'post_type', 'account_type']:
    for col2 in ['new_june_2018', 'account_category', 'account_type', 'retweet']:
        if col1!=col2 and isRedundant(tweets_df, col1, col2):
            print("Redundancy found: " + col2 + " can be deduced from " + col1)

Redundancy found: new_june_2018 can be deduced from author
Redundancy found: account_category can be deduced from author
Redundancy found: account_type can be deduced from author
Redundancy found: retweet can be deduced from post_type
Redundancy found: account_category can be deduced from account_type


Redundant columns found:
* *new_june_2018* can be deduced from *author*
* *account_category* can be deduced both from *author* and from *account_type*
* *account_type* can be deduced from *author*
* *retweet* can be deduced from *post_type*

We can thus put those values in additionnal, much smaller, dataframes and remove the useless columns from our main dataframe.

In [132]:
# When a value can be deduced by two others, we always select the column with the fewest different values
new_june_df = tweets_df[['author', 'new_june_2018']].drop_duplicates()
acc_type_df = tweets_df[['author', 'account_type']].drop_duplicates()
acc_cat_df  = tweets_df[['account_type', 'account_category']].drop_duplicates()
retweet_df  = tweets_df[['post_type', 'retweet']].drop_duplicates()

tweets_df.drop('new_june_2018', axis = 1, inplace = True)
tweets_df.drop('account_type', axis = 1, inplace = True)
tweets_df.drop('account_category', axis = 1, inplace = True)
tweets_df.drop('retweet', axis = 1, inplace = True)

tweets_df.head()

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type
0,432,GAB1ALDANA,People are too toxic. I think I have people po...,United States,English,2016-07-30 20:15:00,2016-07-30 20:15:00,3395,2014,2150,RETWEET
1,432,GAB1ALDANA,#NowPlaying Don't Shoot (I'm a Man) by @DEVO -...,United States,English,2016-07-30 20:15:00,2016-07-30 20:15:00,3395,2014,2146,RETWEET
2,432,GAB1ALDANA,the 'I'm the most boring person in the world' ...,United States,English,2016-07-30 20:16:00,2016-07-30 20:16:00,3395,2013,2159,RETWEET
3,432,GAB1ALDANA,#MyAchillesHeel slippery floors https://t.co/R...,United States,Norwegian,2016-07-30 20:16:00,2016-07-30 20:16:00,3395,2013,2160,RETWEET
4,432,GAB1ALDANA,#MyAchillesHeel Boring narcissists.....nothing...,United States,English,2016-07-30 20:16:00,2016-07-30 20:16:00,3395,2013,2158,RETWEET


In [133]:
tweets_df.describe(include='all')

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type
count,2973371.0,2973371,2973371,2973371,2973371,2973371,2973371,2973371.0,2973371.0,2973371.0,2973371
unique,,2848,2365943,36,56,896684,906316,,,,3
top,,EXQUOTE,В городе Сочи. Олимпиада – праздник или стихий...,United States,English,2017-08-16 01:29:00,2016-03-22 17:35:00,,,,REGULAR
freq,,59652,670,2055882,2128963,202,1333,,,,1662425
first,,,,,,2012-02-02 00:35:00,2012-02-02 00:40:00,,,,
last,,,,,,2018-05-30 21:01:00,2018-06-20 04:03:00,,,,
mean,1487.552,,,,,,,3433.524,7018.913,10497.56,
std,753.3947,,,,,,,5609.881,14584.63,17687.29,
min,-1.0,,,,,,,-1.0,-1.0,-1.0,
25%,711.0,,,,,,,327.0,320.0,1787.0,


#### First conclusions
From what we have seen so far, we can see that not every column in the dataset is useful, some are implied by others. We also better understood the meaning of some columns: *external_author_id* corresponds to the original author of a tweet and is useful in the case of retweets.

From the summary right above this cell, we can see that not all *content* are unique: it probably comes on one hand from the NaN values, and on the other hand from several accounts copy-pasting the same text or retweeting it. It also appears that more than 50 languages are used, and posts come from around 35 different regions.

Now that we have a better grasp of the content of our main data files, we can start digging deeper into it, and looking at other data sets.

## 2. Digging into the contents of the tweets

#### Harvesting the urls present in the tweets

In [134]:
url_tab = []
for cont in tweets_df.content:
    url = re.findall("(?P<url>https?://[^\s]+)", cont)
    if url:
        url_tab.extend(url)

In [135]:
print(len(url_tab))

2860564


In [136]:
for url in url_tab[:2]:
    print(url)

https://t.co/9ilDvPExkB
https://t.co/6gKOVVcUr0
