# Analysing IRA tweets

In [144]:
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import re
import matplotlib.pyplot as plt

import findspark
findspark.init()

from pyspark import SparkContext

from pyspark.sql import *
from pyspark.sql.functions import to_timestamp
from pyspark.mllib.stat import Statistics
from pyspark.sql.functions import explode

%matplotlib inline

spark = SparkSession.builder.getOrCreate()

In [145]:
# Set up data directory
DATA_DIR = 'data/'

#### Reading the data

In [146]:
tweets1 = pd.read_csv(DATA_DIR+'IRAhandle_tweets_1.csv', sep=',')

#### First glance at the data

In [147]:
print(tweets1.isnull().any())
tweets1.head()

external_author_id    False
author                False
content               False
region                 True
language              False
publish_date          False
harvested_date        False
following             False
followers             False
updates               False
post_type              True
account_type          False
new_june_2018         False
retweet               False
account_category      False
dtype: bool


Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
0,9.06e+17,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,10/1/2017 19:58,10/1/2017 19:59,1052,9636,253,,Right,0,0,RightTroll
1,9.06e+17,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,10/1/2017 22:43,10/1/2017 22:43,1054,9637,254,,Right,0,0,RightTroll
2,9.06e+17,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,10/1/2017 22:50,10/1/2017 22:51,1054,9637,255,RETWEET,Right,0,1,RightTroll
3,9.06e+17,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,10/1/2017 23:52,10/1/2017 23:52,1062,9642,256,,Right,0,0,RightTroll
4,9.06e+17,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,10/1/2017 2:13,10/1/2017 2:13,1050,9645,246,RETWEET,Right,0,1,RightTroll


#### Typecasting and cleaning of NaN values
We first clean the data a bit. We replace the string representing dates by datetime objects ; we reduce the dimension of the *external_author_id* column. We then fill the NaN values to make them safer and easier to handle.

In [148]:
# Datetime casts
tweets1['publish_date'] = tweets1['publish_date'].astype('datetime64[ns]')
tweets1['harvested_date'] = tweets1['harvested_date'].astype('datetime64[ns]')

# Set external_author_id as a Categorical feature to avoid dealing with those very high numbers
tweets1.external_author_id = pd.Categorical(tweets1.external_author_id).codes

# Filling NaN values for region ("Unknown") and post_type ("Regular", indicating a regular tweet)
tweets1['region']=tweets1['region'].fillna("Unknown")
tweets1['post_type']=tweets1['post_type'].fillna("REGULAR")
print("NaN values still present in the dataframe: " + str(tweets1.isnull().values.any()) + "\n")

print("Data types of the columns: \n" + str(tweets1.dtypes) + "\n")

for column in tweets1:
    print("Column " + str(column) + " is of type " + str(type(column)) + " and value range from " + str(min(tweets1[column])) + " to " + str(max(tweets1[column])))
            
tweets1.head()

NaN values still present in the dataframe: False

Data types of the columns: 
external_author_id             int16
author                        object
content                       object
region                        object
language                      object
publish_date          datetime64[ns]
harvested_date        datetime64[ns]
following                      int64
followers                      int64
updates                        int64
post_type                     object
account_type                  object
new_june_2018                  int64
retweet                        int64
account_category              object
dtype: object

Column external_author_id is of type <class 'str'> and value range from 0 to 338
Column author is of type <class 'str'> and value range from 10_GOP to BLMSOLDIER
Column content is of type <class 'str'> and value range from ! @4mysquad @Solutioneer72 Funny & the #MSM doing their damndest to make cops look good, does anyone believe what #Cops or #MSM s

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
0,336,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,2017-10-01 19:58:00,2017-10-01 19:59:00,1052,9636,253,REGULAR,Right,0,0,RightTroll
1,336,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,2017-10-01 22:43:00,2017-10-01 22:43:00,1054,9637,254,REGULAR,Right,0,0,RightTroll
2,336,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,2017-10-01 22:50:00,2017-10-01 22:51:00,1054,9637,255,RETWEET,Right,0,1,RightTroll
3,336,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,2017-10-01 23:52:00,2017-10-01 23:52:00,1062,9642,256,REGULAR,Right,0,0,RightTroll
4,336,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,2017-10-01 02:13:00,2017-10-01 02:13:00,1050,9645,246,RETWEET,Right,0,1,RightTroll


#### Reducing dataframe by removing redundant columns
We can now check for redundant columns, that is columns that present values that can be obtained from other columns.

In [149]:
# Check if column col2 is redundant with respect to column col1 in dataframe df
def isRedundant(df, col1, col2):
    for val in df[col1].unique():
        if df[df[col1]==val][col2].unique().size>1:
            return False
    return True

In [150]:
# We only look at interesting columns (for example, 'content' cannot really help deduce any other column)
for col1 in ['external_author_id', 'author', 'region', 'language', 'post_type', 'account_type', 'new_june_2018', 'account_category']:
    for col2 in tweets1:
        if col1!=col2 and isRedundant(tweets1, col1, col2):
            print("Redundancy found: " + col2 + " can be deduced from " + col1)

Redundancy found: new_june_2018 can be deduced from external_author_id
Redundancy found: account_type can be deduced from author
Redundancy found: new_june_2018 can be deduced from author
Redundancy found: account_category can be deduced from author
Redundancy found: retweet can be deduced from post_type
Redundancy found: account_category can be deduced from account_type


Redundant columns found:
* $\texttt{new_june_2018}$ can be deduced both from $\texttt{external_author_id}$ and from $\texttt{author}$
* $\texttt{account_category}$ can be deduced both from $\texttt{author}$ and from $\texttt{account_type}$
* $\texttt{account_type}$ can be deduced from $\texttt{author}$
* $\texttt{retweet}$ can be deduced from $\texttt{post_type}$

We can thus put those values in additionnal, much smaller, dataframes and remove the useless columns from our main dataframe.

In [151]:
# When a value can be deduced by two others, we always select the column with the fewest different values
new_june_df = tweets1[['external_author_id', 'new_june_2018']].drop_duplicates()
acc_type_df = tweets1[['author', 'account_type']].drop_duplicates()
acc_cat_df  = tweets1[['account_type', 'account_category']].drop_duplicates()
retweet_df  = tweets1[['post_type', 'retweet']].drop_duplicates()

tweets1.drop('new_june_2018', axis = 1, inplace = True)
tweets1.drop('account_type', axis = 1, inplace = True)
tweets1.drop('account_category', axis = 1, inplace = True)
tweets1.drop('retweet', axis = 1, inplace = True)

tweets1.head()

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type
0,336,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,2017-10-01 19:58:00,2017-10-01 19:59:00,1052,9636,253,REGULAR
1,336,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,2017-10-01 22:43:00,2017-10-01 22:43:00,1054,9637,254,REGULAR
2,336,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,2017-10-01 22:50:00,2017-10-01 22:51:00,1054,9637,255,RETWEET
3,336,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,2017-10-01 23:52:00,2017-10-01 23:52:00,1062,9642,256,REGULAR
4,336,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,2017-10-01 02:13:00,2017-10-01 02:13:00,1050,9645,246,RETWEET


In [152]:
tweets1.describe(include='all')

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type
count,381016.0,381016,381016,381016,381016,381016,381016,381016.0,381016.0,381016.0,381016
unique,,418,355704,19,51,186359,192483,,,,3
top,,AMELIEBALDWIN,В городе Сочи. Олимпиада – праздник или стихий...,United States,English,2017-08-15 17:10:00,2017-08-16 01:32:00,,,,RETWEET
freq,,35371,59,251587,291571,89,90,,,,190015
first,,,,,,2012-10-30 12:27:00,2012-10-30 12:31:00,,,,
last,,,,,,2018-05-27 12:54:00,2018-05-27 12:56:00,,,,
mean,193.814121,,,,,,,2229.714854,2563.028713,6653.524532,
std,113.17924,,,,,,,3573.08502,4945.745248,8489.167614,
min,0.0,,,,,,,0.0,0.0,1.0,
25%,61.0,,,,,,,189.0,147.0,1293.0,


#### First conclusions
From what we have seen so far, we can see that not every column in the dataset is useful, some are implied by others. We can also see that one *external_author_id* can use several Twitter handles (*author*), and vice-versa: one *author* can correspond to several *external_author_id*.

From the summary right above this cell, we can see that not all *content* are unique: it probably comes from several accounts copy-pasting the same text or from retweets. It also appears that around 50 languages are used, and posts come from almost 20 different regions.

Now that we have a better grasp of the content of one of the data files, we should gather them all and treat the whole dataset like we just did for this sample.

### TODO: verify that the data cleaning and the redundancy between columns are consistent for the other datasets