In [1]:
import os
import random
import json
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Create a random held-out testing dataset (15%) of each side to measure performance on unseen twitter users.
# Each directory is listed exactly once, so the sample size, the sampled indices and the
# remaining training indices are all derived from the same file count.
# NOTE(review): the indices are positions into os.listdir(), whose order Python does not
# guarantee; every later cell that maps an index back to a file name must see the same listing order.
random.seed(1)
n_denier = len(os.listdir('data/tweets/contrarian'))
samplesize_denier = int(round(0.15*n_denier, 0))
test_denier = random.sample(range(n_denier), samplesize_denier)  # held-out file indices, in sampled order
held_out_denier = set(test_denier)  # set membership keeps the filter below linear in the file count
train_denier = [i for i in range(n_denier) if i not in held_out_denier]

n_pro = len(os.listdir('data/tweets/convinced'))
samplesize_pro = int(round(0.15*n_pro, 0))
test_pro = random.sample(range(n_pro), samplesize_pro)  # held-out file indices, in sampled order
held_out_pro = set(test_pro)
train_pro = [i for i in range(n_pro) if i not in held_out_pro]

In [3]:
# Create the training and validation data sets
def _load_tweet_columns(directory, indices):
    """Read the selected tweet files of `directory` into five parallel lists.

    Parameters
    ----------
    directory : str
        Folder containing the tweet files (presumably one file per account).
    indices : iterable of int
        Positions into ``os.listdir(directory)`` selecting the files to read.

    Returns
    -------
    tuple of five lists
        ``(tweets, usernames, names, dates, times)`` with one entry per line read,
        taken from the 'tweet', 'username', 'name', 'date' and 'time' keys.

    Raises
    ------
    json.JSONDecodeError
        If a line is not valid JSON.
    KeyError
        If a record lacks one of the five keys.
    """
    filenames = os.listdir(directory)  # listed once instead of once per selected index
    tweets, usernames, names, dates, times = [], [], [], [], []
    for i, file in enumerate([filenames[j] for j in indices]):
        print(i, file)
        # `with` closes the handle after each file; JSON text is decoded as UTF-8
        # explicitly so the result does not depend on the platform's default encoding.
        with open(os.path.join(directory, file), 'r', encoding='utf-8') as handle:
            for line in handle:
                record = json.loads(line)  # parse each line once, not once per field
                tweets.append(record['tweet'])
                usernames.append(record['username'])
                names.append(record['name'])
                dates.append(record['date'])
                times.append(record['time'])
    return tweets, usernames, names, dates, times


print('---Load training/validation data: contrarian tweets---\n')
(tweets_contrarian, username_contrarian, name_contrarian,
 date_contrarian, time_contrarian) = _load_tweet_columns('data/tweets/contrarian', train_denier)
print("\n", len(tweets_contrarian), "contrarian tweets loaded.")

print('\n\n---Load training/validation data: convinced tweets---\n')
(tweets_convinced, username_convinced, name_convinced,
 date_convinced, time_convinced) = _load_tweet_columns('data/tweets/convinced', train_pro)
print("\n", len(tweets_convinced), "convinced tweets loaded.")

# Contrarian rows come first and are labelled 1; convinced rows follow and are labelled 0.
df = pd.DataFrame({
    'text': tweets_contrarian + tweets_convinced,
    'label': [1]*len(tweets_contrarian) + [0]*len(tweets_convinced),
    'username': username_contrarian + username_convinced,
    'name': name_contrarian + name_convinced,
    'date': date_contrarian + date_convinced,
    'time': time_contrarian + time_convinced,
}, columns=['text', 'label', 'username', 'name', 'date', 'time'])

# Split training data into training and validation dataset
# (row-level shuffle: 75% training, 25% validation, fixed seed for repeatability)
train, valid = train_test_split(df, test_size=0.25, random_state=1, shuffle=True)
print("\n\nTraining data set with {} tweets created.".format(len(train)))
print("Validation data set with {} tweets created.".format(len(valid)))

---Load training/validation data: contrarian tweets---

0 PMgeezer.json
1 FreedomWorks.json
2 FoF_Liberty.json
3 JamesDelingpole.json
4 ellymelly.json
5 EcoSenseNow.json
6 ACSHorg.json
7 HeartlandInst.json
8 CFACT.json
9 FoxNews.json
10 ReasonFdn.json
11 EnergyBrief.json
12 RogerPielkeJr.json
13 FriendsOScience.json
14 HudsonInstitute.json
15 velardedaoiz2.json
16 DailySignal.json
17 wattsupwiththat.json
18 WAPolicyCenter.json
19 IERenergy.json
20 HaveWeAllGoneM1.json
21 tan123.json
22 catoletters.json
23 AlexEpstein.json
24 StopTheseThings.json
25 curryja.json
26 ShellenbergerMD.json
27 ClimateAudit.json
28 ronnieressler.json
29 JunkScience.json
30 JaggerMickOZ.json
31 Toimatom.json
32 BreitbartNews.json
33 DavidRoseUK.json
34 BjornLomborg.json
35 rmack2x.json
36 ceidotorg.json
37 PacificResearch.json
38 IvoVegter.json
39 CatoInstitute.json
40 Heritage.json
41 nigella_i5e.json
42 AboutFreedom999.json
43 ManhattanInst.json
44 Tony__Heller.json
45 AEI.json
46 FraserInstitute.json
47 Cli

In [4]:
# Create an independent testing data set
# Only the files whose indices were held out (test_denier / test_pro) are read here.
print('---Load independent testing data: contrarian tweets---\n')
tweets_contrarian = []
username_contrarian = []
name_contrarian = []
date_contrarian = []
time_contrarian = []
contrarian_files = os.listdir('data/tweets/contrarian')  # listed once instead of once per index
for i, file in enumerate([contrarian_files[j] for j in test_denier]):
    print(i, file)
    # `with` closes each file after reading; UTF-8 is requested explicitly so decoding
    # does not depend on the platform's default encoding.
    with open(''.join(['data/tweets/contrarian/', file]), 'r', encoding='utf-8') as handle:
        for line in handle:
            record = json.loads(line)  # parse each line once, not once per field
            tweets_contrarian.append(record['tweet'])
            username_contrarian.append(record['username'])
            name_contrarian.append(record['name'])
            date_contrarian.append(record['date'])
            time_contrarian.append(record['time'])
print("\n", len(tweets_contrarian), "contrarian tweets loaded.")

print('\n\n---Load independent testing data: convinced tweets---\n')
tweets_convinced = []
username_convinced = []
name_convinced = []
date_convinced = []
time_convinced = []
convinced_files = os.listdir('data/tweets/convinced')
for i, file in enumerate([convinced_files[j] for j in test_pro]):
    print(i, file)
    with open(''.join(['data/tweets/convinced/', file]), 'r', encoding='utf-8') as handle:
        for line in handle:
            record = json.loads(line)
            tweets_convinced.append(record['tweet'])
            username_convinced.append(record['username'])
            name_convinced.append(record['name'])
            date_convinced.append(record['date'])
            time_convinced.append(record['time'])
print("\n", len(tweets_convinced), "convinced tweets loaded.")

# Contrarian rows come first and are labelled 1; convinced rows follow and are labelled 0.
test = pd.DataFrame(list(zip(
    tweets_contrarian + tweets_convinced,
    [1]*len(tweets_contrarian) + [0]*len(tweets_convinced),
    username_contrarian + username_convinced,
    name_contrarian + name_convinced,
    date_contrarian + date_convinced,
    time_contrarian + time_convinced)),
                    columns = ['text', 'label', 'username', 'name', 'date', 'time'])

print("\n\nTesting data set with {} tweets created.".format(len(test)))

---Load independent testing data: contrarian tweets---

0 ActivistPost.json
1 RyanMaue.json
2 mattwridley.json
3 ElianaBenador.json
4 SeibtNaomi.json
5 EnergyCitizens.json
6 Haggisman57.json
7 NationalCenter.json
8 ClimateRealists.json

 231942 contrarian tweets loaded.


---Load independent testing data: convinced tweets---

0 IPCC_CH.json
1 GreenpeaceUK.json
2 ClimateCentral.json
3 NASAClimate.json
4 ProjectDrawdown.json
5 ClimatePower.json

 142178 convinced tweets loaded.


Testing data set with 374120 tweets created.


In [5]:
# Persist the three splits as UTF-8 CSV files, leaving out the DataFrame index column.
for frame, csv_path in (
    (train, 'data/train.csv'),
    (valid, 'data/valid.csv'),
    (test, 'data/test.csv'),
):
    frame.to_csv(csv_path, index=False, encoding='utf-8')