In [1]:
import pandas as pd
import os
from tqdm import tqdm

## Load the dataset

In [2]:
path_training_data = "./datasets/full_dataset.csv"
os.path.exists(path_training_data)

True

In [3]:
dataset = pd.read_csv(
    path_training_data, 
    header = None,
    encoding = 'latin-1'
    )

In [4]:
dataset.columns = ["polarity", "id", "date", "query", "user", "text"]

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   polarity  1600000 non-null  int64 
 1   id        1600000 non-null  int64 
 2   date      1600000 non-null  object
 3   query     1600000 non-null  object
 4   user      1600000 non-null  object
 5   text      1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [6]:
dataset["polarity"].unique()

array([4, 0])

Dates are imported as strings, we convert them to date objetcs

In [7]:
import datetime

In [8]:
# converting date format to UTC
dataset["date"]=dataset["date"].apply(lambda st: st.replace("PDT", "UTC-07:00"))

In [9]:
dataset["date"] = pd.to_datetime(dataset["date"])

In [10]:
type(dataset["date"][0])

pandas._libs.tslibs.timestamps.Timestamp

## Group by user and order by date

In [11]:
def reorder_by_date(dataframe):

    dataframe.sort_values(by=["date"], ascending = True, inplace = True) 
    # dataframe = dataframe.drop(["datetime"], axis = 1)
    return dataframe

Constructing the dataset.

After inspection, the three top posters (tweetpet, webwoke, lost_dog), are bots

In [12]:
dataset.groupby("user").size().sort_values()

user
dancelikejordan      1
havin_an_affair      1
havicyeo             1
haveyoumettony       1
havetoexplode        1
                  ... 
VioletsCRUK        279
SallytheShizzle    281
tweetpet           310
webwoke            345
lost_dog           549
Length: 659775, dtype: int64

In [13]:
min_n_tokens = 5
min_n_docs = 5
bot_list = ["tweetpet", "lost_dog", "webwoke"]

In [14]:
final_dataset = []
for user, sub_df in tqdm( dataset.groupby("user")):
    entry = {}
    entry["user"] = user
    # keep only those texts long enough
    tokens_list = list(sub_df["text"].apply(str.split))
    tokens_indexes = [len(tokens) >= min_n_tokens for tokens in tokens_list]
    sub_df = sub_df[tokens_indexes] 
    # keep only those users with several texts
    if len(sub_df) < min_n_docs:
        continue
    if user in bot_list:
        continue
    sub_df = reorder_by_date(sub_df)

    texts = list(sub_df["text"])
    polarities = sub_df["polarity"]
    
    # in the original dataset,
    # 0 means negative, 
    # 4 means positive

    polarities = [
        1 if pol == 4 else 0 
        for pol in sub_df["polarity"]
    ]
    entry["labeled_texts"] = [
        {
        "text":text,
        "polarity":pol
        }
        for text, pol in zip(texts, polarities)
    ]

    final_dataset.append(entry)
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
100%|██████████| 659775/659775 [12:04<00:00, 910.39it/s]


A function to clean the text scrapped from Twitter

In [15]:
from myutils.utils import basic_text_cleaning

In [16]:
len(final_dataset)

59794

In [17]:
for entry in final_dataset:

    for text_entry in entry["labeled_texts"]:
        text_entry["text"] = basic_text_cleaning(
            text_entry["text"]
        )

## Tag the users

since we are trying to classify the users as in risk or not in risk, we need to assign categories to them in order to train a classifier later.
That is, we need to have an annotated user dataset.

To decide if a user is in risk or not, I categorise a user as in risk if at least 2/3rds of their tweets are negative.

In [18]:
proportion_for_negative_label = 2/3

In [19]:
for entry in final_dataset:
    n_docs = len(entry["labeled_texts"])
    n_negative_docs = 0
    for text_entry in entry["labeled_texts"]:
        if text_entry["polarity"] == 0:
            n_negative_docs +=1

    if n_negative_docs >= proportion_for_negative_label * n_docs:
        user_label = 1 # in risk
    else:
        user_label = 0 # not in risk
    entry["label"] = user_label

In [20]:
pd.DataFrame(final_dataset).groupby("label").size()

label
0    39778
1    20016
dtype: int64

In [21]:
final_dataset[30]

{'user': '10isjess',
 'labeled_texts': [{'text': 'totally jealous, i want to be at that party',
   'polarity': 0},
  {'text': 'feel better. i would offer to help but i am working on a presentation for tonight  on depression',
   'polarity': 0},
  {'text': 'went to run a 5k in honor of daniel wultz', 'polarity': 0},
  {'text': "don't want to be at work", 'polarity': 0},
  {'text': 'omg i have to write reports, only 11 more days till summer break',
   'polarity': 1},
  {'text': "can't get any work done  to busy thinking about my boyfriend joey and how  we can suck the nectar together ;)",
   'polarity': 0},
  {'text': 'serious withdrawals  thanks to those who posted pics they have been added to my phone',
   'polarity': 0},
  {'text': "seriously is this day going in slow motion. ugh love pics but can't look anymore need to work",
   'polarity': 0},
  {'text': 'oy my dad and i suffer from those. i feel for him. my dad just had one a couple of weeks ago. i hope he feels better soon',
   'p

In [22]:
import json

In [23]:
with open("./datasets/filtered_dataset.json", "w") as f:
    json.dump(
        final_dataset, f, indent=2
    )

## Train and test splits

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
user_list = [
    entry["user"] for entry in final_dataset
]
label_list = [
    entry["label"] for entry in final_dataset
]

In [26]:
users_train, users_test, labels_train, labels_test = train_test_split(
    user_list, label_list,
    test_size=0.20,
    random_state=42
)

In [27]:
train_dataset = [entry for entry in final_dataset if entry["user"] 
                 in users_train
                 ]

In [28]:
test_dataset = [entry for entry in final_dataset if entry["user"] 
                 in users_test
                 ]

In [29]:
with open("./datasets/train_dataset.json", "w") as f:
    json.dump(
        train_dataset, f, indent=2
    )

In [30]:
with open("./datasets/test_dataset.json", "w") as f:
    json.dump(
        test_dataset, f, indent=2
    )

In [31]:
len(train_dataset)

47835

In [32]:
# downsampling training dataset for experiments

import random
random.seed(42)
downsample_factor = 0.05

downsampled_train_dataset = random.sample(
    train_dataset,
    int( downsample_factor * len(train_dataset) ),
    )

In [33]:
with open("./datasets/downsampled_train_dataset.json", "w") as f:
    json.dump(
        downsampled_train_dataset, f, indent=2
    )